### After several days of getting my scraper to a reasonable level, now it's time to build a simple model that describes the data 

### Current
Since I lack time-series data for each location, this will simply use known information, and possibly some exogenous information, to predict prices.

- determine what predictors are relevant on a national level to estimate prices
- determine what predictors are relevant on a local level to estimate prices

### FUTURE
forecast prices

- locate additional sources of information to be used for backtests
- seasonality model?



In [34]:
### import the relevant libraries ###
import pandas as pd
import numpy as np
from datetime import datetime as dt
pd.options.display.float_format = "{:.0f}".format

import re

#### sklearn ###
from sklearn.preprocessing import StandardScaler
retail_scaler = StandardScaler()

# for making a Custom Scaler
from sklearn.base import BaseEstimator, TransformerMixin

# import the LogReg model from sklearn
from sklearn.linear_model import LogisticRegression

# import the 'metrics' module, which includes important metrics we may want to use
from sklearn import metrics

In [None]:
# create the Custom Scaler class

class CustomScaler(BaseEstimator, TransformerMixin):
    """Standardize only a chosen subset of a DataFrame's columns.

    Wraps a ``StandardScaler`` that is fitted on ``columns`` only. On
    ``transform`` the selected columns are replaced by their scaled values,
    every other column is passed through untouched, and the original column
    order is restored.

    Parameters
    ----------
    columns : list
        Labels of the columns to scale.
    copy, with_mean, with_std : bool, default True
        Forwarded unchanged to the wrapped ``StandardScaler``.

    Attributes
    ----------
    scaler : StandardScaler
        The wrapped scaler.
    mean_ : pandas.Series or None
        Per-column mean of the selected columns, set by ``fit``.
    var_ : pandas.Series or None
        Per-column population variance (``ddof=0``) of the selected columns,
        set by ``fit``.
    """

    def __init__(self, columns, copy=True, with_mean=True, with_std=True):
        # Keep every constructor argument under its own name: BaseEstimator's
        # get_params() (and therefore clone() and the repr) reads them back
        # from the instance by parameter name.
        self.columns = columns
        self.copy = copy
        self.with_mean = with_mean
        self.with_std = with_std

        # StandardScaler's options must be passed by keyword.
        self.scaler = StandardScaler(copy=copy, with_mean=with_mean, with_std=with_std)
        self.mean_ = None
        self.var_ = None

    def fit(self, X, y=None):
        """Fit the wrapped scaler on ``X[self.columns]`` and return ``self``."""
        selected = X[self.columns]
        self.scaler.fit(selected, y)
        # Explicit axis/ddof so these stay per-column statistics regardless of
        # how the installed pandas treats axis=None; ddof=0 matches np.var.
        self.mean_ = selected.mean(axis=0)
        self.var_ = selected.var(axis=0, ddof=0)
        return self

    def transform(self, X, y=None, copy=None):
        """Return ``X`` with the selected columns scaled, in the original column order.

        ``y`` and ``copy`` are accepted for signature compatibility and are not used.
        """
        # record the initial order of the columns
        init_col_order = X.columns

        # Reuse X's index for the scaled frame; with a fresh RangeIndex the
        # concat below would align on labels and misplace rows (or introduce
        # NaN) whenever X has been filtered or re-indexed.
        X_scaled = pd.DataFrame(
            self.scaler.transform(X[self.columns]),
            columns=self.columns,
            index=X.index,
        )

        # everything that was not selected for scaling
        X_not_scaled = X.loc[:, ~X.columns.isin(self.columns)]

        # stitch both parts together and restore the original column order
        return pd.concat([X_not_scaled, X_scaled], axis=1)[init_col_order]

In [23]:
# Read the combined scrape output into memory and show its (rows, columns) shape.
MASTER_CSV_PATH = "DATA/scrape_files/Master.csv"
unfiltered_data = pd.read_csv(MASTER_CSV_PATH)
unfiltered_data.shape


(34959, 15)

In [25]:
# Count, per column, the rows whose sqft text contains "-" or the literal text "NaN".
# NOTE(review): the result (2147) equals 34959 - 32812, i.e. the "-" rows alone, so the
# "NaN" text match appears to add nothing -- presumably missing sqft values are real NaN
# after read_csv rather than the string "NaN"; confirm.
sqft_has_dash = unfiltered_data.sqft.str.contains("-")
sqft_has_nan_text = unfiltered_data.sqft.str.contains("NaN")
unfiltered_data[sqft_has_dash | sqft_has_nan_text].count()


Unnamed: 0     2147
name           2147
address        2147
unit           2147
sqft           2147
bed            2147
bath           2147
price          2147
city           2147
state          2147
zipcode        2147
description    2147
details        2147
url            2147
date           2147
dtype: int64

In [27]:
# Count, per column, the rows that remain once every sqft value containing "-" is excluded.
# na=False makes a missing sqft count as "no dash", so those rows are kept.
sqft_contains_dash = unfiltered_data.sqft.str.contains("-", na=False)
unfiltered_data[~sqft_contains_dash].count()

Unnamed: 0     32812
name           32812
address        32812
unit           32070
sqft           32066
bed            32812
bath           32812
price          32812
city           32812
state          32812
zipcode        32812
description    32812
details        32812
url            32812
date           32812
dtype: int64

In [76]:
# Remove the 2k+ apartment summary listings, i.e. rows whose sqft contains "-"
# (I believe these are rows where the scraped columns were shifted).
uf_data = unfiltered_data[~unfiltered_data.sqft.str.contains("-", na=False)]
# Drop rows whose unit field contains "Bed"; na=False keeps rows with a missing unit.
uf_data = uf_data[~uf_data.unit.str.contains("Bed", na=False)]
# Get rid of homes/condos for rent (for now). The explicit .copy() makes uf_data an
# independent frame, so the cleaning cells that follow can modify it without pandas
# treating it as a view of unfiltered_data (SettingWithCopyWarning).
uf_data = uf_data[uf_data.unit != 'home'].copy()

In [88]:
# Strip every non-digit character from the numeric-looking text columns.
# The results are assigned back to uf_data instead of calling
# `uf_data.<col>.replace(..., inplace=True)`: an in-place method on a column pulled
# off the frame is chained assignment, which pandas warns about and which does not
# update the parent frame when Copy-on-Write is active.
digit_text_columns = ["sqft", "unit", "bed", "bath", "price"]
digits_only = uf_data[digit_text_columns].replace(regex=True, to_replace=r'\D', value='')

# Listings with no sqft at all get 0, as before.
digits_only["sqft"] = digits_only["sqft"].fillna(0)

# .assign returns a new frame with the same column order, so re-running this cell is safe.
uf_data = uf_data.assign(**{col: digits_only[col] for col in digit_text_columns})

In [86]:
# Strip non-digit characters from price (this repeats the price step of the cleaning
# cell above; it is harmless to run again). The result is assigned back rather than
# using inplace=True on the `uf_data.price` accessor, which is chained assignment and
# does not update uf_data when pandas Copy-on-Write is active.
uf_data = uf_data.assign(
    price=uf_data["price"].replace(regex=True, to_replace=r'\D', value='')
)

In [241]:
def detail_string_to_dict(detail):
    """Parse a comma-separated details string into a dict.

    Each comma-separated segment is either ``"key: value"`` or a bare flag:

    * ``"key: value"`` becomes ``{"key": "value"}``. Only the first colon
      separates key from value, so values that themselves contain a colon
      (e.g. ``"Hours: 9:00-5:00"``) are kept whole.
    * a bare segment such as ``"Pool"`` becomes ``{"Pool": "True"}``
      (the string ``"True"``, not the boolean).

    Keys and values are stripped of surrounding whitespace; blank segments
    are ignored. If a key occurs more than once, the last occurrence wins.

    Parameters
    ----------
    detail : str, dict, or missing value
        The raw details text. A dict (already-parsed input, e.g. when the
        cell is re-run) is returned as a shallow copy. Any other non-string
        value (``None``, ``NaN``) yields an empty dict.

    Returns
    -------
    dict
        Mapping of detail name to its string value.
    """
    if isinstance(detail, dict):
        return dict(detail)
    if not isinstance(detail, str):
        return {}

    parsed = {}
    for segment in detail.split(','):
        if not segment.strip():
            # skip empty pieces produced by "a,,b" or a trailing comma
            continue
        key, colon, value = segment.partition(':')
        parsed[key.strip()] = value.strip() if colon else "True"
    return parsed

In [247]:
# Parse each listing's details string into a dict. Built with .assign so a new frame is
# bound to uf_data instead of writing a column into the filtered frame, which avoids
# pandas' SettingWithCopyWarning.
uf_data = uf_data.assign(details=uf_data["details"].apply(detail_string_to_dict))

In [249]:
# Preview the first rows of the filtered and cleaned listings.
uf_data.head()

Unnamed: 0.1,Unnamed: 0,name,address,unit,sqft,bed,bath,price,city,state,zipcode,description,details,url,date
1,1,Avanti Apartments,10697 W Centennial Pkwy,2140,753,1,1,1127,Las Vegas,NV,89166,This apartment is located at 10697 W Centennia...,"{'Heating': 'Other', 'Days on Market': '41 Day...",https://www.trulia.com/c/nv/las-vegas/avanti-a...,11/17/2020
2,2,Avanti Apartments,10697 W Centennial Pkwy,1119,771,1,1,1124,Las Vegas,NV,89166,This apartment is located at 10697 W Centennia...,"{'Heating': 'Other', 'Days on Market': '41 Day...",https://www.trulia.com/c/nv/las-vegas/avanti-a...,11/17/2020
3,3,Avanti Apartments,10697 W Centennial Pkwy,1078,849,1,1,1335,Las Vegas,NV,89166,This apartment is located at 10697 W Centennia...,"{'Heating': 'Other', 'Days on Market': '41 Day...",https://www.trulia.com/c/nv/las-vegas/avanti-a...,11/17/2020
4,4,Avanti Apartments,10697 W Centennial Pkwy,1092,820,1,1,1347,Las Vegas,NV,89166,This apartment is located at 10697 W Centennia...,"{'Heating': 'Other', 'Days on Market': '41 Day...",https://www.trulia.com/c/nv/las-vegas/avanti-a...,11/17/2020
5,5,Avanti Apartments,10697 W Centennial Pkwy,3030,801,1,1,1123,Las Vegas,NV,89166,This apartment is located at 10697 W Centennia...,"{'Heating': 'Other', 'Days on Market': '41 Day...",https://www.trulia.com/c/nv/las-vegas/avanti-a...,11/17/2020
