<a href="https://colab.research.google.com/github/JimKing100/Jestimate/blob/master/Main1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [192]:
!pip install category_encoders==2.0.0
!pip install fiona
!pip install geopy



In [0]:
# Import libraries
import pandas as pd
import numpy as np
import math

from datetime import datetime

import matplotlib.pyplot as plt
import plotly.express as px

from sklearn.cluster import KMeans
from sklearn.preprocessing import OrdinalEncoder
from sklearn.linear_model import LinearRegression
from sklearn.feature_selection import f_regression, SelectKBest
from sklearn.pipeline import make_pipeline
import category_encoders as ce
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

import statsmodels.api as sm

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegressionCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

from geopy.distance import vincenty as get_geodesic_distance

from bokeh.models import ColumnDataSource, TableColumn, DataTable
from bokeh.layouts import column
from bokeh.models.widgets import TextInput
from bokeh.plotting import curdoc

In [194]:
# Load SF real estate data - 10 years (2009-2018) of single family home sales in San Francisco downloaded from the SF MLS
# Longitude and latitude were added to the csv file prior to loading using geocoding.geo.census.gov 
df = pd.read_csv('https://raw.githubusercontent.com/JimKing100/Jestimate/master/data/SF-SFR-Sales-Final2d.csv')

# Rename subdistr_desc to neighborhood
df = df.rename(columns={'subdist_no': 'nid', 'subdist_desc': 'neighborhood'})

# Check the data
print(df.shape)
df.head(5)

(23711, 39)


Unnamed: 0,longitude,latitude,elevation,full_address,city,state,street_no,street_name,street_suffix,zip,area,district_no,district_desc,nid,neighborhood,on_market_date,cdom,orig_list_price,sale_date,sale_price,rooms,baths,beds,sf_source,sf_source_decs,sf,lot_acres,lot_sf,year_built,zoning,lot_desc,drive_side,parking,park_leased,num_parking,shopping,transportation,type,views
0,-122.50965,37.78028,200.83,"2645 El Camino Del Mar, San Francisco, CA 94121",San Francisco,CA,2645,El Camino Del Mar,,94121,1050,1,SF District 1,1050,1 - Outer Richmond,3/14/13,4,1095000,3/22/13,1260000,8,3.5,4,T,Per Tax Records,2691,,0,1969,RH2,"RGLR,FNCD","PVDW,PVSW","ATCH,GARG",0,2,4BLK,1BLK,3STR,"PNRM,OCEN,PARK,GRDN"
1,-122.50929,37.762608,23.21,"1278 La Playa St, San Francisco, CA 94122",San Francisco,CA,1278,La Playa,St,94122,2030,2,SF District 2,2030,2 - Outer Sunset,12/3/15,144,1250000,4/25/16,1075000,9,3.0,4,T,Per Tax Records,2437,0.0689,3000,1947,RM1,RGLR,0,"ATCH,GARG,ATDR,INAC",ONST,1,2BLK,1BLK,0,0
2,-122.50929,37.762608,23.21,"1278 La Playa St, San Francisco, CA 94122",San Francisco,CA,1278,La Playa,St,94122,2030,2,SF District 2,2030,2 - Outer Sunset,5/18/17,36,1395000,8/17/17,1525000,9,5.0,5,D,Per Architect,2597,0.0689,3000,1947,RM1,RGLR,0,"ATCH,GARG,ATDR,INAC",ONST,1,2BLK,1BLK,0,0
3,-122.50924,37.77733,189.11,"590 48th Ave, San Francisco, CA 94121",San Francisco,CA,590,48th,Ave,94121,1050,1,SF District 1,1050,1 - Outer Richmond,6/27/10,42,725000,8/20/10,715000,5,1.0,2,T,Per Tax Records,1312,,0,1939,RH1,RGLR,"PVDW,PVSW",GARG,0,2,3BLK,1BLK,"ATAC,2STR,FIXR","WATR,OCEN,PARK"
4,-122.50919,37.776695,175.89,"618 48th Ave, San Francisco, CA 94121",San Francisco,CA,618,48th,Ave,94121,1050,1,SF District 1,1050,1 - Outer Richmond,6/7/12,24,1595000,7/13/12,1595000,8,4.0,4,T,Per Tax Records,3307,,0,1951,,RGLR,PVDW,"ATCH,GARG,ATDR,INAC",0,3,4BLK,2BLK,3STR,"PNRM,CTYL,OCEN,PARK"


In [195]:
# Baseline of sale_price
df['sale_price'].mean()

1371454.081312471

In [196]:
# Train, test split on date of 01/01/2018
df['sale_date'] = pd.to_datetime(df['sale_date'], infer_datetime_format=True)
df['year_sold'] = df['sale_date'].dt.year
df['month_sold'] = df['sale_date'].dt.month
df['day_sold'] = df['sale_date'].dt.day

df['on_market_date'] = pd.to_datetime(df['on_market_date'], infer_datetime_format=True)
df['year_on_market'] = df['on_market_date'].dt.year
df['month_on_market'] = df['on_market_date'].dt.month
df['day_on_market'] = df['on_market_date'].dt.day

low_cutoff = 2008
high_cutoff = 2018
train = df[(df['year_sold'] >= low_cutoff) & (df['year_sold'] < high_cutoff)]
test  = df[df['year_sold'] >= high_cutoff]
print(train.shape)
print(test.shape)

(21487, 45)
(2224, 45)


In [197]:
# Wrangle the data for train and test
def engineer_features(X):
  
  # Impute mean for null long/lat/elev based on mean of of neighborhood
  def feature_calc(feature, nid, f_dict):
    if math.isnan(feature):
      if (nid) in f_dict:
        new_feature = f_dict[nid]
        return new_feature
    else:
      return feature
    
    return feature
  
  temp = X[~X['longitude'].isna()].groupby(['nid'])['longitude'].mean()
  long_dict = dict(temp)
  X['longitude'] = X.apply(lambda x: feature_calc(x['longitude'], x['nid'], long_dict), axis=1)
  
  temp = X[~X['latitude'].isna()].groupby(['nid'])['latitude'].mean()
  lat_dict = dict(temp)
  X['latitude'] = X.apply(lambda x: feature_calc(x['latitude'], x['nid'], lat_dict), axis=1)
  
  temp = X[~X['elevation'].isna()].groupby(['nid'])['elevation'].mean()
  elev_dict = dict(temp)
  X['elevation'] = X.apply(lambda x: feature_calc(x['elevation'], x['nid'], elev_dict), axis=1)
  
  # Engineer a cluster feature based on long/lat clusters
  kmeans = KMeans(n_clusters=10, n_jobs=-1)
  X['cluster'] = kmeans.fit_predict(X[['longitude', 'latitude']])
  X['cluster'] = kmeans.predict(X[['longitude', 'latitude']])
  
  X['zip'] = X['zip'].astype(int)
  
  # Fill rooms zero values by adding beds and baths
  def room_calc(rooms_val, beds_val, baths_val):
    if rooms_val == 0:
      total = beds_val + baths_val
    else:
      total = rooms_val
      
    return total
  
  X['rooms'] = X.apply(lambda x: room_calc(x['rooms'], x['beds'], x['baths']), axis=1)
  
  # Fill baths zero values by adding beds and baths
  X.loc[(X['baths'] == 0), 'baths'] = 1
  
  # Fill house square foot zero values with the average house square footage by bedroom for all single family homes in SF
  averagesf_data = X.groupby('beds').sf.mean()

  # Use average sf by bedroom for each 0 value in each bedroom group up to 9 bedrooms
  for i in range(0, 9): 
    X.loc[(X['sf'] == 0) & (X['beds'] == i), 'sf'] = averagesf_data.loc[i]

  # Use 10,000sf for anything over 9 bedrooms
  X.loc[X['sf'] == 0, 'sf'] = 10000
  X = X.astype({'sf': int})
  
  # Fill lot_sf zero values by using lot_acres to calc
  def lotsf_calc(lotsf_val, lotacres_val):
    if lotsf_val == 0:
      total = lotacres_val * 43560
    else:
      total = lotsf_val
      
    return total
  
  X['lot_sf'] = X.apply(lambda x: lotsf_calc(x['lot_sf'], x['lot_acres']), axis=1)
  
  # Fill lot_acres zero values by using lot_sf to calc
  def lotacres_calc(lotacres_val, lotsf_val):
    if lotacres_val == 0:
      total = lotsf_val / 43560
    else:
      total = lotacres_val
      
    return total
  
  X['lot_sf'] = X.apply(lambda x: lotsf_calc(x['lot_sf'], x['lot_acres']), axis=1)
  X['lot_acres'] = X.apply(lambda x: lotacres_calc(x['lot_acres'], x['lot_sf']), axis=1)
  
  # Features with nan's that should be zero's
  cols_with_nans = ['lot_sf', 'lot_acres', 'year_built', 'zoning']
  for col in cols_with_nans:
    X[col] = X[col].replace(np.nan, 0)
    
  # Features with numeric zero's that should be nan's
  #cols_with_czeros = ['lot_desc', 'drive_side', 'parking', 'park_leased',
  #                    'shopping', 'transportation', 'type', 'views']
  #for col in cols_with_czeros:
  #  X[col] = X[col].replace('0', np.nan)
    
  # Engineer a price_sf column
  X['price_sf'] = X['sale_price'] / X['sf']
  
  # Drop unneeded columns
  unneeded_columns = ['sale_date', 'on_market_date', 'city', 'state', 'street_no', 'street_name']
  X = X.drop(columns=unneeded_columns)
  
  return X

train = engineer_features(train)
test = engineer_features(test)

train.head()



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy



A 

Unnamed: 0,longitude,latitude,elevation,full_address,street_suffix,zip,area,district_no,district_desc,nid,neighborhood,cdom,orig_list_price,sale_price,rooms,baths,beds,sf_source,sf_source_decs,sf,lot_acres,lot_sf,year_built,zoning,lot_desc,drive_side,parking,park_leased,num_parking,shopping,transportation,type,views,year_sold,month_sold,day_sold,year_on_market,month_on_market,day_on_market,cluster,price_sf
0,-122.50965,37.78028,200.83,"2645 El Camino Del Mar, San Francisco, CA 94121",,94121,1050,1,SF District 1,1050,1 - Outer Richmond,4,1095000,1260000,8.0,3.5,4,T,Per Tax Records,2691,0.0,0.0,1969,RH2,"RGLR,FNCD","PVDW,PVSW","ATCH,GARG",0,2,4BLK,1BLK,3STR,"PNRM,OCEN,PARK,GRDN",2013,3,22,2013,3,14,7,468.227425
1,-122.50929,37.762608,23.21,"1278 La Playa St, San Francisco, CA 94122",St,94122,2030,2,SF District 2,2030,2 - Outer Sunset,144,1250000,1075000,9.0,3.0,4,T,Per Tax Records,2437,0.0689,3000.0,1947,RM1,RGLR,0,"ATCH,GARG,ATDR,INAC",ONST,1,2BLK,1BLK,0,0,2016,4,25,2015,12,3,0,441.116126
2,-122.50929,37.762608,23.21,"1278 La Playa St, San Francisco, CA 94122",St,94122,2030,2,SF District 2,2030,2 - Outer Sunset,36,1395000,1525000,9.0,5.0,5,D,Per Architect,2597,0.0689,3000.0,1947,RM1,RGLR,0,"ATCH,GARG,ATDR,INAC",ONST,1,2BLK,1BLK,0,0,2017,8,17,2017,5,18,0,587.216018
3,-122.50924,37.77733,189.11,"590 48th Ave, San Francisco, CA 94121",Ave,94121,1050,1,SF District 1,1050,1 - Outer Richmond,42,725000,715000,5.0,1.0,2,T,Per Tax Records,1312,0.0,0.0,1939,RH1,RGLR,"PVDW,PVSW",GARG,0,2,3BLK,1BLK,"ATAC,2STR,FIXR","WATR,OCEN,PARK",2010,8,20,2010,6,27,7,544.969512
4,-122.50919,37.776695,175.89,"618 48th Ave, San Francisco, CA 94121",Ave,94121,1050,1,SF District 1,1050,1 - Outer Richmond,24,1595000,1595000,8.0,4.0,4,T,Per Tax Records,3307,0.0,0.0,1951,0,RGLR,PVDW,"ATCH,GARG,ATDR,INAC",0,3,4BLK,2BLK,3STR,"PNRM,CTYL,OCEN,PARK",2012,7,13,2012,6,7,7,482.310251


In [198]:
# Train
cutoff = 2017
temp=train.copy()
train = temp[temp['year_sold'] < 2017]
val  = temp[temp['year_sold'] >= 2017]
print(train.shape, val.shape, test.shape)

(19226, 41) (2261, 41) (2224, 41)


In [199]:
# Encode and fit a linear regression model

target = 'sale_price'

features = train.columns.drop(target)
X_train = train[features]
y_train = train[target]
X_val = val[features]
y_val = val[target]
X_test = test[features]
y_test = test[target]

pipeline = make_pipeline(
      ce.OrdinalEncoder(),
      LinearRegression()
  )

pipeline.fit(X_train, y_train)
y_pred = pipeline.predict(X_val)

# Print regression metrics
val_mse = mean_squared_error(y_val, y_pred)
val_rmse = np.sqrt(val_mse)
val_mae = mean_absolute_error(y_val, y_pred)
val_r2 = r2_score(y_val, y_pred)
print('Validation Mean Absolute Error:', val_mae)
print('Validation R^2:', val_r2)
print ('Validation Accuracy', pipeline.score(X_val, y_val))
print('\n')

ty_pred = pipeline.predict(X_test)

# Print regression metrics
test_mse = mean_squared_error(y_test, ty_pred)
test_rmse = np.sqrt(test_mse)
test_mae = mean_absolute_error(y_test, ty_pred)
test_r2 = r2_score(y_test, ty_pred)
print('Test Mean Absolute Error:', test_mae)
print('Test R^2:', test_r2)
print ('Test Accuracy', pipeline.score(X_test, y_test))
print('\n')


X_train.head()

Validation Mean Absolute Error: 217918.0797167992
Validation R^2: 0.8976492447759028
Validation Accuracy 0.8976492447759029


Test Mean Absolute Error: 262372.7532357404
Test R^2: 0.8097878211504056
Test Accuracy 0.8097878211504056




Unnamed: 0,longitude,latitude,elevation,full_address,street_suffix,zip,area,district_no,district_desc,nid,neighborhood,cdom,orig_list_price,rooms,baths,beds,sf_source,sf_source_decs,sf,lot_acres,lot_sf,year_built,zoning,lot_desc,drive_side,parking,park_leased,num_parking,shopping,transportation,type,views,year_sold,month_sold,day_sold,year_on_market,month_on_market,day_on_market,cluster,price_sf
0,-122.50965,37.78028,200.83,"2645 El Camino Del Mar, San Francisco, CA 94121",,94121,1050,1,SF District 1,1050,1 - Outer Richmond,4,1095000,8.0,3.5,4,T,Per Tax Records,2691,0.0,0.0,1969,RH2,"RGLR,FNCD","PVDW,PVSW","ATCH,GARG",0,2,4BLK,1BLK,3STR,"PNRM,OCEN,PARK,GRDN",2013,3,22,2013,3,14,7,468.227425
1,-122.50929,37.762608,23.21,"1278 La Playa St, San Francisco, CA 94122",St,94122,2030,2,SF District 2,2030,2 - Outer Sunset,144,1250000,9.0,3.0,4,T,Per Tax Records,2437,0.0689,3000.0,1947,RM1,RGLR,0,"ATCH,GARG,ATDR,INAC",ONST,1,2BLK,1BLK,0,0,2016,4,25,2015,12,3,0,441.116126
3,-122.50924,37.77733,189.11,"590 48th Ave, San Francisco, CA 94121",Ave,94121,1050,1,SF District 1,1050,1 - Outer Richmond,42,725000,5.0,1.0,2,T,Per Tax Records,1312,0.0,0.0,1939,RH1,RGLR,"PVDW,PVSW",GARG,0,2,3BLK,1BLK,"ATAC,2STR,FIXR","WATR,OCEN,PARK",2010,8,20,2010,6,27,7,544.969512
4,-122.50919,37.776695,175.89,"618 48th Ave, San Francisco, CA 94121",Ave,94121,1050,1,SF District 1,1050,1 - Outer Richmond,24,1595000,8.0,4.0,4,T,Per Tax Records,3307,0.0,0.0,1951,0,RGLR,PVDW,"ATCH,GARG,ATDR,INAC",0,3,4BLK,2BLK,3STR,"PNRM,CTYL,OCEN,PARK",2012,7,13,2012,6,7,7,482.310251
6,-122.509186,37.761005,23.69,"1362 La Playa St, San Francisco, CA 94122-1019",St,94122,2030,2,SF District 2,2030,2 - Outer Sunset,76,600000,4.0,1.0,3,T,Per Tax Records,1168,0.0,0.0,1947,0,"RGLR,LEVL",PVDW,"ATCH,GARG,ATDR",0,2,0,1BLK,ATAC,0,2009,5,19,2009,2,18,0,453.767123


In [0]:
#pipeline.named_steps

In [0]:
# pd.set_option('display.max_rows', 200)
# model = pipeline.named_steps['linearregression']
# encoder = pipeline.named_steps['ordinalencoder']
# encoded_columns = encoder.transform(X_train).columns 
# coef = pd.Series(model.coef_, encoded_columns)
# coef.sort_values(ascending=False)

# functional_coefficents = pd.Series(
#     coef,
#     encoded_columns
# )

# plt.figure(figsize=(10,10))
# functional_coefficents.sort_values().plot.barh();

In [0]:
pipeline.fit(X_test, y_test)
y_pred = pipeline.predict(X_test)

In [203]:
final = test.copy()
final['prediction'] = y_pred
final['difference'] = final['sale_price'] - final['prediction']
final['prediction'] = final['prediction'].astype(int)
final['difference'] = final['difference'].astype(int)
final.head()

Unnamed: 0,longitude,latitude,elevation,full_address,street_suffix,zip,area,district_no,district_desc,nid,neighborhood,cdom,orig_list_price,sale_price,rooms,baths,beds,sf_source,sf_source_decs,sf,lot_acres,lot_sf,year_built,zoning,lot_desc,drive_side,parking,park_leased,num_parking,shopping,transportation,type,views,year_sold,month_sold,day_sold,year_on_market,month_on_market,day_on_market,cluster,price_sf,prediction,difference
5,-122.50919,37.776695,175.89,"618 48th Ave, San Francisco, CA 94121-2410",Ave,94121,1050,1,SF District 1,1050,1 - Outer Richmond,13,2749000,3310000,8.0,4.0,4,S,Per Graphic Artist,3735,0.0909,3959.0,1951,0,0,0,"ATCH,GARG,ATDR,INAC",ONST,3,3BLK,1BLK,2STR,"PNRM,CTYL,WATR,SFRN,OCEN,PARK,GRDN,TWNP",2018,6,21,2018,5,17,1,886.211513,3741981,-431981
7,-122.50909,37.75958,23.45,"1434 La Playa St, San Francisco, CA 94122",St,94122,2030,2,SF District 2,2030,2 - Outer Sunset,131,1595000,1304000,5.0,2.0,3,U,Not Available,1520,0.0606,2639.0,1900,RM1,"RGLR,LEVL",PVDW,UNCV,ONST,1,1BLK,1BLK,"SMAT,3STR","PNRM,PRTL,PARK,GRDN,HILL",2018,11,28,2018,7,6,9,857.894737,1245529,58470
9,-122.509056,37.75801,23.01,"1516 Great Highway, San Francisco, CA 94122",,94122,2030,2,SF District 2,2030,2 - Outer Sunset,15,1298000,1830000,7.0,3.0,4,T,Per Tax Records,2120,0.0705,3072.0,1948,RM1,"RGLR,OCNF",0,GARG,ONST,1,2BLK,2BLK,FULL,GRDN,2018,5,23,2018,4,30,9,863.207547,1773181,56818
15,-122.50904,37.774418,47.1,"739 48th Ave, San Francisco, CA 94121-3209",Ave,94121,1050,1,SF District 1,1050,1 - Outer Richmond,20,1295000,1650000,6.0,2.0,3,U,Not Available,1520,0.0689,3000.0,1957,RH2,0,0,"ATCH,ATDR,INAC",ONST,1,0,0,0,0,2018,3,23,2018,2,15,1,1085.526316,1535619,114380
17,-122.50902,37.758636,22.7,"1492 La Playa St, San Francisco, CA 94122",St,94122,2030,2,SF District 2,2030,2 - Outer Sunset,19,895000,1300000,3.0,1.0,2,A,Per Appraiser,1087,0.0286,1245.0,1944,RM1,0,"PVDW,PVSW",ATCH,ONST,1,1BLK,1BLK,0,0,2018,7,18,2018,6,14,9,1195.952162,1471109,-171109


In [204]:
final.shape

(2224, 43)

In [205]:
data = final[['full_address', 'neighborhood', 'nid', 'sale_price', 'prediction', 'difference']]
data = data.drop_duplicates()

data.head()

Unnamed: 0,full_address,neighborhood,nid,sale_price,prediction,difference
5,"618 48th Ave, San Francisco, CA 94121-2410",1 - Outer Richmond,1050,3310000,3741981,-431981
7,"1434 La Playa St, San Francisco, CA 94122",2 - Outer Sunset,2030,1304000,1245529,58470
9,"1516 Great Highway, San Francisco, CA 94122",2 - Outer Sunset,2030,1830000,1773181,56818
15,"739 48th Ave, San Francisco, CA 94121-3209",1 - Outer Richmond,1050,1650000,1535619,114380
17,"1492 La Playa St, San Francisco, CA 94122",2 - Outer Sunset,2030,1300000,1471109,-171109


In [206]:
data.shape

(2224, 6)

In [0]:
data.to_csv('/content/display_data.csv')