<a href="https://colab.research.google.com/github/JimKing100/Jestimate/blob/master/linear_regression_comps/LR_Comps4.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [713]:
# Install third-party packages for this notebook (category_encoders and geopy are imported below).
# NOTE(review): fiona and geopy are not version-pinned, so a fresh run may install newer releases
# than the ones that produced the saved outputs; fiona is not imported in the visible cells.
!pip install category_encoders==2.0.0
!pip install fiona
!pip install geopy



In [0]:
# Import libraries
import pandas as pd
import numpy as np
import math

import pandas_profiling

from datetime import datetime

import matplotlib.pyplot as plt
import plotly.express as px

from sklearn.cluster import KMeans
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder
from sklearn.linear_model import LinearRegression
from sklearn.feature_selection import f_regression, SelectKBest
from sklearn.pipeline import make_pipeline
import category_encoders as ce
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score, classification_report

import statsmodels.api as sm

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegressionCV
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split, RandomizedSearchCV

# geopy 2.0 removed `vincenty`; prefer `geodesic` and fall back to `vincenty`
# only on old installs that do not provide it. Either way the name used by the
# rest of the notebook stays `get_geodesic_distance`.
try:
  from geopy.distance import geodesic as get_geodesic_distance
except ImportError:
  from geopy.distance import vincenty as get_geodesic_distance

from bokeh.models import ColumnDataSource, TableColumn, DataTable
from bokeh.layouts import column
from bokeh.models.widgets import TextInput
from bokeh.plotting import curdoc

In [715]:
# Load SF real estate data - 10 years (2009-2018) of single family home sales in San Francisco downloaded from the SF MLS
# Longitude and latitude were added to the csv file prior to loading using geocoding.geo.census.gov
df = pd.read_csv('https://raw.githubusercontent.com/JimKing100/Jestimate/master/data/SF-SFR-Sales-Final2d.csv')

# Rename subdist_no to nid and subdist_desc to neighborhood
df = df.rename(columns={'subdist_no': 'nid', 'subdist_desc': 'neighborhood'})

# Create subset with outliers removed - 1.6% of the data
mask = (
  (df['baths'] < 6) &
  (df['beds'] < 7) &
  (df['beds'] > 0) &
  (df['lot_sf'] < 10000) &
  (df['rooms'] < 13) &
  (df['sale_price'] < 10000000) &
  (df['sf'] < 10000) &
  (df['sf'] > 100)
)
# Take an explicit copy: later cells add columns to df, and assigning into a
# boolean-filtered frame is what triggers pandas' SettingWithCopyWarning.
df = df[mask].copy()

# Check the data
print(df.shape)
df.head(5)

(19497, 39)


Unnamed: 0,longitude,latitude,elevation,full_address,city,state,street_no,street_name,street_suffix,zip,area,district_no,district_desc,nid,neighborhood,on_market_date,cdom,orig_list_price,sale_date,sale_price,rooms,baths,beds,sf_source,sf_source_decs,sf,lot_acres,lot_sf,year_built,zoning,lot_desc,drive_side,parking,park_leased,num_parking,shopping,transportation,type,views
0,-122.50965,37.78028,200.83,"2645 El Camino Del Mar, San Francisco, CA 94121",San Francisco,CA,2645,El Camino Del Mar,,94121,1050,1,SF District 1,1050,1 - Outer Richmond,3/14/13,4,1095000,3/22/13,1260000,8,3.5,4,T,Per Tax Records,2691,,0,1969,RH2,"RGLR,FNCD","PVDW,PVSW","ATCH,GARG",0,2,4BLK,1BLK,3STR,"PNRM,OCEN,PARK,GRDN"
1,-122.50929,37.762608,23.21,"1278 La Playa St, San Francisco, CA 94122",San Francisco,CA,1278,La Playa,St,94122,2030,2,SF District 2,2030,2 - Outer Sunset,12/3/15,144,1250000,4/25/16,1075000,9,3.0,4,T,Per Tax Records,2437,0.0689,3000,1947,RM1,RGLR,0,"ATCH,GARG,ATDR,INAC",ONST,1,2BLK,1BLK,0,0
2,-122.50929,37.762608,23.21,"1278 La Playa St, San Francisco, CA 94122",San Francisco,CA,1278,La Playa,St,94122,2030,2,SF District 2,2030,2 - Outer Sunset,5/18/17,36,1395000,8/17/17,1525000,9,5.0,5,D,Per Architect,2597,0.0689,3000,1947,RM1,RGLR,0,"ATCH,GARG,ATDR,INAC",ONST,1,2BLK,1BLK,0,0
3,-122.50924,37.77733,189.11,"590 48th Ave, San Francisco, CA 94121",San Francisco,CA,590,48th,Ave,94121,1050,1,SF District 1,1050,1 - Outer Richmond,6/27/10,42,725000,8/20/10,715000,5,1.0,2,T,Per Tax Records,1312,,0,1939,RH1,RGLR,"PVDW,PVSW",GARG,0,2,3BLK,1BLK,"ATAC,2STR,FIXR","WATR,OCEN,PARK"
4,-122.50919,37.776695,175.89,"618 48th Ave, San Francisco, CA 94121",San Francisco,CA,618,48th,Ave,94121,1050,1,SF District 1,1050,1 - Outer Richmond,6/7/12,24,1595000,7/13/12,1595000,8,4.0,4,T,Per Tax Records,3307,,0,1951,,RGLR,PVDW,"ATCH,GARG,ATDR,INAC",0,3,4BLK,2BLK,3STR,"PNRM,CTYL,OCEN,PARK"


In [0]:
#pandas_profiling.ProfileReport(df)

In [717]:
# Train, test split on date of 01/01/2018
# Parse both date columns and expand each one into <part>_<suffix> columns
# (year_sold, month_sold, day_sold, year_on_market, month_on_market, day_on_market).
for date_col, suffix in (('sale_date', 'sold'), ('on_market_date', 'on_market')):
  df[date_col] = pd.to_datetime(df[date_col], infer_datetime_format=True)
  for part in ('year', 'month', 'day'):
    df[part + '_' + suffix] = getattr(df[date_col].dt, part)

# Sales with low_cutoff <= year_sold < high_cutoff go to train; year_sold >= high_cutoff go to test
low_cutoff = 2008
high_cutoff = 2018
sold_year = df['year_sold']
# .copy() makes train and test independent frames, so the feature engineering
# applied to them later does not write into slices of df.
train = df[(sold_year >= low_cutoff) & (sold_year < high_cutoff)].copy()
test = df[sold_year >= high_cutoff].copy()
print(train.shape)
print(test.shape)

(17618, 45)
(1879, 45)


In [718]:
# Wrangle the data for train and test
def engineer_features(X):
  """Clean and enrich a frame of home sales and return the result as a new DataFrame.

  The caller's frame is never modified; all work happens on a copy.

  Steps, in order:
    1. sf: a value of 0 is replaced by the mean sf of rows with the same
       number of beds (the mean is taken over the non-zero rows only).
    2. longitude / latitude / elevation: nulls are replaced by the mean of the
       same column over rows sharing the row's nid (left null if none exists).
    3. zip is cast to int.
    4. rooms: a value of 0 is replaced by beds + baths (using baths before step 5).
    5. baths: a value of 0 is replaced by 1.
    6. lot_sf, then lot_acres: a value of 0 in one is derived from the other
       using 43560 sq ft per acre.
    7. ds_count, parking_count, view_count: number of comma-separated entries
       in drive_side, parking and views.
    8. The columns listed in unneeded_columns are dropped.

  Args:
    X: DataFrame containing the columns referenced above and every column
      named in unneeded_columns.

  Returns:
    A new DataFrame with the engineered columns added and the unneeded
    columns removed.

  Raises:
    KeyError: if a referenced column is missing from X.
    ValueError: if zip holds a value that cannot be cast to int (e.g. NaN).
  """
  # Work on a private copy so the caller's frame (often a slice of a larger
  # frame) is left untouched and no SettingWithCopyWarning is raised.
  X = X.copy()

  # Fill house square foot zero values with the average house square footage
  # of homes with the same bedroom count. Zeros are excluded from the average
  # so the placeholder values do not drag it down, and bedroom counts that are
  # absent from X are simply skipped instead of raising KeyError.
  sf = X['sf'].astype(float)
  sf_is_zero = sf == 0
  mean_sf_by_beds = X.loc[~sf_is_zero].groupby('beds')['sf'].mean()
  sf_fill = X['beds'].map(mean_sf_by_beds)
  X['sf'] = sf.mask(sf_is_zero & sf_fill.notna(), sf_fill)

  # Impute null long/lat/elev with the mean of the neighborhood (nid).
  # groupby-mean ignores nulls; a neighborhood with no known value stays null.
  for geo_col in ('longitude', 'latitude', 'elevation'):
    X[geo_col] = X[geo_col].fillna(X.groupby('nid')[geo_col].transform('mean'))

  X['zip'] = X['zip'].astype(int)

  # Fill rooms zero values by adding beds and baths
  X['rooms'] = X['rooms'].where(X['rooms'] != 0, X['beds'] + X['baths'])

  # Fill baths zero values with 1
  X.loc[(X['baths'] == 0), 'baths'] = 1

  # Fill lot_sf zero values by using lot_acres to calc (43560 sq ft per acre)
  X['lot_sf'] = X['lot_sf'].where(X['lot_sf'] != 0, X['lot_acres'] * 43560)

  # Fill lot_acres zero values by using lot_sf to calc
  X['lot_acres'] = X['lot_acres'].where(X['lot_acres'] != 0, X['lot_sf'] / 43560)

  # Count the comma-separated entries in each list-like text column. A single
  # entry (including the '0' placeholder seen in the data) counts as 1; values
  # are stringified first so a missing value also counts as 1 instead of crashing.
  for list_col, count_col in (('drive_side', 'ds_count'),
                              ('parking', 'parking_count'),
                              ('views', 'view_count')):
    X[count_col] = X[list_col].astype(str).str.count(',') + 1

  # Engineer new feature mean_neighbor_price THIS TAKES 1.5 HOURS TO RUN
  # Output was saved to nhoods files and is loaded later to save time
  def neighbor_mean(sqft, source_latitude, source_longitude):
    """Price per square foot implied by up to three comparable sales in X.

    Comparable rows are those whose sf lies within +/-10% of sqft. They are
    sorted by year_sold (newest first) and then by distance to the source
    point (nearest first); the mean sale_price of the first three is divided
    by sqft. Not called in this function: the call below is commented out.
    NOTE(review): X also holds the row being priced, so it can presumably be
    picked as one of its own comparables - verify before reuse.
    """
    source_latlong = source_latitude, source_longitude
    source_table = X[(X['sf'] >= (sqft * .9)) & (X['sf'] <= (sqft * 1.1))]
    target_table = pd.DataFrame(source_table, columns = ['latitude', 'longitude', 'year_sold', 'sale_price'])

    def get_distance(row):
      target_latlong = row['latitude'], row['longitude']
      return get_geodesic_distance(target_latlong, source_latlong).meters

    target_table['distance'] = target_table.apply(get_distance, axis=1)

    # Get the nearest 3 locations
    nearest_target_table = target_table.sort_values(['year_sold', 'distance'], ascending=[False, True])[:3]

    return nearest_target_table['sale_price'].mean()/sqft

  #nhoods = X[['sf', 'longitude', 'latitude']].copy()
  #nhoods['mean_hood_ppsf'] = X.apply(lambda x: neighbor_mean(x['sf'], x['latitude'], x['longitude']), axis=1)
  #nhoods.to_csv('/content/nhoods.csv')

#   # Features with nan's that should be zero's
#   cols_with_nans = ['lot_sf', 'lot_acres', 'year_built', 'zoning']
#   for col in cols_with_nans:
#     X[col] = X[col].replace(np.nan, 0)

  # Engineer a price_sf column
  #X['price_sf'] = X['sale_price'] / X['sf']

  # Drop unneeded columns
  unneeded_columns = ['sale_date', 'on_market_date', 'city', 'state', 'street_no', 'street_name', 'street_suffix',
                      'day_on_market', 'month_on_market', 'year_on_market', 'month_sold', 'day_sold', 'orig_list_price', 'cdom',
                      'sf_source', 'area', 'sf_source_decs', 'lot_acres', 'views', 'parking', 'full_address', 'zoning']
  X = X.drop(columns=unneeded_columns)

  return X

# Run the same wrangling over both partitions (train first, then test)
train, test = (engineer_features(partition) for partition in (train, test))

train.head()



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy



A 

Unnamed: 0,longitude,latitude,elevation,zip,district_no,district_desc,nid,neighborhood,sale_price,rooms,baths,beds,sf,lot_sf,year_built,lot_desc,drive_side,park_leased,num_parking,shopping,transportation,type,year_sold,ds_count,parking_count,view_count
0,-122.50965,37.78028,200.83,94121,1,SF District 1,1050,1 - Outer Richmond,1260000,8.0,3.5,4,2691.0,,1969,"RGLR,FNCD","PVDW,PVSW",0,2,4BLK,1BLK,3STR,2013,2,2,4
1,-122.50929,37.762608,23.21,94122,2,SF District 2,2030,2 - Outer Sunset,1075000,9.0,3.0,4,2437.0,3000.0,1947,RGLR,0,ONST,1,2BLK,1BLK,0,2016,1,4,1
2,-122.50929,37.762608,23.21,94122,2,SF District 2,2030,2 - Outer Sunset,1525000,9.0,5.0,5,2597.0,3000.0,1947,RGLR,0,ONST,1,2BLK,1BLK,0,2017,1,4,1
3,-122.50924,37.77733,189.11,94121,1,SF District 1,1050,1 - Outer Richmond,715000,5.0,1.0,2,1312.0,,1939,RGLR,"PVDW,PVSW",0,2,3BLK,1BLK,"ATAC,2STR,FIXR",2010,2,1,3
4,-122.50919,37.776695,175.89,94121,1,SF District 1,1050,1 - Outer Richmond,1595000,8.0,4.0,4,3307.0,,1951,RGLR,PVDW,0,3,4BLK,2BLK,3STR,2012,1,4,4


In [0]:
def _attach_comp_price(sales, comps):
  """Join comps' mean_hood_ppsf onto sales (sales index == comps.old_index) as comp_price_sf."""
  joined = sales.merge(comps[['old_index', 'mean_hood_ppsf']], left_index=True, right_on='old_index')
  joined = joined.drop(columns=['old_index'])
  return joined.rename(columns={'mean_hood_ppsf': 'comp_price_sf'})

# Load the saved comparable-sales price per square foot for each partition
rnhoods = pd.read_csv('https://raw.githubusercontent.com/JimKing100/Jestimate/master/data/nhoods-train.csv')
tnhoods = pd.read_csv('https://raw.githubusercontent.com/JimKing100/Jestimate/master/data/nhoods-test.csv')

train = _attach_comp_price(train, rnhoods)
test = _attach_comp_price(test, tnhoods)

In [720]:
# Print train's (rows, columns) and display its first rows as a sanity check
print(train.shape)
train.head()

(17618, 27)


Unnamed: 0,longitude,latitude,elevation,zip,district_no,district_desc,nid,neighborhood,sale_price,rooms,baths,beds,sf,lot_sf,year_built,lot_desc,drive_side,park_leased,num_parking,shopping,transportation,type,year_sold,ds_count,parking_count,view_count,comp_price_sf
0,-122.50965,37.78028,200.83,94121,1,SF District 1,1050,1 - Outer Richmond,1260000,8.0,3.5,4,2691.0,,1969,"RGLR,FNCD","PVDW,PVSW",0,2,4BLK,1BLK,3STR,2013,2,2,4,682.521987
1,-122.50929,37.762608,23.21,94122,2,SF District 2,2030,2 - Outer Sunset,1075000,9.0,3.0,4,2437.0,3000.0,1947,RGLR,0,ONST,1,2BLK,1BLK,0,2016,1,4,1,675.030776
2,-122.50929,37.762608,23.21,94122,2,SF District 2,2030,2 - Outer Sunset,1525000,9.0,5.0,5,2597.0,3000.0,1947,RGLR,0,ONST,1,2BLK,1BLK,0,2017,1,4,1,606.488256
3,-122.50924,37.77733,189.11,94121,1,SF District 1,1050,1 - Outer Richmond,715000,5.0,1.0,2,1312.0,,1939,RGLR,"PVDW,PVSW",0,2,3BLK,1BLK,"ATAC,2STR,FIXR",2010,2,1,3,978.150406
4,-122.50919,37.776695,175.89,94121,1,SF District 1,1050,1 - Outer Richmond,1595000,8.0,4.0,4,3307.0,,1951,RGLR,PVDW,0,3,4BLK,2BLK,3STR,2012,1,4,4,869.972785


In [721]:
# Print test's (rows, columns) and display its first rows as a sanity check
print(test.shape)
test.head()

(1879, 27)


Unnamed: 0,longitude,latitude,elevation,zip,district_no,district_desc,nid,neighborhood,sale_price,rooms,baths,beds,sf,lot_sf,year_built,lot_desc,drive_side,park_leased,num_parking,shopping,transportation,type,year_sold,ds_count,parking_count,view_count,comp_price_sf
0,-122.50919,37.776695,175.89,94121,1,SF District 1,1050,1 - Outer Richmond,3310000,8.0,4.0,4,3735.0,3959.0,1951,0,0,ONST,3,3BLK,1BLK,2STR,2018,1,4,8,887.99643
2,-122.509056,37.75801,23.01,94122,2,SF District 2,2030,2 - Outer Sunset,1830000,7.0,3.0,4,2120.0,3072.0,1948,"RGLR,OCNF",0,ONST,1,2BLK,2BLK,FULL,2018,1,1,1,796.383648
4,-122.50902,37.758636,22.7,94122,2,SF District 2,2030,2 - Outer Sunset,1300000,3.0,1.0,2,1087.0,1245.0,1944,0,"PVDW,PVSW",ONST,1,1BLK,1BLK,0,2018,2,1,1,1232.75069
5,-122.50898,37.77368,40.08,94121,1,SF District 1,1050,1 - Outer Richmond,1300000,6.0,1.5,2,1326.0,2696.0,1942,"RGLR,LEVL,FNCD","PVDW,PVSW",ONST,2,1BLK,1BLK,"JR,2STR",2018,2,1,1,1045.751634
7,-122.50827,37.778465,225.66,94121,1,SF District 1,1050,1 - Outer Richmond,1865000,6.0,2.0,3,1800.0,3000.0,1931,RGLR,"PVDW,PVSW",ONST,2,0,1BLK,"FULL,2STR",2018,2,4,1,904.62963


In [0]:
#pandas_profiling.ProfileReport(train)

In [723]:
# Split train by year sold: rows before `cutoff` stay in train, rows from `cutoff` onward become val
# NOTE: train is overwritten here, so re-running this cell alone leaves val empty.
cutoff = 2017
temp = train.copy()
train = temp[temp['year_sold'] < cutoff]
val = temp[temp['year_sold'] >= cutoff]
print(train.shape, val.shape, test.shape)

(15686, 27) (1932, 27) (1879, 27)


In [724]:
# Encode and fit a linear regression model

target = 'sale_price'
features = train.columns.drop(target)

# Separate every partition into its feature frame and its target series
X_train, y_train = train[features], train[target]
X_val, y_val = val[features], val[target]
X_test, y_test = test[features], test[target]

# ce.OrdinalEncoder -> SimpleImputer(mean) -> LinearRegression, fitted on train only
pipeline = make_pipeline(ce.OrdinalEncoder(), SimpleImputer(strategy='mean'), LinearRegression())
pipeline.fit(X_train, y_train)

def _regression_scores(actual, predicted):
  """Return (mse, rmse, mae, r2) for one set of predictions."""
  mse = mean_squared_error(actual, predicted)
  return mse, np.sqrt(mse), mean_absolute_error(actual, predicted), r2_score(actual, predicted)

# Print regression metrics for validation
y_pred = pipeline.predict(X_val)
val_mse, val_rmse, val_mae, val_r2 = _regression_scores(y_val, y_pred)
print('Validation Mean Absolute Error:', val_mae)
print('Validation R^2:', val_r2)
print('\n')

# Print regression metrics for test
ty_pred = pipeline.predict(X_test)
test_mse, test_rmse, test_mae, test_r2 = _regression_scores(y_test, ty_pred)
print('Test Mean Absolute Error:', test_mae)
print('Test R^2:', test_r2)
print('\n')

Validation Mean Absolute Error: 251052.87685525455
Validation R^2: 0.8040942501754428


Test Mean Absolute Error: 267695.5170381311
Test R^2: 0.7855862247861871




In [725]:
# Display the first rows of the training feature frame
X_train.head()

Unnamed: 0,longitude,latitude,elevation,zip,district_no,district_desc,nid,neighborhood,rooms,baths,beds,sf,lot_sf,year_built,lot_desc,drive_side,park_leased,num_parking,shopping,transportation,type,year_sold,ds_count,parking_count,view_count,comp_price_sf
0,-122.50965,37.78028,200.83,94121,1,SF District 1,1050,1 - Outer Richmond,8.0,3.5,4,2691.0,,1969,"RGLR,FNCD","PVDW,PVSW",0,2,4BLK,1BLK,3STR,2013,2,2,4,682.521987
1,-122.50929,37.762608,23.21,94122,2,SF District 2,2030,2 - Outer Sunset,9.0,3.0,4,2437.0,3000.0,1947,RGLR,0,ONST,1,2BLK,1BLK,0,2016,1,4,1,675.030776
3,-122.50924,37.77733,189.11,94121,1,SF District 1,1050,1 - Outer Richmond,5.0,1.0,2,1312.0,,1939,RGLR,"PVDW,PVSW",0,2,3BLK,1BLK,"ATAC,2STR,FIXR",2010,2,1,3,978.150406
4,-122.50919,37.776695,175.89,94121,1,SF District 1,1050,1 - Outer Richmond,8.0,4.0,4,3307.0,,1951,RGLR,PVDW,0,3,4BLK,2BLK,3STR,2012,1,4,4,869.972785
5,-122.509186,37.761005,23.69,94122,2,SF District 2,2030,2 - Outer Sunset,4.0,1.0,3,1168.0,,1947,"RGLR,LEVL",PVDW,0,2,0,1BLK,ATAC,2009,1,3,1,866.438356


In [726]:
# Attach the test-set predictions and the signed error (sale_price - prediction)
# to a copy of test, then truncate both new columns to whole numbers.
final = (
  test.assign(prediction=ty_pred)
      .assign(difference=lambda frame: frame['sale_price'] - frame['prediction'])
      .astype({'prediction': int, 'difference': int})
)
final.head()

Unnamed: 0,longitude,latitude,elevation,zip,district_no,district_desc,nid,neighborhood,sale_price,rooms,baths,beds,sf,lot_sf,year_built,lot_desc,drive_side,park_leased,num_parking,shopping,transportation,type,year_sold,ds_count,parking_count,view_count,comp_price_sf,prediction,difference
0,-122.50919,37.776695,175.89,94121,1,SF District 1,1050,1 - Outer Richmond,3310000,8.0,4.0,4,3735.0,3959.0,1951,0,0,ONST,3,3BLK,1BLK,2STR,2018,1,4,8,887.99643,3430200,-120200
2,-122.509056,37.75801,23.01,94122,2,SF District 2,2030,2 - Outer Sunset,1830000,7.0,3.0,4,2120.0,3072.0,1948,"RGLR,OCNF",0,ONST,1,2BLK,2BLK,FULL,2018,1,1,1,796.383648,1831371,-1371
4,-122.50902,37.758636,22.7,94122,2,SF District 2,2030,2 - Outer Sunset,1300000,3.0,1.0,2,1087.0,1245.0,1944,0,"PVDW,PVSW",ONST,1,1BLK,1BLK,0,2018,2,1,1,1232.75069,1355677,-55677
5,-122.50898,37.77368,40.08,94121,1,SF District 1,1050,1 - Outer Richmond,1300000,6.0,1.5,2,1326.0,2696.0,1942,"RGLR,LEVL,FNCD","PVDW,PVSW",ONST,2,1BLK,1BLK,"JR,2STR",2018,2,1,1,1045.751634,1489655,-189655
7,-122.50827,37.778465,225.66,94121,1,SF District 1,1050,1 - Outer Richmond,1865000,6.0,2.0,3,1800.0,3000.0,1931,RGLR,"PVDW,PVSW",ONST,2,0,1BLK,"FULL,2STR",2018,2,4,1,904.62963,1794043,70956


In [727]:
# Display final's (rows, columns)
final.shape

(1879, 29)

In [0]:
# Calculate the agent_pred using the mean neighborhood price/sf * square footage of the house
def agent_calc(nhood, sqft, nhood_dict=None):
  """Return the price/sf looked up for nhood multiplied by sqft.

  Args:
    nhood: key identifying the neighborhood in nhood_dict.
    sqft: square footage of the house.
    nhood_dict: optional mapping of neighborhood key -> price per square foot.
      When omitted, a module-level variable named nhood_dict is used, which
      keeps existing two-argument calls working.

  Returns:
    nhood_dict[nhood] * sqft

  Raises:
    NameError: if no mapping is passed and no module-level nhood_dict exists.
    KeyError: if nhood is not a key of the mapping.
  """
  if nhood_dict is None:
    # The parameter shadows the global of the same name, so read it explicitly.
    nhood_dict = globals().get('nhood_dict')
    if nhood_dict is None:
      raise NameError("name 'nhood_dict' is not defined")
  a_pred = nhood_dict[nhood] * sqft
  return a_pred

# Signed prediction error expressed as a fraction of the prediction (difference / prediction).
# NOTE(review): the denominator is the prediction, not sale_price - confirm this is the intended error definition.
final['pred_percent'] = final['difference']/final['prediction']

In [729]:
# Display the first 25 rows of final, including prediction, difference and pred_percent
final.head(25)

Unnamed: 0,longitude,latitude,elevation,zip,district_no,district_desc,nid,neighborhood,sale_price,rooms,baths,beds,sf,lot_sf,year_built,lot_desc,drive_side,park_leased,num_parking,shopping,transportation,type,year_sold,ds_count,parking_count,view_count,comp_price_sf,prediction,difference,pred_percent
0,-122.50919,37.776695,175.89,94121,1,SF District 1,1050,1 - Outer Richmond,3310000,8.0,4.0,4,3735.0,3959.0,1951,0,0,ONST,3,3BLK,1BLK,2STR,2018,1,4,8,887.99643,3430200,-120200,-0.035042
2,-122.509056,37.75801,23.01,94122,2,SF District 2,2030,2 - Outer Sunset,1830000,7.0,3.0,4,2120.0,3072.0,1948,"RGLR,OCNF",0,ONST,1,2BLK,2BLK,FULL,2018,1,1,1,796.383648,1831371,-1371,-0.000749
4,-122.50902,37.758636,22.7,94122,2,SF District 2,2030,2 - Outer Sunset,1300000,3.0,1.0,2,1087.0,1245.0,1944,0,"PVDW,PVSW",ONST,1,1BLK,1BLK,0,2018,2,1,1,1232.75069,1355677,-55677,-0.04107
5,-122.50898,37.77368,40.08,94121,1,SF District 1,1050,1 - Outer Richmond,1300000,6.0,1.5,2,1326.0,2696.0,1942,"RGLR,LEVL,FNCD","PVDW,PVSW",ONST,2,1BLK,1BLK,"JR,2STR",2018,2,1,1,1045.751634,1489655,-189655,-0.127315
7,-122.50827,37.778465,225.66,94121,1,SF District 1,1050,1 - Outer Richmond,1865000,6.0,2.0,3,1800.0,3000.0,1931,RGLR,"PVDW,PVSW",ONST,2,0,1BLK,"FULL,2STR",2018,2,4,1,904.62963,1794043,70956,0.039551
8,-122.50825,37.777084,193.52,94121,1,SF District 1,1050,1 - Outer Richmond,3200000,9.5,4.5,5,3221.0,2696.0,1941,"RGLR,LEVL,FNCD","PVDW,PVSW",ONST,1,0,0,2STR,2018,2,4,4,1069.12967,2909914,290085,0.099689
9,-122.50824,37.75144,24.55,94122,2,SF District 2,2030,2 - Outer Sunset,1900000,5.0,1.0,2,995.0,1957.0,1944,0,0,ONST,2,0,0,0,2018,1,1,1,1407.035176,1411676,488323,0.345917
10,-122.50816,37.761627,28.22,94122,2,SF District 2,2030,2 - Outer Sunset,1200000,5.0,1.0,2,1177.0,3000.0,1946,"RGLR,LEVL","PVDW,PVSW",ONST,1,1BLK,1BLK,2STR,2018,2,4,1,1101.812518,1394305,-194305,-0.139356
11,-122.508125,37.776707,175.96,94121,1,SF District 1,1050,1 - Outer Richmond,1711400,6.0,3.0,3,2027.0,3000.0,1925,RGLR,PVDW,ONST,1,0,2BLK,DETC,2018,1,2,3,782.996218,1904634,-193234,-0.101455
12,-122.50788,37.78049,254.55,94121,1,SF District 1,1050,1 - Outer Richmond,1750000,8.0,2.0,3,2305.0,1999.0,1941,0,0,ONST,1,0,1BLK,2STR,2018,1,2,1,788.141721,1915588,-165588,-0.086442


In [730]:
def _share_within(errors, tolerance):
  """Fraction of all rows whose error lies in [-tolerance, +tolerance] (bounds included)."""
  return errors.between(-tolerance, tolerance).sum() / len(errors)

# Median signed error and the share of predictions inside each error band
pct_error = final['pred_percent']
pred_median_error = pct_error.median()
pred_five_percent = _share_within(pct_error, .05)
pred_ten_percent = _share_within(pct_error, .10)
pred_twenty_percent = _share_within(pct_error, .20)

print('Median Error - %.4f%%' % (pred_median_error * 100))
print('Prediction Within 5 percent - %.4f%%' % (pred_five_percent * 100))
print('Prediction Within 10 percent - %.4f%%' % (pred_ten_percent * 100))
print('Prediction Within 20 percent - %.4f%%' % (pred_twenty_percent * 100))

Median Error - -1.6203%
Prediction Within 5 percent - 25.5455%
Prediction Within 10 percent - 47.1527%
Prediction Within 20 percent - 78.7653%


In [0]:
# mae = np.sum(np.absolute((final['agent_diff'])))/final['agent_diff'].count()
# print(mae)

In [0]:
# data = final[['full_address', 'neighborhood', 'nid', 'sale_price', 'prediction', 'difference']]
# data = data.drop_duplicates()

# data.head()

In [0]:
# data.shape

In [0]:
# data.to_csv('/content/display_data.csv')