<a href="https://colab.research.google.com/github/JimKing100/Jestimate/blob/master/linear_regression_comps/LR_Comps2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [210]:
!pip install category_encoders==2.0.0
!pip install fiona
!pip install geopy



In [0]:
# Import libraries
import pandas as pd
import numpy as np
import math

import pandas_profiling

from datetime import datetime

import matplotlib.pyplot as plt
import plotly.express as px

from sklearn.cluster import KMeans
from sklearn.preprocessing import OrdinalEncoder
from sklearn.linear_model import LinearRegression
from sklearn.feature_selection import f_regression, SelectKBest
from sklearn.pipeline import make_pipeline
import category_encoders as ce
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score, classification_report

import statsmodels.api as sm

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegressionCV
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split, RandomizedSearchCV

from geopy.distance import vincenty as get_geodesic_distance

from bokeh.models import ColumnDataSource, TableColumn, DataTable
from bokeh.layouts import column
from bokeh.models.widgets import TextInput
from bokeh.plotting import curdoc

In [212]:
# Load SF real estate data - 10 years (2009-2018) of single family home sales in San Francisco downloaded from the SF MLS
# Longitude and latitude were added to the csv file prior to loading using geocoding.geo.census.gov 
df = pd.read_csv('https://raw.githubusercontent.com/JimKing100/Jestimate/master/data/SF-SFR-Sales-Final2d.csv')

# Rename subdistr_desc to neighborhood
df = df.rename(columns={'subdist_no': 'nid', 'subdist_desc': 'neighborhood'})

# Create subset with outliers removed - 1.6%of the data
mask = (
  (df['baths'] < 6) &
  (df['beds'] < 7) &
  (df['beds'] > 0) &
  (df['lot_sf'] < 10000) &
  (df['rooms'] < 13) &
  (df['sale_price'] < 10000000) &
  (df['sf'] < 10000) &
  (df['sf'] > 100)
)
df = df[mask]

# Check the data
print(df.shape)
df.head(5)

(19497, 39)


Unnamed: 0,longitude,latitude,elevation,full_address,city,state,street_no,street_name,street_suffix,zip,area,district_no,district_desc,nid,neighborhood,on_market_date,cdom,orig_list_price,sale_date,sale_price,rooms,baths,beds,sf_source,sf_source_decs,sf,lot_acres,lot_sf,year_built,zoning,lot_desc,drive_side,parking,park_leased,num_parking,shopping,transportation,type,views
0,-122.50965,37.78028,200.83,"2645 El Camino Del Mar, San Francisco, CA 94121",San Francisco,CA,2645,El Camino Del Mar,,94121,1050,1,SF District 1,1050,1 - Outer Richmond,3/14/13,4,1095000,3/22/13,1260000,8,3.5,4,T,Per Tax Records,2691,,0,1969,RH2,"RGLR,FNCD","PVDW,PVSW","ATCH,GARG",0,2,4BLK,1BLK,3STR,"PNRM,OCEN,PARK,GRDN"
1,-122.50929,37.762608,23.21,"1278 La Playa St, San Francisco, CA 94122",San Francisco,CA,1278,La Playa,St,94122,2030,2,SF District 2,2030,2 - Outer Sunset,12/3/15,144,1250000,4/25/16,1075000,9,3.0,4,T,Per Tax Records,2437,0.0689,3000,1947,RM1,RGLR,0,"ATCH,GARG,ATDR,INAC",ONST,1,2BLK,1BLK,0,0
2,-122.50929,37.762608,23.21,"1278 La Playa St, San Francisco, CA 94122",San Francisco,CA,1278,La Playa,St,94122,2030,2,SF District 2,2030,2 - Outer Sunset,5/18/17,36,1395000,8/17/17,1525000,9,5.0,5,D,Per Architect,2597,0.0689,3000,1947,RM1,RGLR,0,"ATCH,GARG,ATDR,INAC",ONST,1,2BLK,1BLK,0,0
3,-122.50924,37.77733,189.11,"590 48th Ave, San Francisco, CA 94121",San Francisco,CA,590,48th,Ave,94121,1050,1,SF District 1,1050,1 - Outer Richmond,6/27/10,42,725000,8/20/10,715000,5,1.0,2,T,Per Tax Records,1312,,0,1939,RH1,RGLR,"PVDW,PVSW",GARG,0,2,3BLK,1BLK,"ATAC,2STR,FIXR","WATR,OCEN,PARK"
4,-122.50919,37.776695,175.89,"618 48th Ave, San Francisco, CA 94121",San Francisco,CA,618,48th,Ave,94121,1050,1,SF District 1,1050,1 - Outer Richmond,6/7/12,24,1595000,7/13/12,1595000,8,4.0,4,T,Per Tax Records,3307,,0,1951,,RGLR,PVDW,"ATCH,GARG,ATDR,INAC",0,3,4BLK,2BLK,3STR,"PNRM,CTYL,OCEN,PARK"


In [0]:
#pandas_profiling.ProfileReport(df)

In [214]:
# Train, test split on date of 01/01/2018
df['sale_date'] = pd.to_datetime(df['sale_date'], infer_datetime_format=True)
df['year_sold'] = df['sale_date'].dt.year
df['month_sold'] = df['sale_date'].dt.month
df['day_sold'] = df['sale_date'].dt.day

df['on_market_date'] = pd.to_datetime(df['on_market_date'], infer_datetime_format=True)
df['year_on_market'] = df['on_market_date'].dt.year
df['month_on_market'] = df['on_market_date'].dt.month
df['day_on_market'] = df['on_market_date'].dt.day

low_cutoff = 2008
high_cutoff = 2018
train = df[(df['year_sold'] >= low_cutoff) & (df['year_sold'] < high_cutoff)]
test  = df[df['year_sold'] >= high_cutoff]
print(train.shape)
print(test.shape)

(17618, 45)
(1879, 45)


In [215]:
# Wrangle the data for train and test
def engineer_features(X):
  
  # Fill house square foot zero values with the average house square footage by bedroom for all single family homes in SF
  averagesf_data = X.groupby('beds').sf.mean()

  # Use average sf by bedroom for each 0 value in each bedroom group 1 - 6 bedrooms
  for i in range(1, 7): 
    X.loc[(X['sf'] == 0) & (X['beds'] == i), 'sf'] = averagesf_data.loc[i]
  
  # Impute mean for null long/lat/elev based on mean of of neighborhood
  def feature_calc(feature, nid, f_dict):
    if math.isnan(feature):
      if (nid) in f_dict:
        new_feature = f_dict[nid]
        return new_feature
    else:
      return feature
    
    return feature
  
  temp = X[~X['longitude'].isna()].groupby(['nid'])['longitude'].mean()
  long_dict = dict(temp)
  X['longitude'] = X.apply(lambda x: feature_calc(x['longitude'], x['nid'], long_dict), axis=1)
  
  temp = X[~X['latitude'].isna()].groupby(['nid'])['latitude'].mean()
  lat_dict = dict(temp)
  X['latitude'] = X.apply(lambda x: feature_calc(x['latitude'], x['nid'], lat_dict), axis=1)
  
  temp = X[~X['elevation'].isna()].groupby(['nid'])['elevation'].mean()
  elev_dict = dict(temp)
  X['elevation'] = X.apply(lambda x: feature_calc(x['elevation'], x['nid'], elev_dict), axis=1)
  
  X['zip'] = X['zip'].astype(int)
  
  # Fill rooms zero values by adding beds and baths
  def room_calc(rooms_val, beds_val, baths_val):
    if rooms_val == 0:
      total = beds_val + baths_val
    else:
      total = rooms_val
      
    return total
  
  X['rooms'] = X.apply(lambda x: room_calc(x['rooms'], x['beds'], x['baths']), axis=1)
  
  # Fill baths zero values by adding beds and baths
  X.loc[(X['baths'] == 0), 'baths'] = 1
  
  # Fill lot_sf zero values by using lot_acres to calc
  def lotsf_calc(lotsf_val, lotacres_val):
    if lotsf_val == 0:
      total = lotacres_val * 43560
    else:
      total = lotsf_val
      
    return total
  
  X['lot_sf'] = X.apply(lambda x: lotsf_calc(x['lot_sf'], x['lot_acres']), axis=1)
  
  # Fill lot_acres zero values by using lot_sf to calc
  def lotacres_calc(lotacres_val, lotsf_val):
    if lotacres_val == 0:
      total = lotsf_val / 43560
    else:
      total = lotacres_val
      
    return total
  
  X['lot_acres'] = X.apply(lambda x: lotacres_calc(x['lot_acres'], x['lot_sf']), axis=1)
  
  # Engineer new feature mean_neighbor_price THIS TAKES 1.5 HOURS TO RUN
  # Output was saved to nhoods files and is loaded later to save time
  nhoods = X[['sf', 'longitude', 'latitude']]
  
  def neighbor_mean(sqft, source_latitude, source_longitude):
    
    source_latlong = source_latitude, source_longitude
    source_table = X[(X['sf'] >= (sqft * .9)) & (X['sf'] <= (sqft * 1.1))]
    target_table = pd.DataFrame(source_table, columns = ['latitude', 'longitude', 'year_sold', 'sale_price']) 

    def get_distance(row):
        target_latlong = row['latitude'], row['longitude']
        return get_geodesic_distance(target_latlong, source_latlong).meters

    target_table['distance'] = target_table.apply(get_distance, axis=1)

    # Get the nearest 3 locations
    nearest_target_table = target_table.sort_values(['year_sold', 'distance'], ascending=[False, True])[:3]

    return nearest_target_table['sale_price'].mean()/sqft

  #nhoods['mean_hood_ppsf'] = X.apply(lambda x: neighbor_mean(x['sf'], x['latitude'], x['longitude']), axis=1)
  #nhoods.to_csv('/content/nhoods.csv')
  
#   # Features with nan's that should be zero's
#   cols_with_nans = ['lot_sf', 'lot_acres', 'year_built', 'zoning']
#   for col in cols_with_nans:
#     X[col] = X[col].replace(np.nan, 0)
    
  # Engineer a price_sf column
  #X['price_sf'] = X['sale_price'] / X['sf']
  
  # Drop unneeded columns
  unneeded_columns = ['sale_date', 'on_market_date', 'city', 'state', 'street_no', 'street_name', 'street_suffix',
                      'day_on_market', 'month_on_market', 'year_on_market', 'month_sold', 'day_sold', 'orig_list_price', 'cdom',
                      'sf_source', 'area', 'sf_source_decs', 'lot_acres']
  X = X.drop(columns=unneeded_columns)
  
  return X

train = engineer_features(train)
test = engineer_features(test)

train.head()



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy



A 

Unnamed: 0,longitude,latitude,elevation,full_address,zip,district_no,district_desc,nid,neighborhood,sale_price,rooms,baths,beds,sf,lot_sf,year_built,zoning,lot_desc,drive_side,parking,park_leased,num_parking,shopping,transportation,type,views,year_sold
0,-122.50965,37.78028,200.83,"2645 El Camino Del Mar, San Francisco, CA 94121",94121,1,SF District 1,1050,1 - Outer Richmond,1260000,8.0,3.5,4,2691.0,,1969,RH2,"RGLR,FNCD","PVDW,PVSW","ATCH,GARG",0,2,4BLK,1BLK,3STR,"PNRM,OCEN,PARK,GRDN",2013
1,-122.50929,37.762608,23.21,"1278 La Playa St, San Francisco, CA 94122",94122,2,SF District 2,2030,2 - Outer Sunset,1075000,9.0,3.0,4,2437.0,3000.0,1947,RM1,RGLR,0,"ATCH,GARG,ATDR,INAC",ONST,1,2BLK,1BLK,0,0,2016
2,-122.50929,37.762608,23.21,"1278 La Playa St, San Francisco, CA 94122",94122,2,SF District 2,2030,2 - Outer Sunset,1525000,9.0,5.0,5,2597.0,3000.0,1947,RM1,RGLR,0,"ATCH,GARG,ATDR,INAC",ONST,1,2BLK,1BLK,0,0,2017
3,-122.50924,37.77733,189.11,"590 48th Ave, San Francisco, CA 94121",94121,1,SF District 1,1050,1 - Outer Richmond,715000,5.0,1.0,2,1312.0,,1939,RH1,RGLR,"PVDW,PVSW",GARG,0,2,3BLK,1BLK,"ATAC,2STR,FIXR","WATR,OCEN,PARK",2010
4,-122.50919,37.776695,175.89,"618 48th Ave, San Francisco, CA 94121",94121,1,SF District 1,1050,1 - Outer Richmond,1595000,8.0,4.0,4,3307.0,,1951,,RGLR,PVDW,"ATCH,GARG,ATDR,INAC",0,3,4BLK,2BLK,3STR,"PNRM,CTYL,OCEN,PARK",2012


In [0]:
rnhoods = pd.read_csv('https://raw.githubusercontent.com/JimKing100/Jestimate/master/data/nhoods-train.csv')
tnhoods = pd.read_csv('https://raw.githubusercontent.com/JimKing100/Jestimate/master/data/nhoods-test.csv')
train = pd.merge(train, rnhoods[['old_index', 'mean_hood_ppsf']], left_index=True, right_on='old_index')
test = pd.merge(test, tnhoods[['old_index', 'mean_hood_ppsf']], left_index=True, right_on='old_index')
train = train.drop(columns=['old_index'])
test = test.drop(columns=['old_index'])

In [217]:
print(train.shape)
train.head()

(17618, 28)


Unnamed: 0,longitude,latitude,elevation,full_address,zip,district_no,district_desc,nid,neighborhood,sale_price,rooms,baths,beds,sf,lot_sf,year_built,zoning,lot_desc,drive_side,parking,park_leased,num_parking,shopping,transportation,type,views,year_sold,mean_hood_ppsf
0,-122.50965,37.78028,200.83,"2645 El Camino Del Mar, San Francisco, CA 94121",94121,1,SF District 1,1050,1 - Outer Richmond,1260000,8.0,3.5,4,2691.0,,1969,RH2,"RGLR,FNCD","PVDW,PVSW","ATCH,GARG",0,2,4BLK,1BLK,3STR,"PNRM,OCEN,PARK,GRDN",2013,682.521987
1,-122.50929,37.762608,23.21,"1278 La Playa St, San Francisco, CA 94122",94122,2,SF District 2,2030,2 - Outer Sunset,1075000,9.0,3.0,4,2437.0,3000.0,1947,RM1,RGLR,0,"ATCH,GARG,ATDR,INAC",ONST,1,2BLK,1BLK,0,0,2016,675.030776
2,-122.50929,37.762608,23.21,"1278 La Playa St, San Francisco, CA 94122",94122,2,SF District 2,2030,2 - Outer Sunset,1525000,9.0,5.0,5,2597.0,3000.0,1947,RM1,RGLR,0,"ATCH,GARG,ATDR,INAC",ONST,1,2BLK,1BLK,0,0,2017,606.488256
3,-122.50924,37.77733,189.11,"590 48th Ave, San Francisco, CA 94121",94121,1,SF District 1,1050,1 - Outer Richmond,715000,5.0,1.0,2,1312.0,,1939,RH1,RGLR,"PVDW,PVSW",GARG,0,2,3BLK,1BLK,"ATAC,2STR,FIXR","WATR,OCEN,PARK",2010,978.150406
4,-122.50919,37.776695,175.89,"618 48th Ave, San Francisco, CA 94121",94121,1,SF District 1,1050,1 - Outer Richmond,1595000,8.0,4.0,4,3307.0,,1951,,RGLR,PVDW,"ATCH,GARG,ATDR,INAC",0,3,4BLK,2BLK,3STR,"PNRM,CTYL,OCEN,PARK",2012,869.972785


In [218]:
print(test.shape)
test.head()

(1879, 28)


Unnamed: 0,longitude,latitude,elevation,full_address,zip,district_no,district_desc,nid,neighborhood,sale_price,rooms,baths,beds,sf,lot_sf,year_built,zoning,lot_desc,drive_side,parking,park_leased,num_parking,shopping,transportation,type,views,year_sold,mean_hood_ppsf
0,-122.50919,37.776695,175.89,"618 48th Ave, San Francisco, CA 94121-2410",94121,1,SF District 1,1050,1 - Outer Richmond,3310000,8.0,4.0,4,3735.0,3959.0,1951,,0,0,"ATCH,GARG,ATDR,INAC",ONST,3,3BLK,1BLK,2STR,"PNRM,CTYL,WATR,SFRN,OCEN,PARK,GRDN,TWNP",2018,887.99643
2,-122.509056,37.75801,23.01,"1516 Great Highway, San Francisco, CA 94122",94122,2,SF District 2,2030,2 - Outer Sunset,1830000,7.0,3.0,4,2120.0,3072.0,1948,RM1,"RGLR,OCNF",0,GARG,ONST,1,2BLK,2BLK,FULL,GRDN,2018,796.383648
4,-122.50902,37.758636,22.7,"1492 La Playa St, San Francisco, CA 94122",94122,2,SF District 2,2030,2 - Outer Sunset,1300000,3.0,1.0,2,1087.0,1245.0,1944,RM1,0,"PVDW,PVSW",ATCH,ONST,1,1BLK,1BLK,0,0,2018,1232.75069
5,-122.50898,37.77368,40.08,"784 48th Ave, San Francisco, CA 94121",94121,1,SF District 1,1050,1 - Outer Richmond,1300000,6.0,1.5,2,1326.0,2696.0,1942,RH2,"RGLR,LEVL,FNCD","PVDW,PVSW",GARG,ONST,2,1BLK,1BLK,"JR,2STR",0,2018,1045.751634
7,-122.50827,37.778465,225.66,"525 47th Ave, San Francisco, CA 94121",94121,1,SF District 1,1050,1 - Outer Richmond,1865000,6.0,2.0,3,1800.0,3000.0,1931,RH1,RGLR,"PVDW,PVSW","ATCH,GARG,ATDR,INAC",ONST,2,0,1BLK,"FULL,2STR",PRTL,2018,904.62963


In [219]:
pandas_profiling.ProfileReport(train)

0,1
Number of variables,29
Number of observations,17618
Total Missing (%),3.6%
Total size in memory,3.9 MiB
Average record size in memory,232.0 B

0,1
Numeric,15
Categorical,12
Boolean,0
Date,0
Text (Unique),0
Rejected,2
Unsupported,0

0,1
Distinct count,20
Unique (%),0.1%
Missing (%),0.0%
Missing (n),0
Infinite (%),0.0%
Infinite (n),0

0,1
Mean,1.984
Minimum,1
Maximum,5.75
Zeros (%),0.0%

0,1
Minimum,1.0
5-th percentile,1.0
Q1,1.0
Median,2.0
Q3,2.5
95-th percentile,4.0
Maximum,5.75
Range,4.75
Interquartile range,1.5

0,1
Standard deviation,0.97809
Coef of variation,0.49298
Kurtosis,0.45412
Mean,1.984
MAD,0.7495
Skewness,0.92808
Sum,34954
Variance,0.95666
Memory size,137.7 KiB

Value,Count,Frequency (%),Unnamed: 3
1.0,6130,34.8%,
2.0,5141,29.2%,
3.0,1886,10.7%,
2.5,1272,7.2%,
3.5,978,5.6%,
1.5,914,5.2%,
4.0,462,2.6%,
4.5,343,1.9%,
1.25,165,0.9%,
5.0,105,0.6%,

Value,Count,Frequency (%),Unnamed: 3
1.0,6130,34.8%,
1.25,165,0.9%,
1.5,914,5.2%,
1.75,29,0.2%,
2.0,5141,29.2%,

Value,Count,Frequency (%),Unnamed: 3
4.75,2,0.0%,
5.0,105,0.6%,
5.25,1,0.0%,
5.5,75,0.4%,
5.75,3,0.0%,

0,1
Distinct count,6
Unique (%),0.0%
Missing (%),0.0%
Missing (n),0
Infinite (%),0.0%
Infinite (n),0

0,1
Mean,2.9501
Minimum,1
Maximum,6
Zeros (%),0.0%

0,1
Minimum,1
5-th percentile,2
Q1,2
Median,3
Q3,3
95-th percentile,5
Maximum,6
Range,5
Interquartile range,1

0,1
Standard deviation,0.94496
Coef of variation,0.32031
Kurtosis,0.20706
Mean,2.9501
MAD,0.70841
Skewness,0.625
Sum,51975
Variance,0.89295
Memory size,137.7 KiB

Value,Count,Frequency (%),Unnamed: 3
3,6995,39.7%,
2,5901,33.5%,
4,3298,18.7%,
5,923,5.2%,
1,325,1.8%,
6,176,1.0%,

Value,Count,Frequency (%),Unnamed: 3
1,325,1.8%,
2,5901,33.5%,
3,6995,39.7%,
4,3298,18.7%,
5,923,5.2%,

Value,Count,Frequency (%),Unnamed: 3
2,5901,33.5%,
3,6995,39.7%,
4,3298,18.7%,
5,923,5.2%,
6,176,1.0%,

0,1
Distinct count,10
Unique (%),0.1%
Missing (%),0.0%
Missing (n),0

0,1
SF District 10,4438
SF District 2,3394
SF District 4,2305
Other values (7),7481

Value,Count,Frequency (%),Unnamed: 3
SF District 10,4438,25.2%,
SF District 2,3394,19.3%,
SF District 4,2305,13.1%,
SF District 5,2094,11.9%,
SF District 9,1638,9.3%,
SF District 3,1468,8.3%,
SF District 1,1384,7.9%,
SF District 7,540,3.1%,
SF District 6,235,1.3%,
SF District 8,122,0.7%,

0,1
Distinct count,10
Unique (%),0.1%
Missing (%),0.0%
Missing (n),0
Infinite (%),0.0%
Infinite (n),0

0,1
Mean,5.5372
Minimum,1
Maximum,10
Zeros (%),0.0%

0,1
Minimum,1
5-th percentile,1
Q1,2
Median,5
Q3,10
95-th percentile,10
Maximum,10
Range,9
Interquartile range,8

0,1
Standard deviation,3.3392
Coef of variation,0.60305
Kurtosis,-1.5415
Mean,5.5372
MAD,3.0284
Skewness,0.23462
Sum,97554
Variance,11.15
Memory size,137.7 KiB

Value,Count,Frequency (%),Unnamed: 3
10,4438,25.2%,
2,3394,19.3%,
4,2305,13.1%,
5,2094,11.9%,
9,1638,9.3%,
3,1468,8.3%,
1,1384,7.9%,
7,540,3.1%,
6,235,1.3%,
8,122,0.7%,

Value,Count,Frequency (%),Unnamed: 3
1,1384,7.9%,
2,3394,19.3%,
3,1468,8.3%,
4,2305,13.1%,
5,2094,11.9%,

Value,Count,Frequency (%),Unnamed: 3
6,235,1.3%,
7,540,3.1%,
8,122,0.7%,
9,1638,9.3%,
10,4438,25.2%,

0,1
Distinct count,14
Unique (%),0.1%
Missing (%),0.0%
Missing (n),0

0,1
0,9176
PVDW,4070
"PVDW,PVSW",3751
Other values (11),621

Value,Count,Frequency (%),Unnamed: 3
0,9176,52.1%,
PVDW,4070,23.1%,
"PVDW,PVSW",3751,21.3%,
PVSW,519,2.9%,
SHDW,31,0.2%,
"PVDW,SHDW",30,0.2%,
"PVDW,PVSW,SHDW",17,0.1%,
"PVDW,PVSW,GRVL",6,0.0%,
GRVL,6,0.0%,
"PVSW,SHDW",5,0.0%,

0,1
Distinct count,12291
Unique (%),69.8%
Missing (%),0.0%
Missing (n),0
Infinite (%),0.0%
Infinite (n),0

0,1
Mean,249.55
Minimum,8.37
Maximum,825.5
Zeros (%),0.0%

0,1
Minimum,8.37
5-th percentile,43.499
Q1,135.81
Median,227.81
Q3,331.14
95-th percentile,562.25
Maximum,825.5
Range,817.13
Interquartile range,195.33

0,1
Standard deviation,153.15
Coef of variation,0.6137
Kurtosis,0.54861
Mean,249.55
MAD,120.38
Skewness,0.86071
Sum,4396600
Variance,23455
Memory size,137.7 KiB

Value,Count,Frequency (%),Unnamed: 3
587.8480415430273,48,0.3%,
624.6382352941177,21,0.1%,
75.95171014492769,16,0.1%,
275.3533716475093,15,0.1%,
159.17337837837832,14,0.1%,
525.6753333333334,14,0.1%,
307.91705645161295,13,0.1%,
171.41644541484706,13,0.1%,
389.3853781512606,12,0.1%,
591.2887136929467,10,0.1%,

Value,Count,Frequency (%),Unnamed: 3
8.37,1,0.0%,
9.55,1,0.0%,
9.86,1,0.0%,
10.02,1,0.0%,
10.04,1,0.0%,

Value,Count,Frequency (%),Unnamed: 3
794.61,1,0.0%,
796.11,1,0.0%,
797.55,1,0.0%,
815.67,1,0.0%,
825.5,1,0.0%,

0,1
Distinct count,16578
Unique (%),94.1%
Missing (%),0.0%
Missing (n),0

0,1
"19 Del Monte St, San Francisco, CA 94112",4
"2 Valdez Ave, San Francisco, CA 94112",3
"4326 Cesar Chavez St, San Francisco, CA 94131",3
Other values (16575),17608

Value,Count,Frequency (%),Unnamed: 3
"19 Del Monte St, San Francisco, CA 94112",4,0.0%,
"2 Valdez Ave, San Francisco, CA 94112",3,0.0%,
"4326 Cesar Chavez St, San Francisco, CA 94131",3,0.0%,
"3964 20th St, San Francisco, CA 94114",3,0.0%,
"1554 Shafter Ave, San Francisco, CA 94124",3,0.0%,
"401 Gates St, San Francisco, CA 94110",3,0.0%,
"491 Ulloa St, San Francisco, CA 94127",3,0.0%,
"2114 28th Ave, San Francisco, CA 94116",3,0.0%,
"2722 Irving St, San Francisco, CA 94122",3,0.0%,
"219 Gennessee St, San Francisco, CA 94112",3,0.0%,

0,1
Distinct count,17618
Unique (%),100.0%
Missing (%),0.0%
Missing (n),0
Infinite (%),0.0%
Infinite (n),0

0,1
Mean,10352
Minimum,0
Maximum,21023
Zeros (%),0.0%

0,1
Minimum,0.0
5-th percentile,1006.9
Q1,4901.2
Median,10208.0
Q3,15818.0
95-th percentile,20009.0
Maximum,21023.0
Range,21023.0
Interquartile range,10917.0

0,1
Standard deviation,6172.3
Coef of variation,0.59623
Kurtosis,-1.2437
Mean,10352
MAD,5377.9
Skewness,0.046027
Sum,182384724
Variance,38098000
Memory size,137.7 KiB

Value,Count,Frequency (%),Unnamed: 3
2047,1,0.0%,
19180,1,0.0%,
15074,1,0.0%,
13027,1,0.0%,
2788,1,0.0%,
741,1,0.0%,
6886,1,0.0%,
4839,1,0.0%,
17133,1,0.0%,
6838,1,0.0%,

Value,Count,Frequency (%),Unnamed: 3
0,1,0.0%,
1,1,0.0%,
2,1,0.0%,
3,1,0.0%,
4,1,0.0%,

Value,Count,Frequency (%),Unnamed: 3
21019,1,0.0%,
21020,1,0.0%,
21021,1,0.0%,
21022,1,0.0%,
21023,1,0.0%,

0,1
Distinct count,10458
Unique (%),59.4%
Missing (%),0.0%
Missing (n),0
Infinite (%),0.0%
Infinite (n),0

0,1
Mean,37.743
Minimum,37.708
Maximum,37.806
Zeros (%),0.0%

0,1
Minimum,37.708
5-th percentile,37.712
Q1,37.728
Median,37.74
Q3,37.756
95-th percentile,37.786
Maximum,37.806
Range,0.098547
Interquartile range,0.027981

0,1
Standard deviation,0.021612
Coef of variation,0.00057262
Kurtosis,-0.052226
Mean,37.743
MAD,0.017264
Skewness,0.6372
Sum,664960
Variance,0.00046709
Memory size,137.7 KiB

Value,Count,Frequency (%),Unnamed: 3
37.73880935014838,48,0.3%,
37.75493761764706,21,0.1%,
37.7305745898551,16,0.1%,
37.7482866883781,15,0.1%,
37.733492677777804,14,0.1%,
37.78556281081082,14,0.1%,
37.714377725806486,13,0.1%,
37.739779915283854,13,0.1%,
37.733090000000004,12,0.1%,
37.736419613445356,12,0.1%,

Value,Count,Frequency (%),Unnamed: 3
37.707893,1,0.0%,
37.708256,1,0.0%,
37.708267,2,0.0%,
37.708275,2,0.0%,
37.70828,1,0.0%,

Value,Count,Frequency (%),Unnamed: 3
37.80584,1,0.0%,
37.805847,1,0.0%,
37.806084000000006,1,0.0%,
37.806340000000006,1,0.0%,
37.80644,1,0.0%,

0,1
Correlation,0.95796

0,1
Distinct count,239
Unique (%),1.4%
Missing (%),0.0%
Missing (n),0

0,1
0,7840
RGLR,3936
"RGLR,LEVL",697
Other values (236),5145

Value,Count,Frequency (%),Unnamed: 3
0,7840,44.5%,
RGLR,3936,22.3%,
"RGLR,LEVL",697,4.0%,
CRNR,541,3.1%,
LEVL,524,3.0%,
"RGLR,FNCD",505,2.9%,
"RGLR,LEVL,FNCD",415,2.4%,
DWNS,396,2.2%,
UPSL,388,2.2%,
FNCD,217,1.2%,

0,1
Distinct count,1426
Unique (%),8.1%
Missing (%),59.2%
Missing (n),10422
Infinite (%),0.0%
Infinite (n),0

0,1
Mean,2816.1
Minimum,1
Maximum,9997
Zeros (%),0.0%

0,1
Minimum,1.0
5-th percentile,1698.0
Q1,2374.0
Median,2626.0
Q3,3000.0
95-th percentile,4748.5
Maximum,9997.0
Range,9996.0
Interquartile range,626.0

0,1
Standard deviation,970.25
Coef of variation,0.34453
Kurtosis,7.4084
Mean,2816.1
MAD,650.86
Skewness,1.9816
Sum,20265000
Variance,941390
Memory size,137.7 KiB

Value,Count,Frequency (%),Unnamed: 3
2495.0,728,4.1%,
2996.0,612,3.5%,
2500.0,587,3.3%,
3000.0,495,2.8%,
1750.0,172,1.0%,
1873.0,108,0.6%,
1751.0,87,0.5%,
1746.0,84,0.5%,
2848.0,80,0.5%,
2809.0,65,0.4%,

Value,Count,Frequency (%),Unnamed: 3
1.0,1,0.0%,
17.0,1,0.0%,
26.0,2,0.0%,
459.0,1,0.0%,
566.0,2,0.0%,

Value,Count,Frequency (%),Unnamed: 3
9600.0,1,0.0%,
9717.0,1,0.0%,
9748.0,1,0.0%,
9901.0,1,0.0%,
9997.0,1,0.0%,

0,1
Distinct count,14448
Unique (%),82.0%
Missing (%),0.0%
Missing (n),0
Infinite (%),0.0%
Infinite (n),0

0,1
Mean,948.92
Minimum,353.38
Maximum,2884.4
Zeros (%),0.0%

0,1
Minimum,353.38
5-th percentile,572.32
Q1,771.21
Median,912.91
Q3,1071.4
95-th percentile,1480.4
Maximum,2884.4
Range,2531.0
Interquartile range,300.21

0,1
Standard deviation,277.52
Coef of variation,0.29245
Kurtosis,3.3798
Mean,948.92
MAD,203.98
Skewness,1.278
Sum,16718000
Variance,77016
Memory size,137.7 KiB

Value,Count,Frequency (%),Unnamed: 3
1242.5,24,0.1%,
793.4059637,17,0.1%,
1146.666667,17,0.1%,
1059.947299,15,0.1%,
915.2380952,14,0.1%,
1060.9957140000001,13,0.1%,
910.5882353000001,13,0.1%,
893.3333332999998,13,0.1%,
1337.066667,12,0.1%,
1157.647059,12,0.1%,

Value,Count,Frequency (%),Unnamed: 3
353.37552739999995,1,0.0%,
357.3333333,1,0.0%,
357.5357536,1,0.0%,
358.35886960000005,1,0.0%,
359.3456691,1,0.0%,

Value,Count,Frequency (%),Unnamed: 3
2777.777778,1,0.0%,
2786.419753,1,0.0%,
2830.448959,1,0.0%,
2831.196581,1,0.0%,
2884.353741,2,0.0%,

0,1
Distinct count,84
Unique (%),0.5%
Missing (%),0.0%
Missing (n),0

0,1
9 - Bernal Heights,1158
5 - Noe Valley,798
10 - Excelsior,778
Other values (81),14884

Value,Count,Frequency (%),Unnamed: 3
9 - Bernal Heights,1158,6.6%,
5 - Noe Valley,798,4.5%,
10 - Excelsior,778,4.4%,
2 - Parkside,722,4.1%,
10 - Bayview,706,4.0%,
2 - Central Sunset,664,3.8%,
2 - Outer Parkside,658,3.7%,
10 - Portola,645,3.7%,
10 - Visitacion Valley,585,3.3%,
2 - Outer Sunset,532,3.0%,

0,1
Correlation,0.99995

0,1
Distinct count,8
Unique (%),0.0%
Missing (%),0.0%
Missing (n),0
Infinite (%),0.0%
Infinite (n),0

0,1
Mean,1.4854
Minimum,0
Maximum,7
Zeros (%),5.8%

0,1
Minimum,0
5-th percentile,0
Q1,1
Median,1
Q3,2
95-th percentile,3
Maximum,7
Range,7
Interquartile range,1

0,1
Standard deviation,0.75382
Coef of variation,0.5075
Kurtosis,2.8047
Mean,1.4854
MAD,0.63281
Skewness,0.80159
Sum,26169
Variance,0.56824
Memory size,137.7 KiB

Value,Count,Frequency (%),Unnamed: 3
1,8333,47.3%,
2,7283,41.3%,
0,1030,5.8%,
3,689,3.9%,
4,231,1.3%,
5,38,0.2%,
6,9,0.1%,
7,5,0.0%,

Value,Count,Frequency (%),Unnamed: 3
0,1030,5.8%,
1,8333,47.3%,
2,7283,41.3%,
3,689,3.9%,
4,231,1.3%,

Value,Count,Frequency (%),Unnamed: 3
3,689,3.9%,
4,231,1.3%,
5,38,0.2%,
6,9,0.1%,
7,5,0.0%,

0,1
Distinct count,3
Unique (%),0.0%
Missing (%),0.0%
Missing (n),0

0,1
0,10687
ONST,6901
LOST,30

Value,Count,Frequency (%),Unnamed: 3
0,10687,60.7%,
ONST,6901,39.2%,
LOST,30,0.2%,

0,1
Distinct count,140
Unique (%),0.8%
Missing (%),0.0%
Missing (n),0

0,1
"ATCH,GARG,ATDR,INAC",2983
GARG,2727
"ATCH,GARG",1796
Other values (137),10112

Value,Count,Frequency (%),Unnamed: 3
"ATCH,GARG,ATDR,INAC",2983,16.9%,
GARG,2727,15.5%,
"ATCH,GARG",1796,10.2%,
0,1738,9.9%,
"ATCH,GARG,INAC",1236,7.0%,
"GARG,INAC",1225,7.0%,
"GARG,ATDR,INAC",1140,6.5%,
"ATCH,GARG,ATDR",1038,5.9%,
ATCH,959,5.4%,
"GARG,ATDR",723,4.1%,

0,1
Distinct count,36
Unique (%),0.2%
Missing (%),0.0%
Missing (n),0
Infinite (%),0.0%
Infinite (n),0

0,1
Mean,6.0781
Minimum,1
Maximum,12
Zeros (%),0.0%

0,1
Minimum,1
5-th percentile,3
Q1,5
Median,6
Q3,7
95-th percentile,9
Maximum,12
Range,11
Interquartile range,2

0,1
Standard deviation,1.7821
Coef of variation,0.2932
Kurtosis,0.31233
Mean,6.0781
MAD,1.3858
Skewness,0.5619
Sum,107080
Variance,3.1758
Memory size,137.7 KiB

Value,Count,Frequency (%),Unnamed: 3
5.0,4269,24.2%,
6.0,3669,20.8%,
7.0,2555,14.5%,
4.0,1658,9.4%,
8.0,1534,8.7%,
3.0,928,5.3%,
9.0,818,4.6%,
10.0,425,2.4%,
5.5,291,1.7%,
7.5,275,1.6%,

Value,Count,Frequency (%),Unnamed: 3
1.0,5,0.0%,
2.0,58,0.3%,
2.5,5,0.0%,
3.0,928,5.3%,
3.25,17,0.1%,

Value,Count,Frequency (%),Unnamed: 3
10.5,42,0.2%,
11.0,188,1.1%,
11.5,5,0.0%,
11.75,1,0.0%,
12.0,88,0.5%,

0,1
Distinct count,2883
Unique (%),16.4%
Missing (%),0.0%
Missing (n),0
Infinite (%),0.0%
Infinite (n),0

0,1
Mean,1177600
Minimum,105000
Maximum,9998000
Zeros (%),0.0%

0,1
Minimum,105000
5-th percentile,403000
Q1,653920
Median,908440
Q3,1401000
95-th percentile,2800000
Maximum,9998000
Range,9893000
Interquartile range,747100

0,1
Standard deviation,886070
Coef of variation,0.75247
Kurtosis,16.659
Mean,1177600
MAD,582950
Skewness,3.198
Sum,20746132960
Variance,785120000000
Memory size,137.7 KiB

Value,Count,Frequency (%),Unnamed: 3
750000,170,1.0%,
1100000,163,0.9%,
850000,154,0.9%,
1200000,151,0.9%,
1250000,150,0.9%,
1300000,149,0.8%,
950000,144,0.8%,
1050000,137,0.8%,
800000,137,0.8%,
650000,133,0.8%,

Value,Count,Frequency (%),Unnamed: 3
105000,1,0.0%,
115000,1,0.0%,
120000,1,0.0%,
130000,2,0.0%,
133000,1,0.0%,

Value,Count,Frequency (%),Unnamed: 3
9700000,1,0.0%,
9708000,1,0.0%,
9750000,2,0.0%,
9950000,1,0.0%,
9998000,1,0.0%,

0,1
Distinct count,2902
Unique (%),16.5%
Missing (%),0.0%
Missing (n),0
Infinite (%),0.0%
Infinite (n),0

0,1
Mean,1744.2
Minimum,240
Maximum,9125
Zeros (%),0.0%

0,1
Minimum,240
5-th percentile,875
Q1,1200
Median,1537
Q3,2070
95-th percentile,3338
Maximum,9125
Range,8885
Interquartile range,870

0,1
Standard deviation,796.89
Coef of variation,0.45688
Kurtosis,5.2344
Mean,1744.2
MAD,586.94
Skewness,1.8087
Sum,30729000
Variance,635030
Memory size,137.7 KiB

Value,Count,Frequency (%),Unnamed: 3
1250.0,239,1.4%,
1200.0,223,1.3%,
1000.0,221,1.3%,
1125.0,179,1.0%,
1500.0,172,1.0%,
1300.0,148,0.8%,
1150.0,141,0.8%,
1400.0,140,0.8%,
1100.0,134,0.8%,
1350.0,132,0.7%,

Value,Count,Frequency (%),Unnamed: 3
240.0,2,0.0%,
266.0,1,0.0%,
360.0,1,0.0%,
375.0,1,0.0%,
380.0,1,0.0%,

Value,Count,Frequency (%),Unnamed: 3
7435.0,1,0.0%,
7615.0,1,0.0%,
8000.0,1,0.0%,
8200.0,1,0.0%,
9125.0,1,0.0%,

0,1
Distinct count,14
Unique (%),0.1%
Missing (%),0.0%
Missing (n),0

0,1
0,7813
1BLK,2810
2BLK,2575
Other values (11),4420

Value,Count,Frequency (%),Unnamed: 3
0,7813,44.3%,
1BLK,2810,15.9%,
2BLK,2575,14.6%,
4BLK,2284,13.0%,
3BLK,2039,11.6%,
"1BLK,2BLK",32,0.2%,
"2BLK,3BLK",18,0.1%,
"3BLK,4BLK",13,0.1%,
"2BLK,4BLK",10,0.1%,
"1BLK,4BLK",9,0.1%,

0,1
Distinct count,16
Unique (%),0.1%
Missing (%),0.0%
Missing (n),0

0,1
1BLK,7394
0,6123
2BLK,2676
Other values (13),1425

Value,Count,Frequency (%),Unnamed: 3
1BLK,7394,42.0%,
0,6123,34.8%,
2BLK,2676,15.2%,
3BLK,931,5.3%,
4BLK,345,2.0%,
"1BLK,2BLK",71,0.4%,
"2BLK,3BLK",15,0.1%,
"1BLK,3BLK",14,0.1%,
"3BLK,4BLK",12,0.1%,
"1BLK,4BLK",12,0.1%,

0,1
Distinct count,242
Unique (%),1.4%
Missing (%),0.0%
Missing (n),0

0,1
0,3942
2STR,2647
3STR,1191
Other values (239),9838

Value,Count,Frequency (%),Unnamed: 3
0,3942,22.4%,
2STR,2647,15.0%,
3STR,1191,6.8%,
FULL,1138,6.5%,
"DETC,2STR",862,4.9%,
"FULL,2STR",713,4.0%,
"ATAC,2STR",683,3.9%,
DETC,616,3.5%,
ATAC,487,2.8%,
"DETC,3STR",369,2.1%,

0,1
Distinct count,1882
Unique (%),10.7%
Missing (%),0.0%
Missing (n),0

0,1
0,8165
CTYL,1216
OCEN,457
Other values (1879),7780

Value,Count,Frequency (%),Unnamed: 3
0,8165,46.3%,
CTYL,1216,6.9%,
OCEN,457,2.6%,
GRDN,447,2.5%,
PRTL,434,2.5%,
"PRTL,OCEN",228,1.3%,
HILL,225,1.3%,
"PRTL,CTYL",177,1.0%,
"CTYL,HILL",177,1.0%,
PNRM,139,0.8%,

0,1
Distinct count,153
Unique (%),0.9%
Missing (%),0.0%
Missing (n),0
Infinite (%),0.0%
Infinite (n),0

0,1
Mean,1852.5
Minimum,0
Maximum,2017
Zeros (%),4.2%

0,1
Minimum,0
5-th percentile,1890
Q1,1914
Median,1929
Q3,1947
95-th percentile,1977
Maximum,2017
Range,2017
Interquartile range,33

0,1
Standard deviation,389.18
Coef of variation,0.21009
Kurtosis,18.632
Mean,1852.5
MAD,156.13
Skewness,-4.5322
Sum,32636937
Variance,151460
Memory size,137.7 KiB

Value,Count,Frequency (%),Unnamed: 3
1900,1045,5.9%,
0,742,4.2%,
1940,625,3.5%,
1941,600,3.4%,
1925,598,3.4%,
1924,543,3.1%,
1926,516,2.9%,
1927,489,2.8%,
1947,415,2.4%,
1939,408,2.3%,

Value,Count,Frequency (%),Unnamed: 3
0,742,4.2%,
1026,1,0.0%,
1852,1,0.0%,
1855,1,0.0%,
1865,1,0.0%,

Value,Count,Frequency (%),Unnamed: 3
2013,17,0.1%,
2014,19,0.1%,
2015,14,0.1%,
2016,22,0.1%,
2017,10,0.1%,

0,1
Distinct count,9
Unique (%),0.1%
Missing (%),0.0%
Missing (n),0
Infinite (%),0.0%
Infinite (n),0

0,1
Mean,2013
Minimum,2009
Maximum,2017
Zeros (%),0.0%

0,1
Minimum,2009
5-th percentile,2009
Q1,2011
Median,2013
Q3,2015
95-th percentile,2017
Maximum,2017
Range,8
Interquartile range,4

0,1
Standard deviation,2.5313
Coef of variation,0.0012575
Kurtosis,-1.1755
Mean,2013
MAD,2.1591
Skewness,0.022413
Sum,35465164
Variance,6.4074
Memory size,137.7 KiB

Value,Count,Frequency (%),Unnamed: 3
2012,2162,12.3%,
2013,2150,12.2%,
2011,2025,11.5%,
2017,1932,11.0%,
2014,1923,10.9%,
2015,1891,10.7%,
2010,1882,10.7%,
2016,1881,10.7%,
2009,1772,10.1%,

Value,Count,Frequency (%),Unnamed: 3
2009,1772,10.1%,
2010,1882,10.7%,
2011,2025,11.5%,
2012,2162,12.3%,
2013,2150,12.2%,

Value,Count,Frequency (%),Unnamed: 3
2013,2150,12.2%,
2014,1923,10.9%,
2015,1891,10.7%,
2016,1881,10.7%,
2017,1932,11.0%,

0,1
Distinct count,56
Unique (%),0.3%
Missing (%),0.0%
Missing (n),0
Infinite (%),0.0%
Infinite (n),0

0,1
Mean,94121
Minimum,92124
Maximum,97124
Zeros (%),0.0%

0,1
Minimum,92124
5-th percentile,94110
Q1,94112
Median,94121
Q3,94127
95-th percentile,94134
Maximum,97124
Range,5000
Interquartile range,15

0,1
Standard deviation,44.181
Coef of variation,0.00046941
Kurtosis,2978
Mean,94121
MAD,8.0368
Skewness,37.032
Sum,1658218886
Variance,1952
Memory size,137.7 KiB

Value,Count,Frequency (%),Unnamed: 3
94112,2915,16.5%,
94116,1952,11.1%,
94122,1603,9.1%,
94127,1433,8.1%,
94110,1332,7.6%,
94134,1289,7.3%,
94131,1281,7.3%,
94124,1177,6.7%,
94114,924,5.2%,
94121,923,5.2%,

Value,Count,Frequency (%),Unnamed: 3
92124,1,0.0%,
93131,1,0.0%,
93132,1,0.0%,
93551,1,0.0%,
94010,2,0.0%,

Value,Count,Frequency (%),Unnamed: 3
95116,1,0.0%,
95411,1,0.0%,
96116,1,0.0%,
97114,1,0.0%,
97124,1,0.0%,

0,1
Distinct count,17
Unique (%),0.1%
Missing (%),46.6%
Missing (n),8209

0,1
RH1,5614
RH2,2032
RH1D,1248
Other values (13),515
(Missing),8209

Value,Count,Frequency (%),Unnamed: 3
RH1,5614,31.9%,
RH2,2032,11.5%,
RH1D,1248,7.1%,
RH3,251,1.4%,
RM1,128,0.7%,
OTHR,40,0.2%,
RH1S,23,0.1%,
RM2,17,0.1%,
NC2,14,0.1%,
NC3,13,0.1%,

Unnamed: 0,longitude,latitude,elevation,full_address,zip,district_no,district_desc,nid,neighborhood,sale_price,rooms,baths,beds,sf,lot_sf,year_built,zoning,lot_desc,drive_side,parking,park_leased,num_parking,shopping,transportation,type,views,year_sold,mean_hood_ppsf
0,-122.50965,37.78028,200.83,"2645 El Camino Del Mar, San Francisco, CA 94121",94121,1,SF District 1,1050,1 - Outer Richmond,1260000,8.0,3.5,4,2691.0,,1969,RH2,"RGLR,FNCD","PVDW,PVSW","ATCH,GARG",0,2,4BLK,1BLK,3STR,"PNRM,OCEN,PARK,GRDN",2013,682.521987
1,-122.50929,37.762608,23.21,"1278 La Playa St, San Francisco, CA 94122",94122,2,SF District 2,2030,2 - Outer Sunset,1075000,9.0,3.0,4,2437.0,3000.0,1947,RM1,RGLR,0,"ATCH,GARG,ATDR,INAC",ONST,1,2BLK,1BLK,0,0,2016,675.030776
2,-122.50929,37.762608,23.21,"1278 La Playa St, San Francisco, CA 94122",94122,2,SF District 2,2030,2 - Outer Sunset,1525000,9.0,5.0,5,2597.0,3000.0,1947,RM1,RGLR,0,"ATCH,GARG,ATDR,INAC",ONST,1,2BLK,1BLK,0,0,2017,606.488256
3,-122.50924,37.77733,189.11,"590 48th Ave, San Francisco, CA 94121",94121,1,SF District 1,1050,1 - Outer Richmond,715000,5.0,1.0,2,1312.0,,1939,RH1,RGLR,"PVDW,PVSW",GARG,0,2,3BLK,1BLK,"ATAC,2STR,FIXR","WATR,OCEN,PARK",2010,978.150406
4,-122.50919,37.776695,175.89,"618 48th Ave, San Francisco, CA 94121",94121,1,SF District 1,1050,1 - Outer Richmond,1595000,8.0,4.0,4,3307.0,,1951,,RGLR,PVDW,"ATCH,GARG,ATDR,INAC",0,3,4BLK,2BLK,3STR,"PNRM,CTYL,OCEN,PARK",2012,869.972785


In [220]:
# Train
cutoff = 2017
temp=train.copy()
train = temp[temp['year_sold'] < 2017]
val  = temp[temp['year_sold'] >= 2017]
print(train.shape, val.shape, test.shape)

(15686, 28) (1932, 28) (1879, 28)


In [221]:
# Encode and fit a linear regression model

target = 'sale_price'

features = train.columns.drop(target)
X_train = train[features]
y_train = train[target]
X_val = val[features]
y_val = val[target]
X_test = test[features]
y_test = test[target]

pipeline = make_pipeline(
  ce.OrdinalEncoder(),
  SimpleImputer(strategy='mean'), 
  LinearRegression()
)

pipeline.fit(X_train, y_train)
y_pred = pipeline.predict(X_val)

# Print regression metrics for validation 
val_mse = mean_squared_error(y_val, y_pred)
val_rmse = np.sqrt(val_mse)
val_mae = mean_absolute_error(y_val, y_pred)
val_r2 = r2_score(y_val, y_pred)
print('Validation Mean Absolute Error:', val_mae)
print('Validation R^2:', val_r2)
print('\n')

ty_pred = pipeline.predict(X_test)

# Print regression metrics for test
test_mse = mean_squared_error(y_test, ty_pred)
test_rmse = np.sqrt(test_mse)
test_mae = mean_absolute_error(y_test, ty_pred)
test_r2 = r2_score(y_test, ty_pred)
print('Test Mean Absolute Error:', test_mae)
print('Test R^2:', test_r2)
print('\n')

Validation Mean Absolute Error: 255212.31592534177
Validation R^2: 0.8027615072982429


Test Mean Absolute Error: 273451.47748818365
Test R^2: 0.7840406477031912




In [0]:
# # Number of trees in random forest
# n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]
# # Number of features to consider at every split
# max_features = ['auto', 'sqrt']
# # Maximum number of levels in tree
# max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
# max_depth.append(None)
# # Minimum number of samples required to split a node
# min_samples_split = [2, 5, 10]
# # Minimum number of samples required at each leaf node
# min_samples_leaf = [1, 2, 4]
# # Method of selecting samples for training each tree
# bootstrap = [True, False]
# # Create the random grid
# random_grid = {'n_estimators': n_estimators,
#                'max_features': max_features,
#                'max_depth': max_depth,
#                'min_samples_split': min_samples_split,
#                'min_samples_leaf': min_samples_leaf,
#                'bootstrap': bootstrap}
# print(random_grid)
  
# pipeline = make_pipeline (
#   ce.OrdinalEncoder(), 
#   SimpleImputer(strategy='mean'),
#   RandomizedSearchCV(estimator = RandomForestRegressor(),
#                      param_distributions = random_grid,
#                      n_iter = 100,
#                      verbose=2,
#                      random_state=42,
#                      n_jobs = -1)
#   )

# pipeline.fit(X_train, y_train)

# pd.set_option('display.max_rows', 200)
# model = pipeline.named_steps['randomizedsearchcv']
# best = pd.Series(model.best_params_)
# print(best)

In [223]:
X_train.head()

Unnamed: 0,longitude,latitude,elevation,full_address,zip,district_no,district_desc,nid,neighborhood,rooms,baths,beds,sf,lot_sf,year_built,zoning,lot_desc,drive_side,parking,park_leased,num_parking,shopping,transportation,type,views,year_sold,mean_hood_ppsf
0,-122.50965,37.78028,200.83,"2645 El Camino Del Mar, San Francisco, CA 94121",94121,1,SF District 1,1050,1 - Outer Richmond,8.0,3.5,4,2691.0,,1969,RH2,"RGLR,FNCD","PVDW,PVSW","ATCH,GARG",0,2,4BLK,1BLK,3STR,"PNRM,OCEN,PARK,GRDN",2013,682.521987
1,-122.50929,37.762608,23.21,"1278 La Playa St, San Francisco, CA 94122",94122,2,SF District 2,2030,2 - Outer Sunset,9.0,3.0,4,2437.0,3000.0,1947,RM1,RGLR,0,"ATCH,GARG,ATDR,INAC",ONST,1,2BLK,1BLK,0,0,2016,675.030776
3,-122.50924,37.77733,189.11,"590 48th Ave, San Francisco, CA 94121",94121,1,SF District 1,1050,1 - Outer Richmond,5.0,1.0,2,1312.0,,1939,RH1,RGLR,"PVDW,PVSW",GARG,0,2,3BLK,1BLK,"ATAC,2STR,FIXR","WATR,OCEN,PARK",2010,978.150406
4,-122.50919,37.776695,175.89,"618 48th Ave, San Francisco, CA 94121",94121,1,SF District 1,1050,1 - Outer Richmond,8.0,4.0,4,3307.0,,1951,,RGLR,PVDW,"ATCH,GARG,ATDR,INAC",0,3,4BLK,2BLK,3STR,"PNRM,CTYL,OCEN,PARK",2012,869.972785
5,-122.509186,37.761005,23.69,"1362 La Playa St, San Francisco, CA 94122-1019",94122,2,SF District 2,2030,2 - Outer Sunset,4.0,1.0,3,1168.0,,1947,,"RGLR,LEVL",PVDW,"ATCH,GARG,ATDR",0,2,0,1BLK,ATAC,0,2009,866.438356


In [224]:
final = test.copy()
final['prediction'] = ty_pred
final['difference'] = final['sale_price'] - final['prediction']
final['prediction'] = final['prediction'].astype(int)
final['difference'] = final['difference'].astype(int)
final.head()

Unnamed: 0,longitude,latitude,elevation,full_address,zip,district_no,district_desc,nid,neighborhood,sale_price,rooms,baths,beds,sf,lot_sf,year_built,zoning,lot_desc,drive_side,parking,park_leased,num_parking,shopping,transportation,type,views,year_sold,mean_hood_ppsf,prediction,difference
0,-122.50919,37.776695,175.89,"618 48th Ave, San Francisco, CA 94121-2410",94121,1,SF District 1,1050,1 - Outer Richmond,3310000,8.0,4.0,4,3735.0,3959.0,1951,,0,0,"ATCH,GARG,ATDR,INAC",ONST,3,3BLK,1BLK,2STR,"PNRM,CTYL,WATR,SFRN,OCEN,PARK,GRDN,TWNP",2018,887.99643,3321248,-11248
2,-122.509056,37.75801,23.01,"1516 Great Highway, San Francisco, CA 94122",94122,2,SF District 2,2030,2 - Outer Sunset,1830000,7.0,3.0,4,2120.0,3072.0,1948,RM1,"RGLR,OCNF",0,GARG,ONST,1,2BLK,2BLK,FULL,GRDN,2018,796.383648,1845460,-15460
4,-122.50902,37.758636,22.7,"1492 La Playa St, San Francisco, CA 94122",94122,2,SF District 2,2030,2 - Outer Sunset,1300000,3.0,1.0,2,1087.0,1245.0,1944,RM1,0,"PVDW,PVSW",ATCH,ONST,1,1BLK,1BLK,0,0,2018,1232.75069,1388961,-88961
5,-122.50898,37.77368,40.08,"784 48th Ave, San Francisco, CA 94121",94121,1,SF District 1,1050,1 - Outer Richmond,1300000,6.0,1.5,2,1326.0,2696.0,1942,RH2,"RGLR,LEVL,FNCD","PVDW,PVSW",GARG,ONST,2,1BLK,1BLK,"JR,2STR",0,2018,1045.751634,1535014,-235014
7,-122.50827,37.778465,225.66,"525 47th Ave, San Francisco, CA 94121",94121,1,SF District 1,1050,1 - Outer Richmond,1865000,6.0,2.0,3,1800.0,3000.0,1931,RH1,RGLR,"PVDW,PVSW","ATCH,GARG,ATDR,INAC",ONST,2,0,1BLK,"FULL,2STR",PRTL,2018,904.62963,1806484,58515


In [225]:
final.shape

(1879, 30)

In [0]:
# Calculate the agent_pred using the mean neighborhood price/sf * square footage of the house
def agent_calc(nhood, sqft):
  a_pred = nhood_dict[nhood] * sqft
  return a_pred

final['pred_percent'] = final['difference']/final['prediction']

In [227]:
final.head()

Unnamed: 0,longitude,latitude,elevation,full_address,zip,district_no,district_desc,nid,neighborhood,sale_price,rooms,baths,beds,sf,lot_sf,year_built,zoning,lot_desc,drive_side,parking,park_leased,num_parking,shopping,transportation,type,views,year_sold,mean_hood_ppsf,prediction,difference,pred_percent
0,-122.50919,37.776695,175.89,"618 48th Ave, San Francisco, CA 94121-2410",94121,1,SF District 1,1050,1 - Outer Richmond,3310000,8.0,4.0,4,3735.0,3959.0,1951,,0,0,"ATCH,GARG,ATDR,INAC",ONST,3,3BLK,1BLK,2STR,"PNRM,CTYL,WATR,SFRN,OCEN,PARK,GRDN,TWNP",2018,887.99643,3321248,-11248,-0.003387
2,-122.509056,37.75801,23.01,"1516 Great Highway, San Francisco, CA 94122",94122,2,SF District 2,2030,2 - Outer Sunset,1830000,7.0,3.0,4,2120.0,3072.0,1948,RM1,"RGLR,OCNF",0,GARG,ONST,1,2BLK,2BLK,FULL,GRDN,2018,796.383648,1845460,-15460,-0.008377
4,-122.50902,37.758636,22.7,"1492 La Playa St, San Francisco, CA 94122",94122,2,SF District 2,2030,2 - Outer Sunset,1300000,3.0,1.0,2,1087.0,1245.0,1944,RM1,0,"PVDW,PVSW",ATCH,ONST,1,1BLK,1BLK,0,0,2018,1232.75069,1388961,-88961,-0.064049
5,-122.50898,37.77368,40.08,"784 48th Ave, San Francisco, CA 94121",94121,1,SF District 1,1050,1 - Outer Richmond,1300000,6.0,1.5,2,1326.0,2696.0,1942,RH2,"RGLR,LEVL,FNCD","PVDW,PVSW",GARG,ONST,2,1BLK,1BLK,"JR,2STR",0,2018,1045.751634,1535014,-235014,-0.153102
7,-122.50827,37.778465,225.66,"525 47th Ave, San Francisco, CA 94121",94121,1,SF District 1,1050,1 - Outer Richmond,1865000,6.0,2.0,3,1800.0,3000.0,1931,RH1,RGLR,"PVDW,PVSW","ATCH,GARG,ATDR,INAC",ONST,2,0,1BLK,"FULL,2STR",PRTL,2018,904.62963,1806484,58515,0.032392


In [228]:
pred_five_percent = (final['pred_percent'][(final['pred_percent'] >= -.05) & (final['pred_percent'] <= .05)].count())/final.shape[0]
print('Prediction Error Within 5% -', pred_five_percent)

Prediction Error Within 5% - 0.2368281000532198


In [0]:
# mae = np.sum(np.absolute((final['agent_diff'])))/final['agent_diff'].count()
# print(mae)

In [0]:
# data = final[['full_address', 'neighborhood', 'nid', 'sale_price', 'prediction', 'difference']]
# data = data.drop_duplicates()

# data.head()

In [0]:
# data.shape

In [0]:
# data.to_csv('/content/display_data.csv')