In [1]:
%load_ext autoreload
%autoreload 2

import warnings
warnings.filterwarnings("ignore")

import pandas as pd
import numpy as np
from scipy import stats

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from scipy.stats import pearsonr, spearmanr

import itertools
import env
import wrangle
import acquire
import prepare
import split
import exploration as exp
import modeling as md

## Wrangle

In [41]:
zillow, cats, quants  = wrangle.wrangle_zillow()

df = zillow.drop(columns='taxamount')
print(df)
target_var = 'taxvaluedollarcnt'
df.latitude = df.latitude / 1000000
df.longitude = df.longitude /1000000

       parcelid  bedroomcnt  bathroomcnt  sqr_ft  taxvaluedollarcnt  \
0      14297519         4.0          3.5  3100.0          1023282.0   
1      17052889         2.0          1.0  1465.0           464000.0   
2      14186244         3.0          2.0  1243.0           564778.0   
3      12177905         4.0          3.0  2376.0           145143.0   
4      12095076         4.0          3.0  2962.0           773303.0   
...         ...         ...          ...     ...                ...   
52437  11000655         2.0          2.0  1286.0           354621.0   
52438  17239384         4.0          2.0  1612.0            67205.0   
52439  12773139         3.0          1.0  1032.0            49546.0   
52440  12826780         3.0          2.0  1762.0           522000.0   
52441  13083743         3.0          2.0  1050.0           259334.0   

       yearbuilt         county    latitude    longitude  
0         1998.0  orange_county  33634931.0 -117869207.0  
1         1967.0        ventu

## Overall Model

In [39]:
all_counties = md.LRM(df, target_var, county='all')
#baseline
print()
print(f'Baseline train:    ${round(round(all_counties.rmse_train_mean_bl,2), -3)}          Train BL normalized:{all_counties.normalized_rmse_train}')
print(f'Baseline validate: ${round(round(all_counties.rmse_val_mean_bl,2), -3)}       Validate BL normalized:{all_counties.normalized_rmse_val}')

all_counties.x_train_scaled.head()


Baseline train:    $414000.0          Train BL normalized:0.89
Baseline validate: $417000.0       Validate BL normalized:0.89


Unnamed: 0,scaled_bedroomcnt,scaled_bathroomcnt,scaled_sqr_ft,scaled_yearbuilt,dist_from_la,dist_from_long_beach,dist_santa_monica,dist_from_malibu,dist_from_bel_air,dist_balboa_island,dist_laguna_beach,dist_seal_beach,dist_simi,dist_ventura,dist_ojai,dist_eleanor,scaled_OC,scaled_ventura
0,0.0,0.0,-0.032258,-0.142857,0.660602,-0.303763,0.63906,0.662262,0.748439,-0.924861,-0.848921,-0.537515,0.798256,0.739388,0.866301,0.702908,1.0,0.0
1,1.0,1.0,1.22379,0.642857,1.980096,2.281499,0.894339,-0.208093,0.901017,1.719159,1.669648,2.22251,-0.487593,-2.33026,-1.906196,-0.642817,0.0,1.0
2,0.0,-1.0,-0.591734,-0.392857,0.483456,-0.341801,0.547788,0.61306,0.632216,-0.760312,-0.73903,-0.568501,0.711347,0.687555,0.793307,0.641547,1.0,0.0
3,1.0,1.0,0.876008,0.928571,0.01361,0.75218,-0.262595,-0.395752,-0.463421,0.719806,0.707754,0.795377,-0.764877,-0.672956,-0.7192,-0.59546,0.0,0.0
4,0.0,1.0,0.987903,0.857143,0.06535,0.041012,0.365409,0.505612,0.327095,-0.140311,-0.179081,-0.08159,0.422241,0.496224,0.516489,0.460756,0.0,0.0


In [29]:
# default
all_counties.lassolars_regression(use_rfe_features=True)
all_counties.loop_OLS_regression()
all_counties.tweedie(use_rfe_features=True)
all_counties.poly_regression()
all_counties.poly_regression(degree=3)
print()




In [30]:
# all models up to this point
all_counties.all_models_df()

Unnamed: 0,model_name,county,rmse_train,rmse_validate,power,alpha,percent_diff,norm_rmse_train,norm_rmse_validate
0,OLS,all,341414.615246,343192.507381,,,-0.52,0.7339,0.7339
1,OLS,all,340809.554561,342725.620251,,,-0.56,0.7326,0.7329
2,OLS,all,335228.839273,337916.572164,,,-0.8,0.7206,0.7226
3,OLS,all,333507.977989,335858.526955,,,-0.7,0.7169,0.7182
4,OLS,all,332685.698938,334779.195126,,,-0.63,0.7151,0.7159
5,OLS,all,331368.812068,333101.884785,,,-0.52,0.7123,0.7123
6,OLS,all,331079.130197,332744.887036,,,-0.5,0.7116,0.7116
7,OLS,all,322524.775662,324290.090123,,,-0.55,0.6933,0.6935
8,OLS,all,322125.676812,323721.090889,,,-0.5,0.6924,0.6923
9,OLS,all,319578.460539,320550.659901,,,-0.3,0.6869,0.6855


## LA County

In [6]:
los_angeles = df[df.county=='los_angeles']
los_angeles = md.LRM(los_angeles, target_var, county='la')
print()
print(f'Baseline train: {los_angeles.rmse_train_mean_bl}             Train BL normalized:{los_angeles.normalized_rmse_train}')
print(f'Baseline validate: {los_angeles.rmse_val_mean_bl}       Validate BL normalized:{los_angeles.normalized_rmse_val}')


Baseline train: 417161.1024398403             Train BL normalized:0.97
Baseline validate: 405392.9903419076       Validate BL normalized:0.96


In [7]:
los_angeles.lassolars_regression(use_rfe_features=True)
los_angeles.loop_OLS_regression()
los_angeles.tweedie(use_rfe_features=True)
los_angeles.poly_regression()
los_angeles.poly_regression(degree=3)
print()




In [8]:
los_angeles.all_models_df()

Unnamed: 0,model_name,county,rmse_train,rmse_validate,power,alpha,percent_diff,norm_rmse_train,norm_rmse_validate
0,OLS,la,356740.281718,352140.844123,,,1.29,0.8298,0.8315
1,OLS,la,356331.687845,351657.948043,,,1.31,0.8288,0.8304
2,OLS,la,331993.373524,329364.998672,,,0.79,0.7722,0.7778
3,OLS,la,331362.318758,328353.331785,,,0.91,0.7707,0.7754
4,OLS,la,330383.294533,327929.17632,,,0.74,0.7685,0.7744
5,OLS,la,327910.397459,325284.700271,,,0.8,0.7627,0.7681
6,OLS,la,325576.173863,323928.841556,,,0.51,0.7573,0.7649
7,OLS,la,325459.379328,323991.545901,,,0.45,0.757,0.7651
8,OLS,la,325417.293233,324046.401143,,,0.42,0.7569,0.7652
9,lasso_lars,la,325428.813368,323975.868225,,1.0,0.45,0.7569,0.765


## Orange County

In [36]:
# Initialize subset and create LRM object with baseline metrics
orange_county = df[df.county=='orange_county']
orange_county = md.LRM(orange_county, target_var, county='oc')
print()
print(f'Baseline train RMSE:    ${round(orange_county.rmse_train_mean_bl,2)}          Train BL normalized:{orange_county.normalized_rmse_train}')
print(f'Baseline validate RMSE: ${round(orange_county.rmse_val_mean_bl, 2)}       Validate BL normalized:{orange_county.normalized_rmse_val}')

orange_county.x_train_scaled.head()


Baseline train RMSE:    $419338.44          Train BL normalized:0.76
Baseline validate RMSE: $433694.14       Validate BL normalized:0.79


Unnamed: 0,scaled_bedroomcnt,scaled_bathroomcnt,scaled_sqr_ft,scaled_yearbuilt,dist_balboa_island,dist_laguna_beach,dist_seal_beach
0,-1.0,0.0,0.854404,0.68,0.541502,-0.442197,1.137063
1,-1.0,-1.5,-0.431766,-0.84,1.262835,1.147149,-0.027133
2,0.0,-0.5,-0.565952,-0.28,-1.453404,-0.381292,-0.2884
3,0.0,-1.0,-0.649019,-0.72,-0.257209,0.266236,-0.422907
4,0.0,0.5,0.745778,0.4,0.197193,-0.646522,0.964482


In [10]:
# different regression models
orange_county.lassolars_regression(use_rfe_features=True)
orange_county.loop_OLS_regression()
orange_county.tweedie(use_rfe_features=True)
orange_county.poly_regression()
orange_county.poly_regression(degree=3)
print()




In [17]:
# models using only orange county
orange_county.all_models_df().head()

Unnamed: 0,model_name,county,rmse_train,rmse_validate,power,alpha,percent_diff,norm_rmse_train,norm_rmse_validate
0,OLS,oc,330923.388749,332958.519923,,,-0.61,0.6032,0.6081
1,OLS,oc,329745.558228,330888.964901,,,-0.35,0.6011,0.6043
2,OLS,oc,327382.635448,327098.152545,,,0.09,0.5968,0.5974
3,OLS,oc,318991.328587,319195.929946,,,-0.06,0.5815,0.583
4,OLS,oc,315685.247651,316186.75796,,,-0.16,0.5755,0.5775


In [12]:
orange_county.x_train_scaled.head()

Unnamed: 0,scaled_bedroomcnt,scaled_bathroomcnt,scaled_sqr_ft,scaled_yearbuilt,dist_balboa_island,dist_laguna_beach,dist_seal_beach
0,-1.0,0.0,0.854404,0.68,0.541502,-0.442197,1.137063
1,-1.0,-1.5,-0.431766,-0.84,1.262835,1.147149,-0.027133
2,0.0,-0.5,-0.565952,-0.28,-1.453404,-0.381292,-0.2884
3,0.0,-1.0,-0.649019,-0.72,-0.257209,0.266236,-0.422907
4,0.0,0.5,0.745778,0.4,0.197193,-0.646522,0.964482


## Ventura

In [25]:
ventura = df[df.county=='ventura']
ventura = md.LRM(ventura, target_var, county='vent')
print()
print(f'Baseline train: {ventura.rmse_train_mean_bl}              Train BL normalized:{ventura.normalized_rmse_train}')
print(f'Baseline validate: {ventura.rmse_val_mean_bl}       Validate BL normalized:{ventura.normalized_rmse_val}')


Baseline train: 313827.1058231729              Train BL normalized:0.65
Baseline validate: 316290.20891438174       Validate BL normalized:0.65


In [26]:
ventura.lassolars_regression(use_rfe_features=True)
ventura.loop_OLS_regression()
ventura.tweedie(use_rfe_features=True)
ventura.poly_regression()
ventura.poly_regression(degree=3)
print()




In [27]:
ventura.all_models_df()

Unnamed: 0,model_name,county,rmse_train,rmse_validate,power,alpha,percent_diff,norm_rmse_train,norm_rmse_validate
0,OLS,vent,224110.923191,217038.794345,,,3.16,0.4621,0.448
1,OLS,vent,223967.928742,217283.686122,,,2.98,0.4619,0.4485
2,OLS,vent,222359.059048,215953.737139,,,2.88,0.4585,0.4458
3,OLS,vent,218839.528736,211978.767899,,,3.14,0.4513,0.4375
4,OLS,vent,218590.360007,212315.884416,,,2.87,0.4508,0.4382
5,OLS,vent,218434.561228,212290.905025,,,2.81,0.4504,0.4382
6,OLS,vent,218425.732652,212435.947924,,,2.74,0.4504,0.4385
7,OLS,vent,215136.010414,209131.082365,,,2.79,0.4436,0.4317
8,lasso_lars,vent,215136.508805,209143.14639,,1.0,2.79,0.4436,0.4317
9,tweedie,vent,207248.117746,199819.60063,1.0,0.0,3.58,0.4274,0.4125
