In [1]:
# imports:

import pandas as pd
import numpy as np
import os

import matplotlib.pyplot as plt
import matplotlib.ticker as mtick
import seaborn as sns


from sklearn.linear_model import LinearRegression, LassoLars, TweedieRegressor
from sklearn.metrics import explained_variance_score, mean_squared_error
from sklearn.preprocessing import PolynomialFeatures
from sklearn.ensemble import RandomForestRegressor

from xgboost import XGBRegressor


from prepare import zillow_pipeline
from model import split_scale_tvt, xy_split, eval_model, train_model
from visuals import plot_value_distribution, area_vs_value_plt, age_vs_value_plt, bedr_vs_value_plt, bathr_vs_value_plt, county_vs_value_plt

In [2]:
df = zillow_pipeline()
df.head()

Unnamed: 0,bedrooms,bathrooms,area,value,year,county,state
0,4,3.5,3100,1023282,1998,Orange,CA
1,2,1.0,1465,464000,1967,Ventura,CA
2,3,2.0,1243,564778,1962,Orange,CA
3,4,3.0,2376,145143,1970,Los Angeles,CA
4,4,3.0,2962,773303,1950,Los Angeles,CA


In [3]:
df.drop(columns=['county','state'], inplace=True)
df.head()

Unnamed: 0,bedrooms,bathrooms,area,value,year
0,4,3.5,3100,1023282,1998
1,2,1.0,1465,464000,1967
2,3,2.0,1243,564778,1962
3,4,3.0,2376,145143,1970
4,4,3.0,2962,773303,1950


In [4]:
train, val, test = split_scale_tvt(df)
train.head()

Unnamed: 0,bedrooms,bathrooms,area,value,year
20022,0.153846,0.058824,0.074976,169189,0.5
22947,0.307692,0.147059,0.171359,1193666,0.666667
18000,0.153846,0.0,0.043402,35983,0.355072
10496,0.153846,0.088235,0.099951,657564,0.775362
41940,0.230769,0.176471,0.113001,1096680,0.789855


In [5]:
X_train, y_train = xy_split(train)
X_val, y_val = xy_split(val)
X_train.shape, X_val.shape

((36572, 4), (7837, 4))

In [6]:
X_train = pd.get_dummies(X_train)
X_val = pd.get_dummies(X_val)
X_train.shape, X_val.shape

((36572, 4), (7837, 4))

In [7]:
# X_train = pd.get_dummies(X_train, columns=['county'])
# X_val = pd.get_dummies(X_val, columns=['county'])
# X_train.shape, X_val.shape

In [8]:
baselines = pd.DataFrame({'y_actual': y_train,
                          'y_mean': y_train.mean(),
                          'y_median': y_train.median()})

baselines.head()

Unnamed: 0,y_actual,y_mean,y_median
20022,169189,527735.890627,372957.0
22947,1193666,527735.890627,372957.0
18000,35983,527735.890627,372957.0
10496,657564,527735.890627,372957.0
41940,1096680,527735.890627,372957.0


In [9]:
eval_model(baselines.y_actual, baselines.y_mean)

755431.5736908859

In [10]:
eval_model(baselines.y_actual, baselines.y_median)

771124.7418626399

<div class="alert alert-block alert-info">

Models without Polynomial Features


In [12]:
lm = LinearRegression()
train_model(lm, X_train, y_train, X_val, y_val)

The train RMSE is 586093.7900878498.
The validate RMSE is 559798.7087501992.


In [13]:
ll = LassoLars(alpha=0)

train_model(ll, X_train, y_train, X_val, y_val)

The train RMSE is 586093.7900878498.
The validate RMSE is 559798.7087501992.


In [17]:
tweedie = TweedieRegressor()

train_model(tweedie, X_train, y_train, X_val, y_val)

The train RMSE is 753183.7961624659.
The validate RMSE is 738133.8240400734.


In [18]:
rf = RandomForestRegressor()

train_model(rf, X_train, y_train, X_val, y_val)

The train RMSE is 238347.52961674868.
The validate RMSE is 567826.8022186869.


In [19]:
xgbr = XGBRegressor()

train_model(xgbr, X_train, y_train, X_val, y_val)

The train RMSE is 359555.25092336204.
The validate RMSE is 561749.0046468886.


<div class="alert alert-block alert-info">

Models WITH Polynomial Features


In [20]:
poly = PolynomialFeatures()
X_train_s = poly.fit_transform(X_train)
X_val_s = poly.transform(X_val)

In [21]:
len(X_train_s[0])

15

In [25]:
lm = LinearRegression()
train_model(lm, X_train_s, y_train, X_val_s, y_val)

The train RMSE is 535895.2478522771.
The validate RMSE is 546198.7157796688.


In [None]:
ll = LassoLars(alpha=0)

train_model(ll, X_train_s, y_train, X_val_s, y_val)

The train RMSE is 586093.7900878498.
The validate RMSE is 559798.7087501992.


In [26]:
tweedie = TweedieRegressor()

train_model(tweedie, X_train_s, y_train, X_val_s, y_val)

The train RMSE is 750686.6592923255.
The validate RMSE is 735648.4259873163.


In [27]:
rf = RandomForestRegressor()

train_model(rf, X_train_s, y_train, X_val_s, y_val)

The train RMSE is 232202.8919234464.
The validate RMSE is 568751.1956605418.


In [28]:
xgbr = XGBRegressor()

train_model(xgbr, X_train_s, y_train, X_val_s, y_val)

The train RMSE is 329720.01898907626.
The validate RMSE is 619332.7320340935.
