In [1]:
import pandas as pd
import numpy as np
from scipy import stats
from scipy.stats import normaltest, kstest, norm
import matplotlib.pyplot as plt
import seaborn as sns
import statistics

#Sklearn
from sklearn.preprocessing import MinMaxScaler, RobustScaler, StandardScaler, QuantileTransformer
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression, LassoLars, TweedieRegressor
from sklearn.metrics import mean_squared_error

from acquire import wrangle_zillow, train_validate, check_p_val, scale_zillow, find_baseline, train_val_test

In [2]:
#Sets options to show more stuff
pd.options.display.max_columns = None
pd.options.display.width = 100
pd.options.display.max_colwidth = None
pd.options.display.max_rows = 200

In [3]:
random_seed = 1969
alpha = 0.05
target_col = 'tax_value'

## Modeling <a class="anchor" id="modeling"></a>

### Introduction <a class="anchor" id="m_intro"></a>

Splitting the data prior to creating the dummy columns resulted in an inconsistant number of columns. This notebook was created as a result. Final solution was to create dummy columns from the origional data and reassign the 'train', 'val' and 'test' DataFrames with the new data.

### Preparing the data for modeling

In [4]:
df = wrangle_zillow()

In [5]:
df = df.drop(columns='transactiondate')

#### Creating dummy columns for catagorical data

In [6]:
#Create dummy columns of my categorical features
df = pd.get_dummies(columns=['bedrooms', 'bathrooms'], data=df)
#train.head()

In [7]:
train, val, test = train_validate(df)

#### Scaling the columns with continuous data

In [8]:
train, val, test = scale_zillow(train, val, test, scaler_model = 1, cont_columns = ['sqft'])

In [9]:
X_train, y_train, X_val, y_val, X_test, y_test = train_val_test(train, val, test, target_col)

### Baseline <a class="anchor" id="baseline"></a>

In [10]:
#Eastablishes the standard to beat
baseline = find_baseline(y_train)

RMSE Mean training baseline: 369,346
*****************************************
RMSE Median training baseline: 379,222


In [11]:
round(y_train.describe(),0)

count      28389.0
mean      446632.0
std       369352.0
min         9242.0
25%       186564.0
50%       360647.0
75%       586162.0
max      2381729.0
Name: tax_value, dtype: float64

In [12]:
#metrics
rmse_scores = []

### Model 1 <a class="anchor" id="mod_1"></a>

#### LinearRegression(OLS)

In [13]:
lm = LinearRegression(normalize=True)

In [14]:
lm.fit(X_train, y_train)

LinearRegression(normalize=True)

In [15]:
lm_preds = pd.DataFrame({'actual':y_train})

In [16]:
lm_preds['pred_lm'] = lm.predict(X_train)

In [17]:
lm_val_preds = pd.DataFrame({'actual':y_val})

In [18]:
lm_val_preds['lm_val_preds'] = lm.predict(X_val)

In [19]:
lm_val_preds.head()

Unnamed: 0,actual,lm_val_preds
32923,89544.0,285184.0
8931,525000.0,766976.0
44110,980125.0,595456.0
33210,581186.0,399872.0
2026,573616.0,350720.0


In [20]:
rmse_train = mean_squared_error(lm_preds['actual'], lm_preds['pred_lm'], squared=False) 

In [21]:
rmse_val = mean_squared_error(lm_val_preds['actual'], lm_val_preds['lm_val_preds'], squared=False) 

In [22]:
rmse_scores.append({'Model':'OLS Linear Regression',
                    'RMSE on Train': rmse_train,
                    'RMSE on Validate': rmse_val})

In [23]:
rmse_scores

[{'Model': 'OLS Linear Regression',
  'RMSE on Train': 300476.7586401175,
  'RMSE on Validate': 299644.9906544258}]

### Model 2 <a class="anchor" id="mod_2"></a>

#### LassoLars

In [24]:
lars = LassoLars(alpha=.01)

In [25]:
lars.fit(X_train, y_train)

LassoLars(alpha=0.01)

In [26]:
ll_preds = pd.DataFrame({'actual':y_train})

In [27]:
ll_preds['pred_ll'] = lars.predict(X_train)

In [28]:
ll_val_preds = pd.DataFrame({'actual':y_val})

In [29]:
ll_val_preds['ll_val_preds'] = lars.predict(X_val)

In [30]:
rmse_train = mean_squared_error(ll_preds['actual'], ll_preds['pred_ll'], squared=False) 

In [31]:
rmse_val = mean_squared_error(ll_val_preds['actual'], ll_val_preds['ll_val_preds'], squared=False) 

In [32]:
rmse_scores.append({'Model':'Lasso Lars',
                    'RMSE on Train': rmse_train,
                    'RMSE on Validate': rmse_val})

In [33]:
rmse_scores

[{'Model': 'OLS Linear Regression',
  'RMSE on Train': 300476.7586401175,
  'RMSE on Validate': 299644.9906544258},
 {'Model': 'Lasso Lars',
  'RMSE on Train': 300469.5811996958,
  'RMSE on Validate': 299639.1281171493}]

### Model 3 <a class="anchor" id="mod_3"></a>

#### TweedieRegressor (GLM)

In [34]:
glm = TweedieRegressor(power=1, alpha=1)

In [35]:
glm.fit(X_train, y_train)

TweedieRegressor(alpha=1, power=1)

In [36]:
glm_preds = pd.DataFrame({'actual':y_train})

In [37]:
glm_preds['pred_glm'] = glm.predict(X_train)

In [38]:
glm_val_preds = pd.DataFrame({'actual':y_val})

In [39]:
glm_val_preds['glm_val_preds'] = glm.predict(X_val)

In [40]:
rmse_train = mean_squared_error(glm_preds['actual'], glm_preds['pred_glm'], squared=False) 

In [41]:
rmse_val = mean_squared_error(glm_val_preds['actual'], glm_val_preds['glm_val_preds'], squared=False) 

In [42]:
rmse_scores.append({'Model':'TweedieRegressor',
                    'RMSE on Train': rmse_train,
                    'RMSE on Validate': rmse_val})

In [43]:
rmse_scores

[{'Model': 'OLS Linear Regression',
  'RMSE on Train': 300476.7586401175,
  'RMSE on Validate': 299644.9906544258},
 {'Model': 'Lasso Lars',
  'RMSE on Train': 300469.5811996958,
  'RMSE on Validate': 299639.1281171493},
 {'Model': 'TweedieRegressor',
  'RMSE on Train': 302024.4373588197,
  'RMSE on Validate': 301787.1820815464}]

### Model 4 <a class="anchor" id="mod_4"></a>

#### Polynomial Regression

In [44]:
# make the polynomial features to get a new set of features
pf = PolynomialFeatures(degree=1)

In [45]:
# fit and transform X_train_scaled
X_train_degree2 = pf.fit_transform(X_train)

In [46]:
# transform X_validate_scaled & X_test_scaled
X_validate_degree2 = pf.transform(X_val)
X_test_degree2 = pf.transform(X_test)

#### Linear Regression

In [47]:
lm2 = LinearRegression(normalize=True)

In [48]:
lm2.fit(X_train_degree2, y_train)

LinearRegression(normalize=True)

In [49]:
lm2_preds = pd.DataFrame({'actual':y_train})

In [50]:
lm2_preds['pred_lm2'] = lm2.predict(X_train_degree2)

In [51]:
lm2_val_preds = pd.DataFrame({'actual':y_val})

In [52]:
lm2_val_preds['lm2_val_preds'] = lm2.predict(X_validate_degree2)

In [53]:
rmse_train = mean_squared_error(lm2_preds['actual'], lm2_preds['pred_lm2'], squared=False) 

In [54]:
rmse_val = mean_squared_error(lm2_val_preds['actual'], lm2_val_preds['lm2_val_preds'], squared=False) 

In [55]:
rmse_scores.append({'Model':'Polynomial Regression',
                    'RMSE on Train': rmse_train,
                    'RMSE on Validate': rmse_val})

In [56]:
rmse_scores

[{'Model': 'OLS Linear Regression',
  'RMSE on Train': 300476.7586401175,
  'RMSE on Validate': 299644.9906544258},
 {'Model': 'Lasso Lars',
  'RMSE on Train': 300469.5811996958,
  'RMSE on Validate': 299639.1281171493},
 {'Model': 'TweedieRegressor',
  'RMSE on Train': 302024.4373588197,
  'RMSE on Validate': 301787.1820815464},
 {'Model': 'Polynomial Regression',
  'RMSE on Train': 300477.44983873144,
  'RMSE on Validate': 299687.2503200106}]