<div class="alert alert-block alert-warning">

# Modeling Exercises

In [1]:
# Imports

import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

import time

#from scipy import stats
#from math import sqrt

from sklearn.linear_model import LinearRegression, LassoLars, TweedieRegressor
from sklearn.metrics import explained_variance_score, mean_squared_error
from sklearn.preprocessing import PolynomialFeatures
from sklearn.ensemble import RandomForestRegressor

from xgboost import XGBRegressor

from wrangle import zillow_pipeline, split_train_val_test, scale_train_val_test, xy_split, scale_train_val_test2

from explore import plot_variable_pairs, plot_categorical_and_continuous_vars

from evaluate import create_model, dataframe_model, plot_residuals, regression_errors, baseline_mean_errors, better_than_baseline

from model import eval_model, train_model, estimate_t_time

from features import make_features

import os

<div class="alert alert-block alert-success">

Do your work for this exercise in a jupyter notebook named modeling within the regression-exercises repo. Add, commit, and push your work.


<div class="alert alert-block alert-info">

1. Select a dataset with a continuous target variable.


In [2]:
# Load the dataset through the project's wrangle pipeline and preview the first rows.
# tax_value is the continuous target used for the rest of the notebook.
df = zillow_pipeline()
df.head()

Unnamed: 0,bedrooms,bathrooms,area,tax_value,year_built,fips
4,4,2.0,3633,296425,2005,6037
6,3,4.0,1620,847770,2011,6037
7,3,2.0,2077,646760,1926,6037
18,3,1.0,1244,169471,1950,6037
19,3,2.0,1300,233266,1950,6037


<div class="alert alert-block alert-info">

2. Be sure your data is prepared (no missing values, numeric datatypes) and split into samples.


In [3]:
# Split the full frame into train / validation / test subsets, then confirm their sizes.
train, val, test = split_train_val_test(df)

train.shape, val.shape, test.shape

((1494436, 6), (320236, 6), (320237, 6))

In [4]:
# Rescale the three splits with the project's scaling helper; each name is rebound to the
# helper's output, so re-running this cell alone would feed already-scaled frames back in.
train, val, test = scale_train_val_test(train, val, test)

<div class="alert alert-block alert-info">

3. Work through all of the steps outlined in the lesson, from setting the baseline to selecting a model and evaluating the final model on your test data.

In [5]:
# Separate the predictors (X) from the target (y) for the train and validation splits.
X_train, y_train = xy_split(train)
X_val, y_val = xy_split(val)

In [6]:
# One-hot encode the county code (fips) on each split.
X_train = pd.get_dummies(X_train, columns=['fips'], prefix='county')
X_val = pd.get_dummies(X_val, columns=['fips'], prefix='county')
# Each split is encoded independently, so a county missing from validation (or present only
# there) would produce a different column set. Align validation to the training layout so
# every fitted model sees exactly the features it was trained on.
X_val = X_val.reindex(columns=X_train.columns, fill_value=False)
X_train.shape, X_val.shape

((1494436, 7), (320236, 7))

In [7]:
# Preview the encoded training features.
X_train.head()

Unnamed: 0,bedrooms,bathrooms,area,year_built,county_6037,county_6059,county_6111
177512,3,2.0,0.00131,0.693023,True,False,False
1273481,4,4.0,0.002767,0.962791,True,False,False
1374916,4,3.0,0.002647,0.865116,True,False,False
1754843,4,2.0,0.001553,0.739535,False,True,False
435736,5,3.0,0.002611,0.693023,True,False,False


In [8]:
# Candidate baseline predictions: mean vs. median of the training target.
y_train.mean(), y_train.median()

(460730.7356527814, 328884.0)

In [9]:
# Put the actual target next to the two constant baseline predictions
# (training mean and training median) so each can be scored against y_actual.
baselines = pd.DataFrame({'y_actual': y_train}).assign(
    y_mean=y_train.mean(),
    y_median=y_train.median(),
)

baselines.head()

Unnamed: 0,y_actual,y_mean,y_median
177512,414000,460730.735653,328884.0
1273481,896029,460730.735653,328884.0
1374916,365817,460730.735653,328884.0
1754843,191464,460730.735653,328884.0
435736,376667,460730.735653,328884.0


In [12]:
# RMSE of always predicting the training mean.
eval_model(baselines.y_actual, baselines.y_mean)

663705.714944513

In [13]:
# RMSE of always predicting the training median.
eval_model(baselines.y_actual, baselines.y_median)

676674.839012285

We are going to evaluate our models using RMSE. Our baseline is 663705.714944513 using the mean, which edges out the median baseline (676674.839012285).

In [14]:
# Ordinary least squares with scikit-learn's default settings.
lm = LinearRegression()

In [15]:
# Run the model through train_model, which prints the train and validate RMSE.
train_model(lm, X_train, y_train, X_val, y_val)

The train RMSE is 549870.5131981942.
The validate RMSE is 572804.2379270114.


[LASSO LARS](https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LassoLars.html) is next. Let's play around with alpha.

In [16]:
# alpha=0 switches off the L1 penalty, which reduces LassoLars to ordinary least squares
# (scikit-learn recommends LinearRegression for that case) -- hence the RMSE nearly identical
# to the linear model above. Raise alpha to actually see the effect of regularization.
ll = LassoLars(alpha=0)

train_model(ll, X_train, y_train, X_val, y_val)

The train RMSE is 549870.3875105121.
The validate RMSE is 572809.646382811.


Let's do some [polynomial regression](https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.PolynomialFeatures.html) next.

In [17]:
# Expand the features with PolynomialFeatures defaults (degree-2 terms, per scikit-learn docs).
# The transformer is fit on train only and then reused on validation to avoid leakage.
poly = PolynomialFeatures()
X_train_s = poly.fit_transform(X_train)
X_val_s = poly.transform(X_val)

In [18]:
# Polynomial regression: a fresh linear model on the expanded features (rebinds `lm`).
lm = LinearRegression()

train_model(lm, X_train_s, y_train, X_val_s, y_val)

The train RMSE is 483994.6118987415.
The validate RMSE is 505541.6334893075.


The [TweedieRegressor](https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.TweedieRegressor.html) is the most flexible algorithm from the curriculum.

In [30]:
# TweedieRegressor with default settings, scored on the polynomial features.
# NOTE(review): per scikit-learn docs the defaults are power=0 (Normal) with alpha=1, i.e. a
# penalized linear model; other `power` values were not explored here -- confirm intent.
tweedie = TweedieRegressor()

train_model(tweedie, X_train_s, y_train, X_val_s, y_val)

The train RMSE is 530425.9403463455.
The validate RMSE is 554858.4469507411.


Let's have some fun with the [RandomForestRegressor](https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestRegressor.html) next.

First, on the original X_train.

In [31]:
# Random forest on the original (non-polynomial) features.
# random_state is pinned so the forest's random draws are reproducible across re-runs;
# 42 matches the seed already used for sampling later in this notebook.
rf = RandomForestRegressor(random_state=42)

train_model(rf, X_train, y_train, X_val, y_val)

The train RMSE is 29227.908420992466.
The validate RMSE is 90497.76511628271.


In [None]:
# Random forest on the polynomial features.
# NOTE(review): this cell has no execution count or recorded output in the notebook.
# random_state is pinned so results are reproducible once it is run.
rf = RandomForestRegressor(random_state=42)

train_model(rf, X_train_s, y_train, X_val_s, y_val)

!pip install xgboost

Finally, a little [xgboost](https://xgboost.readthedocs.io/en/stable/python/python_api.html#xgboost.XGBRegressor) to finish things off.

In [19]:
# Gradient-boosted trees with XGBoost defaults on the original (non-polynomial) features.
xgbr = XGBRegressor()

train_model(xgbr, X_train, y_train, X_val, y_val)

The train RMSE is 422935.91958692373.
The validate RMSE is 511855.34531351126.


Now on the poly X_train_s

In [84]:
# Presumably fits `rf` on a 0.5% subsample (subsample_fraction=0.005) of the polynomial
# features to project the full training time -- confirm against the helper's definition.
# NOTE(review): the imports cell brings in `estimate_t_time` from model, not
# `estimate_full_training_time`; unless the latter is defined elsewhere, this raises
# NameError on a fresh kernel -- confirm which name is intended.
estimate_full_training_time(rf, X_train_s, y_train, X_val_s, y_val, subsample_fraction=0.005)

The train RMSE is 83816.13514616969.
The validate RMSE is 96208.43081623212.


'00:49'

<div class="alert alert-block alert-info">

4. Bring in the features created before to see if they improve the model.


In [2]:
# Reload the data so the engineered-feature experiment starts from a fresh, unscaled frame.
df = zillow_pipeline()
df.head()

Unnamed: 0,bedrooms,bathrooms,area,tax_value,year_built,fips
4,4,2.0,3633,296425,2005,6037
6,3,4.0,1620,847770,2011,6037
7,3,2.0,2077,646760,1926,6037
18,3,1.0,1244,169471,1950,6037
19,3,2.0,1300,233266,1950,6037


In [3]:
# Add the engineered columns from the project's features module
# (room totals, property age, county dummies, ratios and two categorical bins -- see preview).
df = make_features(df)
df.head()

Unnamed: 0,bedrooms,bathrooms,area,tax_value,year_built,total_rooms,property_age,county_6037,county_6059,county_6111,size_per_bedroom,bathroom_to_bedroom_ratio,property_age_group,property_size_category
4,4,2.0,3633,296425,2005,6.0,18,True,False,False,908.25,0.5,new,large
6,3,4.0,1620,847770,2011,7.0,12,True,False,False,540.0,1.333333,very_new,medium
7,3,2.0,2077,646760,1926,5.0,97,True,False,False,692.333333,0.666667,old,medium
18,3,1.0,1244,169471,1950,4.0,73,True,False,False,414.666667,0.333333,mid-aged,medium
19,3,2.0,1300,233266,1950,5.0,73,True,False,False,433.333333,0.666667,mid-aged,medium


In [4]:
# One-hot encode the two categorical bins. Encoding happens before the split, so
# train / validation / test all end up with the same dummy columns.
df = pd.get_dummies(df, columns=['property_size_category', 'property_age_group'])
df.head()

Unnamed: 0,bedrooms,bathrooms,area,tax_value,year_built,total_rooms,property_age,county_6037,county_6059,county_6111,size_per_bedroom,bathroom_to_bedroom_ratio,property_size_category_small,property_size_category_medium,property_size_category_large,property_age_group_very_new,property_age_group_new,property_age_group_mid-aged,property_age_group_old,property_age_group_very_old
4,4,2.0,3633,296425,2005,6.0,18,True,False,False,908.25,0.5,False,False,True,False,True,False,False,False
6,3,4.0,1620,847770,2011,7.0,12,True,False,False,540.0,1.333333,False,True,False,True,False,False,False,False
7,3,2.0,2077,646760,1926,5.0,97,True,False,False,692.333333,0.666667,False,True,False,False,False,False,True,False
18,3,1.0,1244,169471,1950,4.0,73,True,False,False,414.666667,0.333333,False,True,False,False,False,True,False,False
19,3,2.0,1300,233266,1950,5.0,73,True,False,False,433.333333,0.666667,False,True,False,False,False,True,False,False


In [23]:
# Work on a seeded 20% sample of the rows.
# NOTE: this overwrites `df`, so re-running this cell on its own samples 20% of the
# already-sampled frame; re-run from the load cell to get back to the full data.
df = df.sample(frac=0.2, random_state=42)

In [24]:
# Split the sampled, feature-engineered frame into train / validation / test and check sizes.
train, val, test = split_train_val_test(df)

train.shape, val.shape, test.shape

((298887, 20), (64047, 20), (64048, 20))

In [25]:
# Rescale the three splits with the second scaling helper; each name is rebound to the
# helper's output, so re-running this cell alone would feed already-scaled frames back in.
train, val, test = scale_train_val_test2(train, val, test)

In [32]:
# List the columns available after feature engineering and scaling.
train.columns

Index(['bedrooms', 'bathrooms', 'area', 'tax_value', 'year_built',
       'total_rooms', 'property_age', 'county_6037', 'county_6059',
       'county_6111', 'size_per_bedroom', 'bathroom_to_bedroom_ratio',
       'property_size_category_small', 'property_size_category_medium',
       'property_size_category_large', 'property_age_group_very_new',
       'property_age_group_new', 'property_age_group_mid-aged',
       'property_age_group_old', 'property_age_group_very_old'],
      dtype='object')

In [26]:
# Preview the scaled training split (tax_value, the target, keeps its original units).
train.head()

Unnamed: 0,bedrooms,bathrooms,area,tax_value,year_built,total_rooms,property_age,county_6037,county_6059,county_6111,size_per_bedroom,bathroom_to_bedroom_ratio,property_size_category_small,property_size_category_medium,property_size_category_large,property_age_group_very_new,property_age_group_new,property_age_group_mid-aged,property_age_group_old,property_age_group_very_old
1461410,0.266667,0.102564,0.069842,347876,0.63388,0.183333,0.36612,0.0,1.0,0.0,0.046293,0.045455,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
969587,0.2,0.230769,0.117558,2475000,0.857923,0.233333,0.142077,0.0,0.0,1.0,0.097405,0.140909,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0
33412,0.066667,0.025641,0.02559,60629,0.639344,0.033333,0.360656,1.0,0.0,0.0,0.042432,0.045455,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
1204860,0.133333,0.076923,0.03886,150669,0.781421,0.1,0.218579,0.0,1.0,0.0,0.042941,0.066667,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
1200175,0.066667,0.025641,0.023355,124306,0.644809,0.033333,0.355191,1.0,0.0,0.0,0.038729,0.045455,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0


In [27]:
# Separate the predictors (X) from the target (y) for the train and validation splits.
X_train, y_train = xy_split(train)
X_val, y_val = xy_split(val)

In [28]:
# Preview the training features (the target column has been removed).
X_train.head()

Unnamed: 0,bedrooms,bathrooms,area,year_built,total_rooms,property_age,county_6037,county_6059,county_6111,size_per_bedroom,bathroom_to_bedroom_ratio,property_size_category_small,property_size_category_medium,property_size_category_large,property_age_group_very_new,property_age_group_new,property_age_group_mid-aged,property_age_group_old,property_age_group_very_old
1461410,0.266667,0.102564,0.069842,0.63388,0.183333,0.36612,0.0,1.0,0.0,0.046293,0.045455,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
969587,0.2,0.230769,0.117558,0.857923,0.233333,0.142077,0.0,0.0,1.0,0.097405,0.140909,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0
33412,0.066667,0.025641,0.02559,0.639344,0.033333,0.360656,1.0,0.0,0.0,0.042432,0.045455,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
1204860,0.133333,0.076923,0.03886,0.781421,0.1,0.218579,0.0,1.0,0.0,0.042941,0.066667,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
1200175,0.066667,0.025641,0.023355,0.644809,0.033333,0.355191,1.0,0.0,0.0,0.038729,0.045455,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0


In [29]:
# Rebuild the baseline frame for the sampled data: actual target alongside the two
# constant predictions (training mean and training median).
baselines = pd.DataFrame({'y_actual': y_train}).assign(
    y_mean=y_train.mean(),
    y_median=y_train.median(),
)

baselines.head()

Unnamed: 0,y_actual,y_mean,y_median
1461410,347876,460701.683014,328920.0
969587,2475000,460701.683014,328920.0
33412,60629,460701.683014,328920.0
1204860,150669,460701.683014,328920.0
1200175,124306,460701.683014,328920.0


In [30]:
# RMSE of always predicting the training mean (on the 20% sample).
eval_model(baselines.y_actual, baselines.y_mean)

672959.3473815045

In [31]:
# RMSE of always predicting the training median (on the 20% sample).
eval_model(baselines.y_actual, baselines.y_median)

685740.9825919245

In [32]:
# Ordinary least squares with scikit-learn's default settings.
lm = LinearRegression()

In [33]:
# Run the model on the engineered features; train_model prints train and validate RMSE.
train_model(lm, X_train, y_train, X_val, y_val)

The train RMSE is 511018.28577097825.
The validate RMSE is 579772.8473757592.


In [34]:
# alpha=0 switches off the L1 penalty, which reduces LassoLars to ordinary least squares
# (scikit-learn recommends LinearRegression for that case); raise alpha to regularize.
ll = LassoLars(alpha=0)

train_model(ll, X_train, y_train, X_val, y_val)

The train RMSE is 511018.98734197556.
The validate RMSE is 579763.2326655082.


In [35]:
# Expand the engineered features with PolynomialFeatures defaults (degree-2 terms).
# The transformer is fit on train only and then reused on validation to avoid leakage.
poly = PolynomialFeatures()
X_train_s = poly.fit_transform(X_train)
X_val_s = poly.transform(X_val)

In [36]:
# Number of columns produced by the polynomial expansion.
X_train_s.shape[1]

210

In [37]:
# Rows and columns before the expansion, for comparison with the width above.
X_train.shape

(298887, 19)

In [38]:
# Polynomial regression: a fresh linear model on the expanded features (rebinds `lm`).
lm = LinearRegression()

train_model(lm, X_train_s, y_train, X_val_s, y_val)

The train RMSE is 478820.83399433625.
The validate RMSE is 577739.0449383066.


In [39]:
# Random forest on the polynomial expansion of the engineered features.
# random_state is pinned so the forest's random draws are reproducible across re-runs;
# 42 matches the seed used when sampling the data in this notebook.
rf = RandomForestRegressor(random_state=42)

train_model(rf, X_train_s, y_train, X_val_s, y_val)

The train RMSE is 220783.99866403.
The validate RMSE is 595587.931411244.


In [40]:
# Gradient-boosted trees with XGBoost defaults on the engineered (non-polynomial) features.
xgbr = XGBRegressor()

train_model(xgbr, X_train, y_train, X_val, y_val)

The train RMSE is 374791.9932221708.
The validate RMSE is 685190.1691635076.
