<a id="TableOfContents"></a>
# TABLE OF CONTENTS:
<li><a href='#imports'>Imports</a></li>
<li><a href='#setup'>Setup</a></li>
<li><a href='#lr'>Linear Regression</a></li>
<li><a href="#ll">LassoLars</a></li>
<li><a href='#tdr'>TweedieRegressor</a></li>
<li><a href='#pnr'>Polynomial Regression</a></li>
<li><a href='#top'>Top Model</a></li>
<li><a href='#extra'>Extra</a></li>

<a id='imports'></a>
# Imports:
<li><a href='#TableOfContents'>Table of Contents</a></li>

In [1]:
# Vectorization and tables
import numpy as np
import pandas as pd

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Datasets
from pydataset import data

# sklearn
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LassoLars
from sklearn.linear_model import TweedieRegressor
from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import RobustScaler
from sklearn.metrics import explained_variance_score
from sklearn.metrics import mean_squared_error

# .py files
import wrangle

<a id='setup'></a>
# Setup:
<li><a href='#TableOfContents'>Table of Contents</a></li>

In [2]:
# Acquire, prep, and split zillow dataframe.
# wrangle_zillow_mvp comes from the project-local wrangle.py (not visible here);
# its result is unpacked into the three splits used throughout this notebook.
train, validate, test = wrangle.wrangle_zillow_mvp()

train.shape:(29366, 10)
validate.shape:(12586, 10)
test.shape:(10489, 10)


In [3]:
# Columns handed to the scaler (the target `value` is not in this list)
scale_cols = ['bedrooms', 'home_sqft', 'full_bathrooms',
              'lotsize_sqft', 'home_age', 'home_lot_ratio']

# Scale those columns on all three splits through the wrangle helper,
# passing it a RobustScaler instance
train_scale, validate_scale, test_scale = wrangle.scale(
    train,
    validate,
    test,
    scale_cols,
    RobustScaler(),
)

# Spot-check one randomly drawn row of the scaled training frame
train_scale.sample()

Unnamed: 0,bedrooms,home_sqft,full_bathrooms,lotsize_sqft,home_age,value,home_lot_ratio,county_Los Angeles,county_Orange,county_Ventura
19172,-1.0,-0.229572,-1.0,-0.210041,1.068966,636366.0,0.0,1,0,0


In [4]:
# Predictor columns fed to every model below
features = (
    'bedrooms home_sqft full_bathrooms '
    'lotsize_sqft home_age home_lot_ratio'
).split()

In [5]:
# Display the selected feature names
features

['bedrooms',
 'home_sqft',
 'full_bathrooms',
 'lotsize_sqft',
 'home_age',
 'home_lot_ratio']

In [6]:
# Predictors: the selected feature columns of each scaled split
x_train, x_validate, x_test = (
    scaled[features] for scaled in (train_scale, validate_scale, test_scale)
)

# Targets: `value` from each unscaled split, wrapped in a DataFrame so that
# baseline and prediction columns can be added alongside it later
y_train, y_validate, y_test = (
    pd.DataFrame(unscaled.value) for unscaled in (train, validate, test)
)

In [7]:
# Both candidate baselines are computed from train only, then written as
# constant columns onto the train and validate target frames
mean_baseline = round(train.value.mean(), 2)
median_baseline = train.value.median()
for targets in (y_train, y_validate):
    targets['mean_baseline'] = mean_baseline
    targets['median_baseline'] = median_baseline

# RMSE (square root of MSE) of each baseline on each split
rmse_train_mean = mean_squared_error(y_train.value, y_train.mean_baseline) ** .5
rmse_validate_mean = mean_squared_error(y_validate.value, y_validate.mean_baseline) ** .5
rmse_train_median = mean_squared_error(y_train.value, y_train.median_baseline) ** .5
rmse_validate_median = mean_squared_error(y_validate.value, y_validate.median_baseline) ** .5

# Report train, validate, and the validate-minus-train gap for each baseline
print('\033[35m ========== MEAN RMSE SCORES ==========\033[0m')
print(f'\033[32mTrain:\033[0m {rmse_train_mean}\n'
      f'\033[32mValidate:\033[0m {rmse_validate_mean}\n'
      f'\033[32mDifference:\033[0m {rmse_validate_mean - rmse_train_mean}')
print('\n\n\033[35m ========== MEDIAN RMSE SCORES ==========\033[0m')
print(f'\033[32mTrain:\033[0m {rmse_train_median}\n'
      f'\033[32mValidate:\033[0m {rmse_validate_median}\n'
      f'\033[32mDifference:\033[0m {rmse_validate_median - rmse_train_median}')

[32mTrain:[0m 729581.0514330696
[32mValidate:[0m 849736.225480552
[32mDifference:[0m 120155.17404748232


[32mTrain:[0m 745719.9188036239
[32mValidate:[0m 865036.7172253283
[32mDifference:[0m 119316.79842170444


##### Mean is better: it has the lower RMSE on both train and validate, so the mean is used as the baseline

In [8]:
# Scoreboard: one list per metric, one entry per model; seeded with the
# mean baseline.
# NOTE(review): 'validate_r2' is filled with explained_variance_score, which
# matches R^2 only when the residuals average to zero -- confirm this is the
# intended metric for a column named r2.
baseline_gap = rmse_validate_mean - rmse_train_mean
baseline_evs = explained_variance_score(y_validate.value,
                                        y_validate.mean_baseline)
models_dict = dict(
    model_name=['baseline_mean'],
    train_rmse=[round(rmse_train_mean, 2)],
    validate_rmse=[round(rmse_validate_mean, 2)],
    difference=[round(baseline_gap, 2)],
    validate_r2=[round(baseline_evs, 2)],
)

In [9]:
# Render the scoreboard as a table (baseline row only at this point)
pd.DataFrame(models_dict)

Unnamed: 0,model_name,train_rmse,validate_rmse,difference,validate_r2
0,baseline_mean,729581.05,849736.23,120155.17,0.0


<a id='lr'></a>
# Linear Regression:
<li><a href='#TableOfContents'>Table of Contents</a></li>

In [10]:
# Fit ordinary least squares on the scaled training features
lr = LinearRegression().fit(x_train, y_train.value)

# Store the predictions beside the actual values for both splits
for targets, inputs in ((y_train, x_train), (y_validate, x_validate)):
    targets['lr_pred'] = lr.predict(inputs)

# RMSE is the square root of the mean squared error
lr_rmse_train = mean_squared_error(y_train.value, y_train.lr_pred) ** .5
lr_rmse_validate = mean_squared_error(y_validate.value, y_validate.lr_pred) ** .5

# Add the 'lr' row to the scoreboard, one metric at a time
lr_scores = {
    'model_name': 'lr',
    'train_rmse': round(lr_rmse_train, 2),
    'validate_rmse': round(lr_rmse_validate, 2),
    'difference': round(lr_rmse_validate - lr_rmse_train, 2),
    'validate_r2': round(explained_variance_score(y_validate.value,
                                                  y_validate.lr_pred), 2),
}
for metric, score in lr_scores.items():
    models_dict[metric].append(score)

# Check models_dict
pd.DataFrame(models_dict)

Unnamed: 0,model_name,train_rmse,validate_rmse,difference,validate_r2
0,baseline_mean,729581.05,849736.23,120155.17,0.0
1,lr,568509.46,670163.46,101654.0,0.38


<a id='ll'></a>
# LassoLars:
<li><a href='#TableOfContents'>Table of Contents</a></li>

In [11]:
# LassoLars with alpha=1, fitted on the scaled training features
ll1 = LassoLars(alpha=1).fit(x_train, y_train.value)

# Store the predictions beside the actual values for both splits
for targets, inputs in ((y_train, x_train), (y_validate, x_validate)):
    targets['ll_pred_1.0'] = ll1.predict(inputs)

# RMSE is the square root of the mean squared error
ll_rmse_train = mean_squared_error(y_train.value, y_train['ll_pred_1.0']) ** .5
ll_rmse_validate = mean_squared_error(y_validate.value, y_validate['ll_pred_1.0']) ** .5

# Add the 'll_1.0' row to the scoreboard
ll1_scores = {
    'model_name': 'll_1.0',
    'train_rmse': round(ll_rmse_train, 2),
    'validate_rmse': round(ll_rmse_validate, 2),
    'difference': round(ll_rmse_validate - ll_rmse_train, 2),
    'validate_r2': round(explained_variance_score(y_validate.value,
                                                  y_validate['ll_pred_1.0']), 2),
}
for metric, score in ll1_scores.items():
    models_dict[metric].append(score)

If you wish to scale the data, use Pipeline with a StandardScaler in a preprocessing stage. To reproduce the previous behavior:

from sklearn.pipeline import make_pipeline

model = make_pipeline(StandardScaler(with_mean=False), LassoLars())

If you wish to pass a sample_weight parameter, you need to pass it as a fit parameter to each step of the pipeline as follows:

kwargs = {s[0] + '__sample_weight': sample_weight for s in model.steps}
model.fit(X, y, **kwargs)

Set parameter alpha to: original_alpha * np.sqrt(n_samples). 


In [12]:
# LassoLars again, this time with a weaker penalty (alpha=0.1)
ll01 = LassoLars(alpha=0.1).fit(x_train, y_train.value)

# Store the predictions beside the actual values for both splits
for targets, inputs in ((y_train, x_train), (y_validate, x_validate)):
    targets['ll_pred_0.1'] = ll01.predict(inputs)

# RMSE is the square root of the mean squared error
ll_rmse_train01 = mean_squared_error(y_train.value, y_train['ll_pred_0.1']) ** .5
ll_rmse_validate01 = mean_squared_error(y_validate.value, y_validate['ll_pred_0.1']) ** .5

# Add the 'll_0.1' row to the scoreboard
ll01_scores = {
    'model_name': 'll_0.1',
    'train_rmse': round(ll_rmse_train01, 2),
    'validate_rmse': round(ll_rmse_validate01, 2),
    'difference': round(ll_rmse_validate01 - ll_rmse_train01, 2),
    'validate_r2': round(explained_variance_score(y_validate.value,
                                                  y_validate['ll_pred_0.1']), 2),
}
for metric, score in ll01_scores.items():
    models_dict[metric].append(score)

If you wish to scale the data, use Pipeline with a StandardScaler in a preprocessing stage. To reproduce the previous behavior:

from sklearn.pipeline import make_pipeline

model = make_pipeline(StandardScaler(with_mean=False), LassoLars())

If you wish to pass a sample_weight parameter, you need to pass it as a fit parameter to each step of the pipeline as follows:

kwargs = {s[0] + '__sample_weight': sample_weight for s in model.steps}
model.fit(X, y, **kwargs)

Set parameter alpha to: original_alpha * np.sqrt(n_samples). 


In [13]:
# Check 'models_dict': the scoreboard now includes both LassoLars fits
# (alpha=1 and alpha=0.1)
pd.DataFrame(models_dict)

Unnamed: 0,model_name,train_rmse,validate_rmse,difference,validate_r2
0,baseline_mean,729581.05,849736.23,120155.17,0.0
1,lr,568509.46,670163.46,101654.0,0.38
2,ll_1.0,568509.74,670204.39,101694.66,0.38
3,ll_0.1,568509.46,670167.54,101658.07,0.38


<a id='tdr'></a>
# TweedieRegressor:
<li><a href='#TableOfContents'>Table of Contents</a></li>

In [14]:
# TweedieRegressor with power=0 and alpha=1, fitted on the scaled training features
tdr0 = TweedieRegressor(power=0, alpha=1).fit(x_train, y_train.value)

# Store the predictions beside the actual values for both splits
for targets, inputs in ((y_train, x_train), (y_validate, x_validate)):
    targets['tdr_pow0_a1'] = tdr0.predict(inputs)

# RMSE is the square root of the mean squared error
tdr_rmse_train0_1 = mean_squared_error(y_train.value, y_train['tdr_pow0_a1']) ** .5
tdr_rmse_validate0_1 = mean_squared_error(y_validate.value, y_validate['tdr_pow0_a1']) ** .5

# Add the 'tdr_pow0_a1' row to the scoreboard
tdr0_scores = {
    'model_name': 'tdr_pow0_a1',
    'train_rmse': round(tdr_rmse_train0_1, 2),
    'validate_rmse': round(tdr_rmse_validate0_1, 2),
    'difference': round(tdr_rmse_validate0_1 - tdr_rmse_train0_1, 2),
    'validate_r2': round(explained_variance_score(y_validate.value,
                                                  y_validate['tdr_pow0_a1']), 2),
}
for metric, score in tdr0_scores.items():
    models_dict[metric].append(score)

In [15]:
# Fit TweedieRegressor for power = 1, 2, and 3 (alpha = 1 throughout).
# All three fits go through one helper so the power handed to the model and
# the power embedded in the column / model name cannot drift apart.
# NOTE(review): the recorded output of this cell shows overflow warnings and an
# lbfgs "ABNORMAL_TERMINATION_IN_LNSRCH" message; the warning itself suggests
# raising max_iter or scaling the data -- worth following up.
def _fit_and_score_tdr(power, alpha=1):
    """Fit one TweedieRegressor, store its predictions, and log its scores.

    Adds a 'tdr_pow{power}_a{alpha}' prediction column to y_train and
    y_validate, then appends one row of metrics to models_dict.

    Parameters
    ----------
    power : value passed as TweedieRegressor's `power`.
    alpha : value passed as TweedieRegressor's `alpha` (default 1).

    Returns
    -------
    tuple
        (fitted model, train RMSE, validate RMSE), all unrounded.
    """
    name = f'tdr_pow{power}_a{alpha}'
    model = TweedieRegressor(power=power, alpha=alpha)
    model.fit(x_train, y_train.value)
    y_train[name] = model.predict(x_train)
    y_validate[name] = model.predict(x_validate)

    # RMSE is the square root of the mean squared error
    rmse_train = mean_squared_error(y_train.value, y_train[name]) ** .5
    rmse_validate = mean_squared_error(y_validate.value, y_validate[name]) ** .5

    models_dict['model_name'].append(name)
    models_dict['train_rmse'].append(round(rmse_train, 2))
    models_dict['validate_rmse'].append(round(rmse_validate, 2))
    models_dict['difference'].append(round(rmse_validate - rmse_train, 2))
    models_dict['validate_r2'].append(round(explained_variance_score(y_validate.value,
                                                                     y_validate[name]), 2))
    return model, rmse_train, rmse_validate


# tdr power = 1
tdr1, tdr_rmse_train1_1, tdr_rmse_validate1_1 = _fit_and_score_tdr(power=1)

# tdr power = 2
tdr2, tdr_rmse_train2_1, tdr_rmse_validate2_1 = _fit_and_score_tdr(power=2)

# tdr power = 3 (power must be 3 here so the fit matches the 'tdr_pow3_a1' label)
tdr3, tdr_rmse_train3_1, tdr_rmse_validate3_1 = _fit_and_score_tdr(power=3)

  return np.exp(lin_pred)
  return np.exp(lin_pred)
  return -2 * (y - y_pred) / self.unit_variance(y_pred)
  dev = 2 * (xlogy(y, y / y_pred) - y + y_pred)
  return -2 * (y - y_pred) / self.unit_variance(y_pred)
  dev = 2 * (xlogy(y, y / y_pred) - y + y_pred)
  return -2 * (y - y_pred) / self.unit_variance(y_pred)
  return -2 * (y - y_pred) / self.unit_variance(y_pred)
  temp = d1 * family.deviance_derivative(y, y_pred, weights)
  devp = np.concatenate(([temp.sum()], temp @ X))
  dev = 2 * (xlogy(y, y / y_pred) - y + y_pred)
  dev = 2 * (xlogy(y, y / y_pred) - y + y_pred)
ABNORMAL_TERMINATION_IN_LNSRCH.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res)
  return np.exp(lin_pred)
  return np.exp(lin_pred)
  return np.power(y_pred, self.power)
  return -2 * (y - y_pred) / self.unit_variance(y_pred)
  return np.exp(lin_pred)
  return np.exp(lin

In [16]:
# Check 'models_dict': the scoreboard now includes the four TweedieRegressor
# rows (tdr_pow0_a1 through tdr_pow3_a1)
pd.DataFrame(models_dict)

Unnamed: 0,model_name,train_rmse,validate_rmse,difference,validate_r2
0,baseline_mean,729581.05,849736.23,120155.17,0.0
1,lr,568509.46,670163.46,101654.0,0.38
2,ll_1.0,568509.74,670204.39,101694.66,0.38
3,ll_0.1,568509.46,670167.54,101658.07,0.38
4,tdr_pow0_a1,612628.43,726863.46,114235.04,0.27
5,tdr_pow1_a1,729581.05,849736.23,120155.17,0.0
6,tdr_pow2_a1,729581.05,849736.23,120155.17,0.0
7,tdr_pow3_a1,729581.05,849736.23,120155.17,0.0


<a id='pnr'></a>
# Polynomial Regression:
<li><a href='#TableOfContents'>Table of Contents</a></li>

In [17]:
# Learn the degree-2 polynomial expansion from the training features only,
# then apply that same fitted expansion to every split
poly = PolynomialFeatures(degree=2).fit(x_train)
x_train_poly, x_validate_poly, x_test_poly = (
    poly.transform(inputs) for inputs in (x_train, x_validate, x_test)
)

In [18]:
# 'Polynomial regression': a plain LinearRegression fitted on the
# polynomial-expanded training features
pnr = LinearRegression().fit(x_train_poly, y_train.value)

# Store the predictions beside the actual values for both splits
for targets, inputs in ((y_train, x_train_poly), (y_validate, x_validate_poly)):
    targets['poly'] = pnr.predict(inputs)

# RMSE is the square root of the mean squared error
pnr_rmse_train = mean_squared_error(y_train.value, y_train['poly']) ** .5
pnr_rmse_validate = mean_squared_error(y_validate.value, y_validate['poly']) ** .5

# Add the 'poly' row to the scoreboard
pnr_scores = {
    'model_name': 'poly',
    'train_rmse': round(pnr_rmse_train, 2),
    'validate_rmse': round(pnr_rmse_validate, 2),
    'difference': round(pnr_rmse_validate - pnr_rmse_train, 2),
    'validate_r2': round(explained_variance_score(y_validate.value,
                                                  y_validate['poly']), 2),
}
for metric, score in pnr_scores.items():
    models_dict[metric].append(score)

In [19]:
# Check 'models_dict': the scoreboard now includes the polynomial regression row
pd.DataFrame(models_dict)

Unnamed: 0,model_name,train_rmse,validate_rmse,difference,validate_r2
0,baseline_mean,729581.05,849736.23,120155.17,0.0
1,lr,568509.46,670163.46,101654.0,0.38
2,ll_1.0,568509.74,670204.39,101694.66,0.38
3,ll_0.1,568509.46,670167.54,101658.07,0.38
4,tdr_pow0_a1,612628.43,726863.46,114235.04,0.27
5,tdr_pow1_a1,729581.05,849736.23,120155.17,0.0
6,tdr_pow2_a1,729581.05,849736.23,120155.17,0.0
7,tdr_pow3_a1,729581.05,849736.23,120155.17,0.0
8,poly,543414.52,572507.99,29093.47,0.55


<a id='top'></a>
# Top Model:
<li><a href='#TableOfContents'>Table of Contents</a></li>

##### Use poly on test: it has the lowest validate RMSE and the highest validate_r2 of the models above

In [20]:
# Predict on the polynomial-expanded test features with the fitted model
y_test['poly'] = pnr.predict(x_test_poly)

# Explained variance on test, rounded to two decimals (reported as r2)
test_evs = explained_variance_score(y_test.value, y_test.poly)
test_r2 = round(test_evs, 2)
test_r2


0.49

In [21]:
# Test RMSE of the polynomial model: square root of MSE, rounded to two decimals
test_mse = mean_squared_error(y_test.value, y_test.poly)
test_rmse = round(test_mse ** .5, 2)
test_rmse

491313.79

In [22]:
# Attach the mean baseline (computed from train earlier) to the test targets
# so the chosen model can be compared against it on test
y_test['baseline'] = mean_baseline

In [23]:
# Test RMSE of the mean baseline: square root of MSE, rounded to two decimals
base_mse = mean_squared_error(y_test.value, y_test.baseline)
base_rmse = round(base_mse ** .5, 2)
base_rmse

685403.82

In [24]:
# Explained variance of the mean baseline on test, rounded to two decimals
# (reported as r2)
base_evs = explained_variance_score(y_test.value, y_test.baseline)
base_r2 = round(base_evs, 2)
base_r2

0.0

In [25]:
# Final test-set comparison: baseline first, chosen model second
test_dict = dict(
    model=['baseline', 'poly'],
    type=['baseline(mean)', 'Polynomial Regression'],
    rmse=[base_rmse, test_rmse],
    r2=[base_r2, test_r2],
)

In [26]:
# Render the test-set comparison of the baseline and the polynomial model
pd.DataFrame(test_dict)

Unnamed: 0,model,type,rmse,r2
0,baseline,baseline(mean),685403.82,0.0
1,poly,Polynomial Regression,491313.79,0.49


<a id='extra'></a>
# Extra:
<li><a href='#TableOfContents'>Table of Contents</a></li>