<a id="TableOfContents"></a>
# TABLE OF CONTENTS:
<li><a href='#imports'>Imports</a></li>
<li><a href="#Q1">Question 1</a></li>
<li><a href='#Q2'>Question 2</a></li>
<li><a href='#Q3'>Question 3</a></li>

<a id='imports'></a>
# IMPORTS:
<li><a href='#TableOfContents'>Table of Contents</a></li>

In [1]:
# Vectorization and tables
import numpy as np
import pandas as pd

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Datasets
from pydataset import data

# sklearn
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LassoLars
from sklearn.linear_model import TweedieRegressor
from sklearn.preprocessing import PolynomialFeatures
from sklearn.metrics import explained_variance_score
from sklearn.metrics import mean_squared_error

# .py files
import wrangle

<a id='Q1'></a>
# Question 1:
<li><a href='#TableOfContents'>Table of Contents</a></li>

### 1. Select a dataset with a continuous target variable.

In [2]:
# Obtain zillow df, split, then reduce sample size
train, validate, test = wrangle.wrangle_zillow()
train_sample, validate_sample, test_sample = wrangle.sample_dataframe(train, validate, test)

train.shape:(1069116, 12)
validate.shape:(458193, 12)
test.shape:(381828, 12)


<a id='Q2'></a>
# Question 2:
<li><a href='#TableOfContents'>Table of Contents</a></li>

### 2. Be sure your data is prepared (no missing values, numeric datatypes) and split into samples.

In [3]:
# Verify smaller sample size
train_sample.shape, validate_sample.shape, test_sample.shape

((1000, 12), (428, 12), (357, 12))

<a id='Q3'></a>
# Question 3:
<li><a href='#TableOfContents'>Table of Contents</a></li>
<li><a href='#initial'>Initial</a></li>
<li><a href='#baseline'>Baseline</a></li>
<li><a href='#ols'>LinearRegression(OLS)</a></li>
<li><a href='#ll'>LassoLars</a></li>
<li><a href='#tdr'>TweedieRegressor</a></li>
<li><a href='#pnr'>Polynomial Regression</a></li>
<li><a href='#best'>Best Model Testing</a></li>

### 3. Work through all of the steps outlined in the lesson, from setting the baseline to selected a model and evaluating the final model on your test data.

<a id='initial'></a>
##### Initial
<li><a href='#Q3'>Question 3 Top</a></li>

In [4]:
cols = [
    'bedroomcnt',
    'bathroomcnt',
    'sqrft'
]

In [5]:
# Scale dataframes
train_sample_scaled, validate_sample_scaled, test_sample_scaled = wrangle.scale(train_sample, validate_sample, test_sample, cols)

TypeError: scale() missing 1 required positional argument: 'scaler'

In [None]:
# Define x_col, y_col for train, validate, test
x_train = train_sample_scaled.drop(columns=['assessedvalue', 'taxamount'])
y_train = pd.DataFrame(train_sample_scaled.assessedvalue)
x_validate = validate_sample_scaled.drop(columns=['assessedvalue', 'taxamount'])
y_validate = pd.DataFrame(validate_sample_scaled.assessedvalue)
x_test = test_sample_scaled.drop(columns=['assessedvalue', 'taxamount'])
y_test = pd.DataFrame(test_sample_scaled.assessedvalue)

<a id='baseline'></a>
##### Baseline
<li><a href='#Q3'>Question 3 Top</a></li>

In [None]:
# Create mean and median baseline values
baseline_mean = y_train.assessedvalue.mean()
y_train['baseline_mean'] = baseline_mean
y_validate['baseline_mean'] = baseline_mean
baseline_median = y_train.assessedvalue.median()
y_train['baseline_median'] = baseline_median
y_validate['baseline_median'] = baseline_median

In [None]:
# Compare mean/median RMSE scores
rmse_train_mean = mean_squared_error(y_train.assessedvalue,
                                    y_train.baseline_mean) ** .5
rmse_validate_mean = mean_squared_error(y_validate.assessedvalue,
                                    y_validate.baseline_mean) ** .5
rmse_train_median = mean_squared_error(y_train.assessedvalue,
                                    y_train.baseline_median) ** .5
rmse_validate_median = mean_squared_error(y_validate.assessedvalue,
                                    y_validate.baseline_median) ** .5

print('\033[35m ========== MEAN RMSE SCORES ==========\033[0m')
print(f'\033[32mTrain:\033[0m {rmse_train_mean}\n\033[32mValidate:\033[0m {rmse_validate_mean}\n\033[32mDifference:\033[0m {rmse_validate_mean - rmse_train_mean}')
print('\n\n\033[35m ========== MEDIAN RMSE SCORES ==========\033[0m')
print(f'\033[32mTrain:\033[0m {rmse_train_median}\n\033[32mValidate:\033[0m {rmse_validate_median}\n\033[32mDifference:\033[0m {rmse_validate_median - rmse_train_median}')

### Mean is the better baseline

In [None]:
# Create a dictionary of model scores
models_dict = {
    'model_name' : ['baseline_mean'],
    'model_type' : ['baseline'],
    'train_rmse' : [round(rmse_train_mean, 2)],
    'validate_rmse' : [round(rmse_validate_mean, 2)],
    'difference' : [round(rmse_validate_mean - rmse_train_mean, 2)],
    'validate_r2' : [round(explained_variance_score(y_validate.assessedvalue,
                                             y_validate.baseline_mean), 2)]
}

In [None]:
# Verify compatability with pandas
pd.DataFrame([models_dict])

<a id='ols'></a>
##### LinearRegression(OLS)
<li><a href='#Q3'>Question 3 Top</a></li>

In [None]:
# Create train/validate predictions with lr modeling
lr = LinearRegression()
lr.fit(x_train, y_train.assessedvalue)
y_train['lr_pred'] = lr.predict(x_train)
y_validate['lr_pred'] = lr.predict(x_validate)

In [None]:
# Get RMSE vals for train/validate
# Append to 'models_dict'
lr_rmse_train = mean_squared_error(y_train.assessedvalue,
                                   y_train.lr_pred) ** .5
lr_rmse_validate = mean_squared_error(y_validate.assessedvalue,
                                      y_validate.lr_pred) ** .5
models_dict['model_name'].append('lr')
models_dict['model_type'].append('LinearRegression')
models_dict['train_rmse'].append(round(lr_rmse_train, 2))
models_dict['validate_rmse'].append(round(lr_rmse_validate, 2))
models_dict['difference'].append(round(lr_rmse_validate - lr_rmse_train, 2))
models_dict['validate_r2'].append(round(explained_variance_score(y_validate.assessedvalue,
                                                                 y_validate.lr_pred), 2))

In [None]:
pd.DataFrame(models_dict)

<a id='ll'></a>
##### LassoLars
<li><a href='#Q3'>Question 3 Top</a></li>

In [None]:
# Create train/validate predictions with LassoLars
ll1 = LassoLars(alpha=1)
ll1.fit(x_train, y_train.assessedvalue)
y_train['ll_pred_1.0'] = ll1.predict(x_train)
y_validate['ll_pred_1.0'] = ll1.predict(x_validate)

In [None]:
# Get RMSE values and append to 'models_dict'
ll_rmse_train = mean_squared_error(y_train.assessedvalue,
                                   y_train['ll_pred_1.0']) ** .5
ll_rmse_validate = mean_squared_error(y_validate.assessedvalue,
                                      y_validate['ll_pred_1.0']) ** .5
models_dict['model_name'].append('ll_1.0')
models_dict['model_type'].append('LassoLars')
models_dict['train_rmse'].append(round(ll_rmse_train, 2))
models_dict['validate_rmse'].append(round(ll_rmse_validate, 2))
models_dict['difference'].append(round(ll_rmse_validate - ll_rmse_train, 2))
models_dict['validate_r2'].append(round(explained_variance_score(y_validate.assessedvalue,
                                                                 y_validate['ll_pred_1.0']), 2))

In [None]:
# Create a ll pred with alpha = 0.1
ll01 = LassoLars(alpha=0.1)
ll01.fit(x_train, y_train.assessedvalue)
y_train['ll_pred_0.1'] = ll01.predict(x_train)
y_validate['ll_pred_0.1'] = ll01.predict(x_validate)
ll_rmse_train01 = mean_squared_error(y_train.assessedvalue,
                                   y_train['ll_pred_0.1']) ** .5
ll_rmse_validate01 = mean_squared_error(y_validate.assessedvalue,
                                      y_validate['ll_pred_0.1']) ** .5
models_dict['model_name'].append('ll_0.1')
models_dict['model_type'].append('LassoLars')
models_dict['train_rmse'].append(round(ll_rmse_train01, 2))
models_dict['validate_rmse'].append(round(ll_rmse_validate01, 2))
models_dict['difference'].append(round(ll_rmse_validate01 - ll_rmse_train01, 2))
models_dict['validate_r2'].append(round(explained_variance_score(y_validate.assessedvalue,
                                                                 y_validate['ll_pred_0.1']), 2))

In [None]:
pd.DataFrame(models_dict)

<a id='tdr'></a>
##### TweedieRegressor
<li><a href='#Q3'>Question 3 Top</a></li>

In [None]:
# Create train/validate predictions with TweedieRegressor
tdr0 = TweedieRegressor(power=0, alpha=1)
tdr0.fit(x_train, y_train.assessedvalue)
y_train['tdr_pow0_a1'] = tdr0.predict(x_train)
y_validate['tdr_pow0_a1'] = tdr0.predict(x_validate)

In [None]:
# Get RMSE values and append to 'models_dict'
tdr_rmse_train0_1 = mean_squared_error(y_train.assessedvalue,
                                   y_train['tdr_pow0_a1']) ** .5
tdr_rmse_validate0_1 = mean_squared_error(y_validate.assessedvalue,
                                      y_validate['tdr_pow0_a1']) ** .5
models_dict['model_name'].append('tdr_pow0_a1')
models_dict['model_type'].append('TweedieRegressor')
models_dict['train_rmse'].append(round(tdr_rmse_train0_1, 2))
models_dict['validate_rmse'].append(round(tdr_rmse_validate0_1, 2))
models_dict['difference'].append(round(tdr_rmse_validate0_1 - tdr_rmse_train0_1, 2))
models_dict['validate_r2'].append(round(explained_variance_score(y_validate.assessedvalue,
                                                                 y_validate['tdr_pow0_a1']), 2))

In [None]:
# tdr power = 1
tdr1 = TweedieRegressor(power=1, alpha=1)
tdr1.fit(x_train, y_train.assessedvalue)
y_train['tdr_pow1_a1'] = tdr1.predict(x_train)
y_validate['tdr_pow1_a1'] = tdr1.predict(x_validate)
tdr_rmse_train1_1 = mean_squared_error(y_train.assessedvalue,
                                   y_train['tdr_pow1_a1']) ** .5
tdr_rmse_validate1_1 = mean_squared_error(y_validate.assessedvalue,
                                      y_validate['tdr_pow1_a1']) ** .5
models_dict['model_name'].append('tdr_pow1_a1')
models_dict['model_type'].append('TweedieRegressor')
models_dict['train_rmse'].append(round(tdr_rmse_train1_1, 2))
models_dict['validate_rmse'].append(round(tdr_rmse_validate1_1, 2))
models_dict['difference'].append(round(tdr_rmse_validate1_1 - tdr_rmse_train1_1, 2))
models_dict['validate_r2'].append(round(explained_variance_score(y_validate.assessedvalue,
                                                                 y_validate['tdr_pow1_a1']), 2))

# tdr power = 2
tdr2 = TweedieRegressor(power=2, alpha=1)
tdr2.fit(x_train, y_train.assessedvalue)
y_train['tdr_pow2_a1'] = tdr2.predict(x_train)
y_validate['tdr_pow2_a1'] = tdr2.predict(x_validate)
tdr_rmse_train2_1 = mean_squared_error(y_train.assessedvalue,
                                   y_train['tdr_pow2_a1']) ** .5
tdr_rmse_validate2_1 = mean_squared_error(y_validate.assessedvalue,
                                      y_validate['tdr_pow2_a1']) ** .5
models_dict['model_name'].append('tdr_pow2_a1')
models_dict['model_type'].append('TweedieRegressor')
models_dict['train_rmse'].append(round(tdr_rmse_train2_1, 2))
models_dict['validate_rmse'].append(round(tdr_rmse_validate2_1, 2))
models_dict['difference'].append(round(tdr_rmse_validate2_1 - tdr_rmse_train2_1, 2))
models_dict['validate_r2'].append(round(explained_variance_score(y_validate.assessedvalue,
                                                                 y_validate['tdr_pow2_a1']), 2))

# tdr power = 3
tdr3 = TweedieRegressor(power=1, alpha=1)
tdr3.fit(x_train, y_train.assessedvalue)
y_train['tdr_pow3_a1'] = tdr3.predict(x_train)
y_validate['tdr_pow3_a1'] = tdr3.predict(x_validate)
tdr_rmse_train3_1 = mean_squared_error(y_train.assessedvalue,
                                   y_train['tdr_pow3_a1']) ** .5
tdr_rmse_validate3_1 = mean_squared_error(y_validate.assessedvalue,
                                      y_validate['tdr_pow3_a1']) ** .5
models_dict['model_name'].append('tdr_pow3_a1')
models_dict['model_type'].append('TweedieRegressor')
models_dict['train_rmse'].append(round(tdr_rmse_train3_1, 2))
models_dict['validate_rmse'].append(round(tdr_rmse_validate3_1, 2))
models_dict['difference'].append(round(tdr_rmse_validate3_1 - tdr_rmse_train3_1, 2))
models_dict['validate_r2'].append(round(explained_variance_score(y_validate.assessedvalue,
                                                                 y_validate['tdr_pow3_a1']), 2))

In [None]:
pd.DataFrame(models_dict)

<a id='pnr'></a>
##### Polynomial Regression
<li><a href='#Q3'>Question 3 Top</a></li>

In [None]:
# Create polynomial features
poly = PolynomialFeatures(degree=2)
x_train_poly = poly.fit_transform(x_train)
x_validate_poly = poly.transform(x_validate)
x_test_poly = poly.transform(x_test)

In [None]:
# Create train/val preds using 'polynomial regression'
pnr = LinearRegression()
pnr.fit(x_train_poly, y_train.assessedvalue)
y_train['poly'] = pnr.predict(x_train_poly)
y_validate['poly'] = pnr.predict(x_validate_poly)

In [None]:
# Get RMSE values and append to 'models_dict'
pnr_rmse_train = mean_squared_error(y_train.assessedvalue,
                                   y_train['poly']) ** .5
pnr_rmse_validate = mean_squared_error(y_validate.assessedvalue,
                                      y_validate['poly']) ** .5
models_dict['model_name'].append('poly')
models_dict['model_type'].append('Polynomial Regression')
models_dict['train_rmse'].append(round(pnr_rmse_train, 2))
models_dict['validate_rmse'].append(round(pnr_rmse_validate, 2))
models_dict['difference'].append(round(pnr_rmse_validate - pnr_rmse_train, 2))
models_dict['validate_r2'].append(round(explained_variance_score(y_validate.assessedvalue,
                                                                 y_validate['poly']), 2))

In [None]:
pd.DataFrame(models_dict)

<a id='best'></a>
##### Best Model Testing
<li><a href='#Q3'>Question 3 Top</a></li>

'll_0.1' is best model

In [None]:
# Make predictions on test
y_test['ll_0.1'] = ll01.predict(x_test)

In [None]:
# Append to models_dict
testll01_rmse_test = mean_squared_error(y_test.assessedvalue,
                                   y_test['ll_0.1']) ** .5
models_dict['model_name'].append('TESTING_ll01')
models_dict['model_type'].append('LassoLars')
models_dict['train_rmse'].append(0)
models_dict['validate_rmse'].append(round(testll01_rmse_test, 2))
models_dict['difference'].append(0)
models_dict['validate_r2'].append(round(explained_variance_score(y_test.assessedvalue,
                                                                 y_test['ll_0.1']), 2))

In [None]:
pd.DataFrame(models_dict)