<a id="TableOfContents"></a>
# TABLE OF CONTENTS:
<li><a href='#imports'>Imports</a></li>
<li><a href="#Q1">Question 1</a></li>
<li><a href='#Q2'>Question 2</a></li>
<li><a href='#Q3'>Question 3</a></li>

<a id='imports'></a>
# IMPORTS:
<li><a href='#TableOfContents'>Table of Contents</a></li>

In [1]:
# Vectorization and tables
import numpy as np
import pandas as pd

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Datasets
from pydataset import data

# sklearn
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LassoLars
from sklearn.linear_model import TweedieRegressor
from sklearn.preprocessing import PolynomialFeatures
from sklearn.metrics import explained_variance_score
from sklearn.metrics import mean_squared_error

# .py files
import wrangle

<a id='Q1'></a>
# Question 1:
<li><a href='#TableOfContents'>Table of Contents</a></li>

### 1. Select a dataset with a continuous target variable.

In [2]:
# Obtain zillow df, split, then reduce sample size
train, validate, test = wrangle.wrangle_zillow()
train_sample, validate_sample, test_sample = wrangle.sample_dataframe(train, validate, test)

train.shape:(1069116, 12)
validate.shape:(458193, 12)
test.shape:(381828, 12)


<a id='Q2'></a>
# Question 2:
<li><a href='#TableOfContents'>Table of Contents</a></li>

### 2. Be sure your data is prepared (no missing values, numeric datatypes) and split into samples.

In [3]:
# Verify smaller sample size
train_sample.shape, validate_sample.shape, test_sample.shape

((1000, 12), (428, 12), (357, 12))

<a id='Q3'></a>
# Question 3:
<li><a href='#TableOfContents'>Table of Contents</a></li>
<li><a href='#initial'>Initial</a></li>
<li><a href='#baseline'>Baseline</a></li>
<li><a href='#ols'>LinearRegression(OLS)</a></li>
<li><a href='#ll'>LassoLars</a></li>
<li><a href='#tdr'>TweedieRegressor</a></li>
<li><a href='#pnr'>Polynomial Regression</a></li>

### 3. Work through all of the steps outlined in the lesson, from setting the baseline to selected a model and evaluating the final model on your test data.

<a id='initial'></a>
##### Initial
<li><a href='#Q3'>Question 3 Top</a></li>

In [4]:
# Define x_col, y_col for train, validate, test
x_train = train_sample.drop(columns='assessedvalue')
y_train = pd.DataFrame(train_sample.assessedvalue)
x_validate = validate_sample.drop(columns='assessedvalue')
y_validate = pd.DataFrame(validate_sample.assessedvalue)
x_test = test_sample.drop(columns='assessedvalue')
y_test = pd.DataFrame(test_sample.assessedvalue)

<a id='baseline'></a>
##### Baseline
<li><a href='#Q3'>Question 3 Top</a></li>

In [5]:
# Create mean and median baseline values
baseline_mean = y_train.assessedvalue.mean()
y_train['baseline_mean'] = baseline_mean
y_validate['baseline_mean'] = baseline_mean
baseline_median = y_train.assessedvalue.median()
y_train['baseline_median'] = baseline_median
y_validate['baseline_median'] = baseline_median

In [6]:
# Compare mean/median RMSE scores
rmse_train_mean = mean_squared_error(y_train.assessedvalue,
                                    y_train.baseline_mean) ** .5
rmse_validate_mean = mean_squared_error(y_validate.assessedvalue,
                                    y_validate.baseline_mean) ** .5
rmse_train_median = mean_squared_error(y_train.assessedvalue,
                                    y_train.baseline_median) ** .5
rmse_validate_median = mean_squared_error(y_validate.assessedvalue,
                                    y_validate.baseline_median) ** .5

print('\033[35m ========== MEAN RMSE SCORES ==========\033[0m')
print(f'\033[32mTrain:\033[0m {rmse_train_mean}\n\033[32mValidate:\033[0m {rmse_validate_mean}\n\033[32mDifference:\033[0m {rmse_validate_mean - rmse_train_mean}')
print('\n\n\033[35m ========== MEDIAN RMSE SCORES ==========\033[0m')
print(f'\033[32mTrain:\033[0m {rmse_train_median}\n\033[32mValidate:\033[0m {rmse_validate_median}\n\033[32mDifference:\033[0m {rmse_validate_median - rmse_train_median}')

[32mTrain:[0m 213250.76778379083
[32mValidate:[0m 211455.99499646705
[32mDifference:[0m -1794.7727873237745


[32mTrain:[0m 215723.32074742
[32mValidate:[0m 213096.79189185478
[32mDifference:[0m -2626.528855565208


### Mean is the better baseline

In [7]:
# Create a dictionary of model scores
models_dict['baseline'] = {
    'model_name' : 'baseline_mean',
    'model_type' : 'baseline',
    'train_rmse' : round(rmse_train_mean, 2),
    'validate_rmse' : round(rmse_validate_mean, 2),
    'difference' : round(rmse_validate_mean - rmse_train_mean, 2)
}

In [8]:
# Verify compatability with pandas
pd.DataFrame([models_dict])

Unnamed: 0,model_name,model_type,train_rmse,validate_rmse,difference
0,baseline_mean,baseline,213250.77,211455.99,-1794.77


<a id='ols'></a>
##### LinearRegression(OLS)
<li><a href='#Q3'>Question 3 Top</a></li>

In [10]:
# Create train/validate predictions with lr modeling
lr = LinearRegression()
lr.fit(x_train, y_train.assessedvalue)
y_train['lr_pred'] = lr.predict(x_train)
y_validate['lr_pred'] = lr.predict(x_validate)

In [13]:
# Get RMSE vals for train/validate
# Append to 'models_dict'
lr_rmse_train = mean_squared_error(y_train.assessedvalue,
                                   y_train.lr_pred) ** .5
lr_rmse_validate = mean_squared_error(y_validate.assessedvalue,
                                      y_validate.lr_pred) ** .5
models_dict['lr'] = {
    'model_name' : 'lr',
    'model_type' : 'LinearRegression',
    'train_rmse' : round(lr_rmse_train, 2),
    'validate_rmse' : round(lr_rmse_validate, 2),
    'difference' : round(lr_rmse_validate - lr_rmse_train, 2)
}

In [14]:
models_dict

{'model_name': 'baseline_mean',
 'model_type': 'baseline',
 'train_rmse': 213250.77,
 'validate_rmse': 211455.99,
 'difference': -1794.77,
 'lr': {'model_name': 'lr',
  'model_type': 'LinearRegression',
  'train_rmse': 42531.01,
  'validate_rmse': 37582.44,
  'difference': -4948.56}}

In [None]:
# Get RMSE values of lr modeling
lr_RMSE = 

<a id='ll'></a>
##### LassoLars
<li><a href='#Q3'>Question 3 Top</a></li>

<a id='tdr'></a>
##### TweedieRegressor
<li><a href='#Q3'>Question 3 Top</a></li>

<a id='pnr'></a>
##### Polynomial Regression
<li><a href='#Q3'>Question 3 Top</a></li>