In [140]:
import pandas as pd
import numpy as np
import wrangle
import matplotlib.pyplot as plt

# modeling methods
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression, LassoLars, TweedieRegressor
from sklearn.preprocessing import PolynomialFeatures
from sklearn.metrics import explained_variance_score
from sklearn.linear_model import TweedieRegressor

import acquire
import prepare

import warnings
warnings.filterwarnings("ignore")

#### Select a dataset with a continuous target variable.

In [141]:
df = acquire.get_zillow_data()

In [142]:
df.head()

Unnamed: 0,bedroomcnt,bathroomcnt,calculatedfinishedsquarefeet,taxvaluedollarcnt,yearbuilt,taxamount,fips
0,0.0,0.0,,27516.0,,,6037.0
1,0.0,0.0,,10.0,,,6037.0
2,0.0,0.0,,10.0,,,6037.0
3,0.0,0.0,,2108.0,,174.21,6037.0
4,4.0,2.0,3633.0,296425.0,2005.0,6941.39,6037.0


#### Be sure your data is prepared (no missing values, numeric datatypes) and split into samples.

In [143]:
df = acquire.clean_zillow(df)

In [144]:
df = df.dropna()

In [145]:
df.head()

Unnamed: 0,sqft,baths,beds,tax_value,year_built,tax_amount,fips
4,3633.0,2.0,4.0,296425.0,2005.0,6941.39,6037.0
6,1620.0,4.0,3.0,847770.0,2011.0,10244.94,6037.0
7,2077.0,2.0,3.0,646760.0,1926.0,7924.68,6037.0
11,1200.0,0.0,0.0,5328.0,1972.0,91.6,6037.0
14,171.0,0.0,0.0,6920.0,1973.0,255.17,6037.0


In [146]:
train, validate, test, X_train, y_train, X_validate, y_validate, X_test, y_test = prepare.train_validate_test(df,"tax_value")

In [147]:
scaler, X_train_scaled, X_validate_scaled, X_test_scaled = prepare.Standard_Scaler(X_train,X_validate,X_test)

In [148]:
y_train = pd.DataFrame(y_train)
y_validate = pd.DataFrame(y_validate)

In [149]:
y_train.head()

Unnamed: 0,tax_value
1567873,191578.0
1218138,1345206.0
339661,356648.0
1017133,175069.0
40250,543000.0


#### Work through all of the steps outlined in the lesson, from setting the baseline to selected a model and evaluating the final model on your test data.

In [150]:
tax_value_pred_mean = y_train.tax_value.mean()
y_train['tax_value_pred_mean'] = tax_value_pred_mean
y_validate['tax_value_pred_mean'] = tax_value_pred_mean

In [151]:
tax_value_pred_median = y_train.tax_value.median()
y_train['tax_value_pred_median'] = tax_value_pred_median
y_validate['tax_value_pred_median'] = tax_value_pred_median

In [152]:
rmse_train = mean_squared_error(y_train.tax_value, y_train.tax_value_pred_mean) ** .5
rmse_validate = mean_squared_error(y_validate.tax_value, y_validate.tax_value_pred_mean) ** (1/2)

In [153]:
print("RMSE using Mean\nTrain/In-Sample: ", round(rmse_train, 2), 
      "\nValidate/Out-of-Sample: ", round(rmse_validate, 2))

# 4. RMSE of tax_value_pred_median
rmse_train = mean_squared_error(y_train.tax_value, y_train.tax_value_pred_median) ** .5
rmse_validate = mean_squared_error(y_validate.tax_value, y_validate.tax_value_pred_median) ** .5
print("RMSE using Median\nTrain/In-Sample: ", round(rmse_train, 2), 
      "\nValidate/Out-of-Sample: ", round(rmse_validate, 2))

RMSE using Mean
Train/In-Sample:  688918.84 
Validate/Out-of-Sample:  668128.53
RMSE using Median
Train/In-Sample:  701590.78 
Validate/Out-of-Sample:  680685.64


In [154]:
def make_metric_df(y, y_pred, model_name, metric_df):
    if metric_df.size ==0:
        metric_df = pd.DataFrame(data=[
            {
                'model': model_name, 
                'RMSE_validate': mean_squared_error(
                    y,
                    y_pred) ** .5,
                'r^2_validate': explained_variance_score(
                    y,
                    y_pred)
            }])
        return metric_df
    else:
        return metric_df.append(
            {
                'model': model_name, 
                'RMSE_validate': mean_squared_error(
                    y,
                    y_pred) ** .5,
                'r^2_validate': explained_variance_score(
                    y,
                    y_pred)
            }, ignore_index=True)

In [155]:
metric_df = pd.DataFrame()
metric_df = make_metric_df(y_train.tax_value,
                           y_train.tax_value_pred_median,
                           'median_baseline',
                          metric_df)

In [156]:
metric_df

Unnamed: 0,model,RMSE_validate,r^2_validate
0,median_baseline,701590.778243,0.0


In [157]:
metric_df = make_metric_df(y_train.tax_value, y_train.tax_value_pred_mean, "mean_baseline", metric_df)

In [158]:
metric_df

Unnamed: 0,model,RMSE_validate,r^2_validate
0,median_baseline,701590.778243,0.0
1,mean_baseline,688918.835407,0.0


In [159]:
lm = LinearRegression(normalize=True)

In [160]:
lm.fit(X_train_scaled, y_train.tax_value)

LinearRegression(normalize=True)

In [161]:
y_train['tax_value_pred_lm'] = lm.predict(X_train_scaled)

In [162]:
rmse_train = mean_squared_error(y_train.tax_value, y_train.tax_value_pred_lm) ** (1/2)

# predict validate
y_validate['tax_value_pred_lm'] = lm.predict(X_validate)

# evaluate: rmse
rmse_validate = mean_squared_error(y_validate.tax_value, y_validate.tax_value_pred_lm) ** (1/2)

print("RMSE for OLS using LinearRegression\nTraining/In-Sample: ", rmse_train, 
      "\nValidation/Out-of-Sample: ", rmse_validate)

RMSE for OLS using LinearRegression
Training/In-Sample:  89304.36690635968 
Validation/Out-of-Sample:  6550820002.769056


In [163]:
y_train.head()

Unnamed: 0,tax_value,tax_value_pred_mean,tax_value_pred_median,tax_value_pred_lm
1567873,191578.0,461370.07448,328628.0,197622.9
1218138,1345206.0,461370.07448,328628.0,1393201.0
339661,356648.0,461370.07448,328628.0,359511.9
1017133,175069.0,461370.07448,328628.0,198478.1
40250,543000.0,461370.07448,328628.0,532943.0


In [164]:
metric_df = make_metric_df(y_train.tax_value, y_train.tax_value_pred_lm, "lm_model", metric_df)

In [165]:
metric_df

Unnamed: 0,model,RMSE_validate,r^2_validate
0,median_baseline,701590.778243,0.0
1,mean_baseline,688918.835407,0.0
2,lm_model,89304.366906,0.983196


In [166]:
# create the model object
lars = LassoLars(alpha=1)

# fit the model to our training data. We must specify the column in y_train, 
# since we have converted it to a dataframe from a series!
lars.fit(X_train_scaled, y_train.tax_value)

# predict train
y_train['tax_value_pred_lars'] = lars.predict(X_train_scaled)

# evaluate: rmse
rmse_train = mean_squared_error(y_train.tax_value, y_train.tax_value_pred_lars) ** (1/2)

# predict validate
y_validate['tax_value_pred_lars'] = lars.predict(X_validate_scaled)

# evaluate: rmse
rmse_validate = mean_squared_error(y_validate.tax_value, y_validate.tax_value_pred_lars) ** (1/2)

print("RMSE for Lasso + Lars\nTraining/In-Sample: ", rmse_train, 
      "\nValidation/Out-of-Sample: ", rmse_validate)

RMSE for Lasso + Lars
Training/In-Sample:  89380.67787450404 
Validation/Out-of-Sample:  88492.38339160997


In [167]:
metric_df = make_metric_df(y_validate.tax_value,
               y_validate.tax_value_pred_lars,
               'lasso_alpha_1',
               metric_df)

In [168]:
metric_df

Unnamed: 0,model,RMSE_validate,r^2_validate
0,median_baseline,701590.778243,0.0
1,mean_baseline,688918.835407,0.0
2,lm_model,89304.366906,0.983196
3,lasso_alpha_1,88492.383392,0.982457


In [169]:
# create the model object
glm = TweedieRegressor(power=1, alpha=0)


# fit the model to our training data. We must specify the column in y_train, 
# since we have converted it to a dataframe from a series! 
glm.fit(X_train_scaled, y_train.tax_value)

# predict train
y_train['tax_value_pred_glm'] = glm.predict(X_train_scaled)

# evaluate: rmse
rmse_train = mean_squared_error(y_train.tax_value, y_train.tax_value_pred_glm) ** (1/2)

# predict validate
y_validate['tax_value_pred_glm'] = glm.predict(X_validate_scaled)

# evaluate: rmse
rmse_validate = mean_squared_error(y_train.tax_value, y_train.tax_value_pred_glm) ** (1/2)

print("RMSE for GLM using Tweedie, power=1 & alpha=0\nTraining/In-Sample: ", rmse_train, 
      "\nValidation/Out-of-Sample: ", rmse_validate)

RMSE for GLM using Tweedie, power=1 & alpha=0
Training/In-Sample:  3602240.4933254234 
Validation/Out-of-Sample:  3602240.4933254234


In [170]:
metric_df = make_metric_df(y_validate.tax_value,
               y_validate.tax_value_pred_glm,
               'glm_poisson',
               metric_df)

In [171]:
metric_df

Unnamed: 0,model,RMSE_validate,r^2_validate
0,median_baseline,701590.8,0.0
1,mean_baseline,688918.8,0.0
2,lm_model,89304.37,0.983196
3,lasso_alpha_1,88492.38,0.982457
4,glm_poisson,7739156.0,-133.175373


In [172]:
# make the polynomial features to get a new set of features
pf = PolynomialFeatures(degree=2)

# fit and transform X_train_scaled
X_train_degree2 = pf.fit_transform(X_train_scaled)

# transform X_validate_scaled & X_test_scaled
X_validate_degree2 = pf.transform(X_validate_scaled)
X_test_degree2 =  pf.transform(X_test_scaled)

In [173]:
# create the model object
lm2 = LinearRegression(normalize=True)

# fit the model to our training data. We must specify the column in y_train, 
# since we have converted it to a dataframe from a series! 
lm2.fit(X_train_degree2, y_train.tax_value)

# predict train
y_train['tax_value_pred_lm2'] = lm2.predict(X_train_degree2)

# evaluate: rmse
rmse_train = mean_squared_error(y_train.tax_value, y_train.tax_value_pred_lm2) ** (1/2)

# predict validate
y_validate['tax_value_pred_lm2'] = lm2.predict(X_validate_degree2)

# evaluate: rmse
rmse_validate = mean_squared_error(y_validate.tax_value, y_validate.tax_value_pred_lm2)

print("RMSE for Polynomial Model, degrees=2\nTraining/In-Sample: ", rmse_train, 
      "\nValidation/Out-of-Sample: ", rmse_validate)

RMSE for Polynomial Model, degrees=2
Training/In-Sample:  85861.92084762435 
Validation/Out-of-Sample:  7206129959.404406


In [174]:
metric_df = make_metric_df(y_validate.tax_value,
               y_validate.tax_value_pred_lm2,
               'quadratic',
               metric_df)

In [175]:
metric_df

Unnamed: 0,model,RMSE_validate,r^2_validate
0,median_baseline,701590.8,0.0
1,mean_baseline,688918.8,0.0
2,lm_model,89304.37,0.983196
3,lasso_alpha_1,88492.38,0.982457
4,glm_poisson,7739156.0,-133.175373
5,quadratic,84888.93,0.983857


In [177]:
y_test = pd.DataFrame(y_test)

# predict on test
y_test['tax_value_pred_lm'] = lm.predict(X_test_scaled)

# evaluate: rmse
rmse_test = mean_squared_error(y_test.tax_value, y_test.tax_value_pred_lm) ** (1/2)

print("RMSE for OLS Model using LinearRegression\nOut-of-Sample Performance: ", rmse_test)

RMSE for OLS Model using LinearRegression
Out-of-Sample Performance:  82460.68931675899
