In [212]:
import pandas as pd
import numpy as np
import wrangle
import matplotlib.pyplot as plt

# modeling methods
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.linear_model import LinearRegression, LassoLars, TweedieRegressor
from sklearn.preprocessing import PolynomialFeatures
from sklearn.metrics import explained_variance_score
from sklearn.linear_model import TweedieRegressor

import acquire
import prepare
import wrangle

import warnings
warnings.filterwarnings("ignore")

In [213]:
train, validate, test, X_train, y_train, X_validate, y_validate, X_test, y_test, scaler, X_train_scaled, X_validate_scaled, X_test_scaled = wrangle.wrangle_zillow(target = "tax_value")

In [214]:
cols = ["sqft","bathrooms","bedrooms","year_built"]
X_train_scaled = X_train_scaled[cols]
X_validate_scaled = X_validate_scaled[cols]
X_test_scaled = X_test_scaled[cols]

In [215]:
X_train_scaled

Unnamed: 0,sqft,bathrooms,bedrooms,year_built
19637,-0.065057,0.821149,0.922538,1.696089
28369,-0.865429,-0.287748,-1.093892,-3.026225
10647,-0.165403,0.266700,-0.085677,0.270485
26643,-0.960996,-0.287748,-1.093892,0.092284
25170,0.052012,0.821149,-1.093892,1.517889
...,...,...,...,...
19891,1.596610,0.821149,0.922538,1.696089
34780,-0.309947,-0.287748,0.922538,-0.264117
2501,0.575240,0.266700,1.930752,-0.130467
30791,-0.535724,-1.396644,-1.093892,-2.447074


In [219]:
y_train = pd.DataFrame(y_train)
y_validate = pd.DataFrame(y_validate)
y_test = pd.DataFrame(y_test)

In [220]:
tax_value_pred_mean = y_train.tax_value.mean()
y_train['tax_value_pred_mean'] = tax_value_pred_mean
y_validate['tax_value_pred_mean'] = tax_value_pred_mean

In [221]:
tax_value_pred_median = y_train.tax_value.median()
y_train['tax_value_pred_median'] = tax_value_pred_median
y_validate['tax_value_pred_median'] = tax_value_pred_median

In [222]:
y_train

Unnamed: 0,tax_value,tax_value_pred_mean,tax_value_pred_median
19637,551322.0,450983.8829,357000.0
28369,138000.0,450983.8829,357000.0
10647,441532.0,450983.8829,357000.0
26643,284000.0,450983.8829,357000.0
25170,1352103.0,450983.8829,357000.0
...,...,...,...
19891,316871.0,450983.8829,357000.0
34780,72643.0,450983.8829,357000.0
2501,725975.0,450983.8829,357000.0
30791,300252.0,450983.8829,357000.0


In [223]:
rmse_train = mean_squared_error(y_train.tax_value, y_train.tax_value_pred_mean) ** .5
rmse_validate = mean_squared_error(y_validate.tax_value, y_validate.tax_value_pred_mean) ** (1/2)

In [224]:
print("RMSE using Mean\nTrain/In-Sample: ", round(rmse_train, 2), 
      "\nValidate/Out-of-Sample: ", round(rmse_validate, 2))

# 4. RMSE of tax_value_pred_median
rmse_train = mean_squared_error(y_train.tax_value, y_train.tax_value_pred_median) ** .5
rmse_validate = mean_squared_error(y_validate.tax_value, y_validate.tax_value_pred_median) ** .5
print("RMSE using Median\nTrain/In-Sample: ", round(rmse_train, 2), 
      "\nValidate/Out-of-Sample: ", round(rmse_validate, 2))

RMSE using Mean
Train/In-Sample:  372946.47 
Validate/Out-of-Sample:  369969.14
RMSE using Median
Train/In-Sample:  384606.35 
Validate/Out-of-Sample:  380311.79


In [225]:
def make_metric_df(y, y_pred, model_name, metric_df):
    if metric_df.size ==0:
        metric_df = pd.DataFrame(data=[
            {
                'model': model_name, 
                'RMSE_validate': mean_squared_error(
                    y,
                    y_pred) ** .5,
                'r^2_validate': explained_variance_score(
                    y,
                    y_pred)
            }])
        return metric_df
    else:
        return metric_df.append(
            {
                'model': model_name, 
                'RMSE_validate': mean_squared_error(
                    y,
                    y_pred) ** .5,
                'r^2_validate': explained_variance_score(
                    y,
                    y_pred)
            }, ignore_index=True)

In [226]:
metric_df = pd.DataFrame()
metric_df = make_metric_df(y_train.tax_value,
                           y_train.tax_value_pred_median,
                           'median_baseline',
                          metric_df)

In [227]:
metric_df = make_metric_df(y_train.tax_value,
                           y_train.tax_value_pred_mean,
                           'mean_baseline',
                          metric_df)

In [228]:
metric_df

Unnamed: 0,model,RMSE_validate,r^2_validate
0,median_baseline,384606.346892,0.0
1,mean_baseline,372946.473136,0.0


In [229]:
lm = LinearRegression(normalize=True)

In [230]:
lm.fit(X_train_scaled, y_train.tax_value)

LinearRegression(normalize=True)

In [231]:
y_train['tax_value_pred_lm'] = lm.predict(X_train_scaled)

In [232]:
y_validate

Unnamed: 0,tax_value,tax_value_pred_mean,tax_value_pred_median
21930,543000.0,450983.8829,357000.0
20472,252215.0,450983.8829,357000.0
33133,84081.0,450983.8829,357000.0
11204,192982.0,450983.8829,357000.0
19676,271967.0,450983.8829,357000.0
...,...,...,...
36673,287000.0,450983.8829,357000.0
1908,923000.0,450983.8829,357000.0
1535,205013.0,450983.8829,357000.0
31324,727000.0,450983.8829,357000.0


In [233]:
rmse_train = mean_squared_error(y_train.tax_value, y_train.tax_value_pred_lm) ** (1/2)

# predict validate
y_validate['tax_value_pred_lm'] = lm.predict(X_validate_scaled)

# evaluate: rmse
rmse_validate = mean_squared_error(y_validate.tax_value, y_validate.tax_value_pred_lm) ** (1/2)

print("RMSE for OLS using LinearRegression\nTraining/In-Sample: ", rmse_train, 
      "\nValidation/Out-of-Sample: ", rmse_validate)
r2_score_lm = r2_score(y_validate.tax_value, y_validate.tax_value_pred_lm)
print(f"R^2 value for LinearRegression is {round(r2_score_lm,4)}")

RMSE for OLS using LinearRegression
Training/In-Sample:  293749.7782815386 
Validation/Out-of-Sample:  289873.6875660112
R^2 value for LinearRegression is 0.386


In [234]:
metric_df = make_metric_df(y_train.tax_value, y_train.tax_value_pred_lm, "lm_model", metric_df)

In [235]:
metric_df

Unnamed: 0,model,RMSE_validate,r^2_validate
0,median_baseline,384606.346892,0.0
1,mean_baseline,372946.473136,0.0
2,lm_model,293749.778282,0.379614


In [236]:
# create the model object
lars = LassoLars(alpha=1)

# fit the model to our training data. We must specify the column in y_train, 
# since we have converted it to a dataframe from a series!
lars.fit(X_train_scaled, y_train.tax_value)

# predict train
y_train['tax_value_pred_lars'] = lars.predict(X_train_scaled)

# evaluate: rmse
rmse_train = mean_squared_error(y_train.tax_value, y_train.tax_value_pred_lars) ** (1/2)

# predict validate
y_validate['tax_value_pred_lars'] = lars.predict(X_validate_scaled)

# evaluate: rmse
rmse_validate = mean_squared_error(y_validate.tax_value, y_validate.tax_value_pred_lars) ** (1/2)

print("RMSE for Lasso + Lars\nTraining/In-Sample: ", rmse_train, 
      "\nValidation/Out-of-Sample: ", rmse_validate)
r2_score_lars = r2_score(y_validate.tax_value, y_validate.tax_value_pred_lars)
print(f"R^2 value for Lasso + Lars is {round(r2_score_lars,4)}")

RMSE for Lasso + Lars
Training/In-Sample:  293750.12111328973 
Validation/Out-of-Sample:  289881.15280912694
R^2 value for Lasso + Lars is 0.3859


In [237]:
metric_df = make_metric_df(y_validate.tax_value,
               y_validate.tax_value_pred_lars,
               'lasso_alpha_1',
               metric_df)

In [238]:
metric_df

Unnamed: 0,model,RMSE_validate,r^2_validate
0,median_baseline,384606.346892,0.0
1,mean_baseline,372946.473136,0.0
2,lm_model,293749.778282,0.379614
3,lasso_alpha_1,289881.152809,0.386234


In [239]:
# create the model object
glm = TweedieRegressor(power=1, alpha=0)


# fit the model to our training data. We must specify the column in y_train, 
# since we have converted it to a dataframe from a series! 
glm.fit(X_train_scaled, y_train.tax_value)

# predict train
y_train['tax_value_pred_glm'] = glm.predict(X_train_scaled)

# evaluate: rmse
rmse_train = mean_squared_error(y_train.tax_value, y_train.tax_value_pred_glm) ** (1/2)

# predict validate
y_validate['tax_value_pred_glm'] = glm.predict(X_validate_scaled)

# evaluate: rmse
rmse_validate = mean_squared_error(y_train.tax_value, y_train.tax_value_pred_glm) ** (1/2)

print("RMSE for GLM using Tweedie, power=1 & alpha=0\nTraining/In-Sample: ", rmse_train, 
      "\nValidation/Out-of-Sample: ", rmse_validate)
r2_score_glm = r2_score(y_validate.tax_value, y_validate.tax_value_pred_glm)
print(f"R^2 value for GLM using Tweedie is {round(r2_score_glm,4)}")

RMSE for GLM using Tweedie, power=1 & alpha=0
Training/In-Sample:  321011.7788211258 
Validation/Out-of-Sample:  321011.7788211258
R^2 value for GLM using Tweedie is 0.3155


In [240]:
metric_df = make_metric_df(y_validate.tax_value,
               y_validate.tax_value_pred_glm,
               'glm_poisson',
               metric_df)

In [241]:
metric_df

Unnamed: 0,model,RMSE_validate,r^2_validate
0,median_baseline,384606.346892,0.0
1,mean_baseline,372946.473136,0.0
2,lm_model,293749.778282,0.379614
3,lasso_alpha_1,289881.152809,0.386234
4,glm_poisson,306057.794919,0.315704


In [242]:
metric_df

Unnamed: 0,model,RMSE_validate,r^2_validate
0,median_baseline,384606.346892,0.0
1,mean_baseline,372946.473136,0.0
2,lm_model,293749.778282,0.379614
3,lasso_alpha_1,289881.152809,0.386234
4,glm_poisson,306057.794919,0.315704


In [243]:
# predict on test
y_test['tax_value_pred_lm2'] = lars.predict(X_test_scaled)

# evaluate: rmse
rmse_test = mean_squared_error(y_test.tax_value, y_test.tax_value_pred_lm2) ** (1/2)

print("RMSE for OLS Model using Lasso\nOut-of-Sample Performance: ", rmse_test)
r2_score_lars = r2_score(y_validate.tax_value, y_validate.tax_value_pred_lars)
print(f"R^2 value for Lasso is {round(r2_score_lars,4)}")

RMSE for OLS Model using Lasso
Out-of-Sample Performance:  294683.58454974514
R^2 value for Lasso is 0.3859
