In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import math
import matplotlib.pyplot as plt

import wrangle as w
import explore as e
import env

from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectKBest, RFE, f_regression, SequentialFeatureSelector
from sklearn.linear_model import LinearRegression, LassoLars, TweedieRegressor
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import MinMaxScaler, StandardScaler, RobustScaler, QuantileTransformer, PolynomialFeatures

import warnings
warnings.filterwarnings("ignore")

In [2]:
df = w.wrangle_zillow()

In [3]:
df.head()

Unnamed: 0,bedrooms,bathrooms,sq_feet,tax_value,year_built,tax_amount,fips
0,4,3.5,3100,1023282,1998,11013.72,6059
1,2,1.0,1465,464000,1967,5672.48,6111
2,3,2.0,1243,564778,1962,6488.3,6059
3,4,3.0,2376,145143,1970,1777.51,6037
4,4,3.0,2962,773303,1950,9516.26,6037


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50790 entries, 0 to 50789
Data columns (total 7 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   bedrooms    50790 non-null  int64  
 1   bathrooms   50790 non-null  float64
 2   sq_feet     50790 non-null  int64  
 3   tax_value   50790 non-null  int64  
 4   year_built  50790 non-null  int64  
 5   tax_amount  50790 non-null  float64
 6   fips        50790 non-null  int64  
dtypes: float64(2), int64(5)
memory usage: 2.7 MB


In [5]:
train, validate, test = w.split_data(df)

In [6]:
train_scaled, validate_scaled, test_scaled = w.scale_data(train, validate, test)

In [7]:
train_scaled.head(3)

Unnamed: 0,bedrooms,bathrooms,sq_feet,tax_value,year_built,tax_amount,fips
8096,0.5,0.333333,0.162985,206483,1983,0.036235,6111
45007,0.333333,0.166667,0.152955,110048,1970,0.023914,6037
43774,0.5,0.333333,0.171701,82423,1956,0.018175,6059


In [8]:
train_scaled.dtypes

bedrooms      float64
bathrooms     float64
sq_feet       float64
tax_value       int64
year_built      int64
tax_amount    float64
fips            int64
dtype: object

In [9]:
#x_cols = train_scaled[['bedrooms','bathrooms','sq_feet']]

In [10]:
#y_cols = train_scaled.tax_value

In [11]:
#x_cols.dtypes

In [12]:
#y_cols.dtypes

In [13]:
#x_cols_list = list(x_cols)

In [14]:
#x_cols[x_cols_list[0]]

In [15]:
#for i in x_cols[1:]:
#    print(i)
 #   plt.figure()
  #  plt.plot(y_cols,x_cols[i])
  #  plt.show()

In [16]:
#e.plot_categorical_and_continuous_vars(train_scaled, x_cols,y_cols)

In [17]:
#e.plot_variable_pairs(x_cols, y_cols)

In [18]:
#x_cols['bedrooms']

In [19]:
x_train_scaled = train_scaled[['bedrooms', 'bathrooms', 'sq_feet']]
y_train = train[['tax_value']]

x_validate_scaled = validate_scaled[['bedrooms', 'bathrooms', 'sq_feet']]
y_validate = validate[['tax_value']]

x_test_scaled = test_scaled[['bedrooms', 'bathrooms', 'sq_feet']]
y_test = test[['tax_value']]

# Baseline

In [20]:
def get_baseline(x_train_scaled, y_train):
    y_train['tax_value_pred_mean']= y_train['tax_value'].mean()
    y_validate['tax_value_pred_mean']= y_validate['tax_value'].mean()

    y_train['tax_value_pred_median'] = y_train['tax_value'].median()
    y_validate['tax_value_pred_median'] = y_validate['tax_value'].median()

    rmse_train = mean_squared_error(y_train.tax_value, y_train.tax_value_pred_mean)**(1/2)
    rmse_validate = mean_squared_error(y_validate.tax_value, y_validate.tax_value_pred_mean)**(1/2)

    print("RMSE using Mean\nTrain/In-Sample: ", round(rmse_train, 2), 
          "\nValidate/Out-of-Sample: ", round(rmse_validate, 2))
    
    rmse_train = mean_squared_error(y_train.tax_value, y_train.tax_value_pred_median)**(1/2)
    rmse_validate = mean_squared_error(y_validate.tax_value, y_validate.tax_value_pred_median)**(1/2)

    print("RMSE using Median\nTrain/In-Sample: ", round(rmse_train, 2), 
          "\nValidate/Out-of-Sample: ", round(rmse_validate, 2))

In [21]:
get_baseline(x_train_scaled, y_train)

RMSE using Mean
Train/In-Sample:  355371.19 
Validate/Out-of-Sample:  347152.8
RMSE using Median
Train/In-Sample:  364925.59 
Validate/Out-of-Sample:  355040.41


# LinearRegression (OLS)

In [22]:
def get_ols(x_train_scaled, y_train):
    # create the model object
    lm = LinearRegression(normalize=True)

    # fit the model to our training data. We must specify the column in y_train, 
    # since we have converted it to a dataframe from a series! 
    lm.fit(x_train_scaled, y_train.tax_value)

    # predict train
    y_train['tax_value_pred_lm'] = lm.predict(x_train_scaled)

    # evaluate: rmse
    rmse_train = mean_squared_error(y_train.tax_value, y_train.tax_value_pred_lm)**(1/2)

    # predict validate
    y_validate['tax_value_pred_lm'] = lm.predict(x_validate_scaled)

    # evaluate: rmse
    rmse_validate = mean_squared_error(y_validate.tax_value, y_validate.tax_value_pred_lm)**(1/2)

    print("RMSE for OLS using LinearRegression\nTraining/In-Sample: ", rmse_train, 
          "\nValidation/Out-of-Sample: ", rmse_validate)

In [23]:
get_ols(x_train_scaled, y_train)

RMSE for OLS using LinearRegression
Training/In-Sample:  286205.31265758 
Validation/Out-of-Sample:  281570.92470836977


# Lars

In [24]:
def get_lars(x_train_scaled, y_train):
    lars = LassoLars(alpha=1.0)
    #scaler = MinMaxScaler()
    #x_train_scaled = x_train_scaled.copy()
    #x_train_scaled[['bedrooms','bathrooms','sq_feet','year_built','tax_amount','fips']] = scaler.fit_transform(x_train_scaled)
    #x_validate_scaled = x_validate_scaled.copy()
    #x_validate_scaled[['bedrooms','bathrooms','sq_feet','year_built','tax_amount','fips']] = scaler.fit_transform(x_validate_scaled)

    lars.fit(x_train_scaled, y_train.tax_value)

    y_train['tax_value_pred_lars'] = lars.predict(x_train_scaled)

    # evaluate: rmse
    rmse_train = mean_squared_error(y_train.tax_value, y_train.tax_value_pred_lars)**(1/2)

    # predict validate
    y_validate['tax_value_pred_lars'] = lars.predict(x_validate_scaled)

    # evaluate: rmse
    rmse_validate = mean_squared_error(y_validate.tax_value, y_validate.tax_value_pred_lars)**(1/2)

    print("RMSE for Lasso + Lars\nTraining/In-Sample: ", rmse_train, 
          "\nValidation/Out-of-Sample: ", rmse_validate)

In [25]:
get_lars(x_train_scaled, y_train)

RMSE for Lasso + Lars
Training/In-Sample:  286205.6184335364 
Validation/Out-of-Sample:  281570.55340928794


# TweedieRegressor (GLM)

In [26]:
def get_glm(x_train_scaled, y_train):
    
    # create the model object
    glm = TweedieRegressor(power=1, alpha=0)

    # fit the model to our training data. We must specify the column in y_train, 
    # since we have converted it to a dataframe from a series! 
    glm.fit(x_train_scaled, y_train.tax_value)

    # predict train
    y_train['tax_value_pred_glm'] = glm.predict(x_train_scaled)

    # evaluate: rmse
    rmse_train = mean_squared_error(y_train.tax_value, y_train.tax_value_pred_glm)**(1/2)

    # predict validate
    y_validate['tax_value_pred_glm'] = glm.predict(x_validate_scaled)

    # evaluate: rmse
    rmse_validate = mean_squared_error(y_validate.tax_value, y_validate.tax_value_pred_glm)**(1/2)

    print("RMSE for GLM using Tweedie, power=1 & alpha=0\nTraining/In-Sample: ", rmse_train, 
          "\nValidation/Out-of-Sample: ", rmse_validate)

In [27]:
get_glm(x_train_scaled, y_train)

RMSE for GLM using Tweedie, power=1 & alpha=0
Training/In-Sample:  296696.93800469773 
Validation/Out-of-Sample:  288237.9091112238


# Polynomial Regression

In [28]:
def get_poly(x_train_scaled, y_train):
    # make the polynomial features to get a new set of features
    pf = PolynomialFeatures(degree=2)

    # fit and transform X_train_scaled
    x_train_scaled_degree2 = pf.fit_transform(x_train_scaled)

    # transform X_validate_scaled & X_test_scaled
    x_validate_scaled_degree2 = pf.transform(x_validate_scaled)
    x_test_scaled_degree2 = pf.transform(x_test_scaled)
    
    # create the model object
    lm2 = LinearRegression(normalize=True)

    # fit the model to our training data. We must specify the column in y_train, 
    # since we have converted it to a dataframe from a series! 
    lm2.fit(x_train_scaled_degree2, y_train.tax_value)

    # predict train
    y_train['tax_value_pred_lm2'] = lm2.predict(x_train_scaled_degree2)

    # evaluate: rmse
    rmse_train = mean_squared_error(y_train.tax_value, y_train.tax_value_pred_lm2)**(1/2)

    # predict validate
    y_validate['tax_value_pred_lm2'] = lm2.predict(x_validate_scaled_degree2)

    # evaluate: rmse
    rmse_validate = mean_squared_error(y_validate.tax_value, y_validate.tax_value_pred_lm2)**(1/2)

    print("RMSE for Polynomial Model, degrees=2\nTraining/In-Sample: ", rmse_train, 
          "\nValidation/Out-of-Sample: ", rmse_validate)
    

In [29]:
get_poly(x_train_scaled, y_train)

RMSE for Polynomial Model, degrees=2
Training/In-Sample:  285135.8897185141 
Validation/Out-of-Sample:  280525.2753433108


# Evaluation

In [32]:
x_train_scaled.head()

Unnamed: 0,bedrooms,bathrooms,sq_feet
8096,0.5,0.333333,0.162985
45007,0.333333,0.166667,0.152955
43774,0.5,0.333333,0.171701
1082,0.5,0.25,0.105791
13597,0.666667,1.0,0.587224


In [31]:
y_train.head()

Unnamed: 0,tax_value,tax_value_pred_mean,tax_value_pred_median,tax_value_pred_lm,tax_value_pred_lars,tax_value_pred_glm,tax_value_pred_lm2
8096,206483,445957.819914,363000.0,392758.1,392753.0,377474.6,387092.1
45007,110048,445957.819914,363000.0,382573.7,382330.7,347714.5,390425.5
43774,82423,445957.819914,363000.0,410366.9,410338.7,387057.6,404948.6
1082,373151,445957.819914,363000.0,247753.2,248050.5,297085.9,256523.7
13597,1430885,445957.819914,363000.0,1416496.0,1414726.0,2115100.0,1651991.0


In [None]:
SSE = mean_squared_error(y_train.tax_value, df.yhat)*len(y_train)
SSE_baseline = mean_squared_error(df.y, df.yhat_baseline)*len(df)

#print("SSE manual == SSE sklearn: ", SSE == SSE2) 
#print("SSE manual - baseline == SSE sklearn - baseline: ", SSE_baseline == SSE2_baseline) 