In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import math
import matplotlib.pyplot as plt

import mirz_wrangle as w
import explore_module as e

from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectKBest, RFE, f_regression, SequentialFeatureSelector
from sklearn.linear_model import LinearRegression, LassoLars
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import MinMaxScaler, StandardScaler, RobustScaler, QuantileTransformer

import warnings
warnings.filterwarnings("ignore")

In [2]:
df = w.wrangle_zillow()

In [3]:
train, validate, test = w.split_data(df)

In [4]:
x_train = train[['bedrooms', 'bathrooms', 'sq_feet', 'year_built', 'tax_amount', 'fips']]
y_train = train[['tax_value']]

x_validate = validate[['bedrooms', 'bathrooms', 'sq_feet', 'year_built', 'tax_amount', 'fips']]
y_validate = validate[['tax_value']]

x_test = test[['bedrooms', 'bathrooms', 'sq_feet', 'year_built', 'tax_amount', 'fips']]
y_test = test[['tax_value']]

# Baseline

> Compute mean of target var

In [5]:
y_train['tax_value_pred_mean'] = y_train['tax_value'].mean()
y_validate['tax_value_pred_mean'] = y_validate['tax_value'].mean()

In [6]:
y_train.head()

Unnamed: 0,tax_value,tax_value_pred_mean
1113544,825572,398512.769647
1301161,313005,398512.769647
1609877,459841,398512.769647
1431804,250021,398512.769647
138504,282000,398512.769647


> Compute median

In [7]:
y_train['tax_value_pred_median'] = y_train['tax_value'].median()
y_validate['tax_value_pred_median'] = y_validate['tax_value'].median()

In [8]:
y_train.head()

Unnamed: 0,tax_value,tax_value_pred_mean,tax_value_pred_median
1113544,825572,398512.769647,322085.0
1301161,313005,398512.769647,322085.0
1609877,459841,398512.769647,322085.0
1431804,250021,398512.769647,322085.0
138504,282000,398512.769647,322085.0


> Compute RMSE comparing actual tax_value to tax_value_pred_mean

In [9]:
rmse_train = mean_squared_error(y_train.tax_value, y_train.tax_value_pred_mean)**(1/2)
rmse_validate = mean_squared_error(y_validate.tax_value, y_validate.tax_value_pred_mean)**(1/2)

print("RMSE using Mean\nTrain/In-Sample: ", round(rmse_train, 2), 
      "\nValidate/Out-of-Sample: ", round(rmse_validate, 2))


RMSE using Mean
Train/In-Sample:  313466.41 
Validate/Out-of-Sample:  312815.72


> Compute RMSE comparing actual tax_value to tax_value_pred_median

In [10]:
rmse_train = mean_squared_error(y_train.tax_value, y_train.tax_value_pred_median)**(1/2)
rmse_validate = mean_squared_error(y_validate.tax_value, y_validate.tax_value_pred_median)**(1/2)

print("RMSE using Median\nTrain/In-Sample: ", round(rmse_train, 2), 
      "\nValidate/Out-of-Sample: ", round(rmse_validate, 2))

RMSE using Median
Train/In-Sample:  322649.03 
Validate/Out-of-Sample:  321978.0


In [11]:
#scaler = MinMaxScaler()
#y_train_scaled = y_train.copy()
#y_train_scaled[['tax_value', 'tax_value_pred_mean', 'tax_value_pred_median']] = scaler.fit_transform(y_train[['tax_value','tax_value_pred_mean','tax_value_pred_median']])

In [12]:
#plt.hist(y_train_scaled.tax_value, color='blue', alpha=.5, label="Actual tax_value")
#plt.hist(y_train_scaled.tax_value_pred_mean, bins=1, color='red', alpha=.5, rwidth=100, label="Predicted tax_value - Mean")
#plt.hist(y_train_scaled.tax_value_pred_median, bins=1, color='orange', alpha=.5, rwidth=100, label="Predicted tax_value - Median")
#plt.xlabel("tax_value")
#plt.ylabel("Number of Properties")
#plt.legend()
#plt.show()

# Lars

In [13]:
lars = LassoLars(alpha=1.0)
scaler = MinMaxScaler()
x_train_scaled = x_train.copy()
x_train_scaled[['bedrooms','bathrooms','sq_feet','year_built','tax_amount','fips']] = scaler.fit_transform(x_train_scaled)
x_validate_scaled = x_validate.copy()
x_validate_scaled[['bedrooms','bathrooms','sq_feet','year_built','tax_amount','fips']] = scaler.fit_transform(x_validate_scaled)

lars.fit(x_train_scaled, y_train.tax_value)

y_train['tax_value_pred_lars'] = lars.predict(x_train_scaled)

# evaluate: rmse
rmse_train = mean_squared_error(y_train.tax_value, y_train.tax_value_pred_lars)**(1/2)

# predict validate
y_validate['tax_value_pred_lars'] = lars.predict(x_validate_scaled)

# evaluate: rmse
rmse_validate = mean_squared_error(y_validate.tax_value, y_validate.tax_value_pred_lars)**(1/2)

print("RMSE for Lasso + Lars\nTraining/In-Sample: ", rmse_train, 
      "\nValidation/Out-of-Sample: ", rmse_validate)

RMSE for Lasso + Lars
Training/In-Sample:  65641.72190641008 
Validation/Out-of-Sample:  263442.5593729214


In [14]:
#x_train_scaled.head()

In [15]:
x_train

Unnamed: 0,bedrooms,bathrooms,sq_feet,year_built,tax_amount,fips
1113544,2,2.0,1906,1951,9996.51,6037
1301161,4,2.0,1481,1968,4461.72,6037
1609877,4,3.0,2137,1987,5703.07,6037
1431804,3,1.0,1016,1957,3214.53,6037
138504,3,3.0,1920,1988,3222.18,6037
...,...,...,...,...,...,...
1309513,3,3.0,2438,2005,3077.97,6037
133345,3,2.5,1877,1966,2507.74,6059
550414,4,2.0,1656,1964,4723.19,6037
967029,3,2.0,3789,1941,20154.41,6037


In [16]:
pd.DataFrame(x_train_scaled)

Unnamed: 0,bedrooms,bathrooms,sq_feet,year_built,tax_amount,fips
1113544,0.333333,0.333333,0.194150,0.697674,0.099467,0.000000
1301161,0.666667,0.333333,0.150836,0.776744,0.044321,0.000000
1609877,0.666667,0.500000,0.217693,0.865116,0.056690,0.000000
1431804,0.500000,0.166667,0.103445,0.725581,0.031895,0.000000
138504,0.500000,0.500000,0.195577,0.869767,0.031971,0.000000
...,...,...,...,...,...,...
1309513,0.500000,0.500000,0.248369,0.948837,0.030534,0.000000
133345,0.500000,0.416667,0.191194,0.767442,0.024853,0.297297
550414,0.666667,0.333333,0.168671,0.758140,0.046927,0.000000
967029,0.500000,0.333333,0.386058,0.651163,0.200676,0.000000


In [17]:
feature_selection.select_kbest(x_train_scaled, y_train, 3)

NameError: name 'feature_selection' is not defined

In [18]:
def selectkbest(pred_vars, target_var, k_features):
        kbest = SelectKBest(f_regression, k=k_features)
        _ = kbest.fit(pred_vars, target_var)
        kbest_results = pd.DataFrame(dict(p=kbest.pvalues_,f=kbest.scores_),
                            index=pred_vars.columns)
        pred_vars_transformed = pd.DataFrame(
        kbest.transform(pred_vars),
        columns=pred_vars.columns[kbest.get_support()],
        index=pred_vars.index)
        return pred_vars_transformed

In [19]:
selectkbest(x_train_scaled, y_train.tax_value, 3)

Unnamed: 0,bathrooms,sq_feet,tax_amount
1113544,0.333333,0.194150,0.099467
1301161,0.333333,0.150836,0.044321
1609877,0.500000,0.217693,0.056690
1431804,0.166667,0.103445,0.031895
138504,0.500000,0.195577,0.031971
...,...,...,...
1309513,0.500000,0.248369,0.030534
133345,0.416667,0.191194,0.024853
550414,0.333333,0.168671,0.046927
967029,0.333333,0.386058,0.200676


In [20]:
x_train_scaled

Unnamed: 0,bedrooms,bathrooms,sq_feet,year_built,tax_amount,fips
1113544,0.333333,0.333333,0.194150,0.697674,0.099467,0.000000
1301161,0.666667,0.333333,0.150836,0.776744,0.044321,0.000000
1609877,0.666667,0.500000,0.217693,0.865116,0.056690,0.000000
1431804,0.500000,0.166667,0.103445,0.725581,0.031895,0.000000
138504,0.500000,0.500000,0.195577,0.869767,0.031971,0.000000
...,...,...,...,...,...,...
1309513,0.500000,0.500000,0.248369,0.948837,0.030534,0.000000
133345,0.500000,0.416667,0.191194,0.767442,0.024853,0.297297
550414,0.666667,0.333333,0.168671,0.758140,0.046927,0.000000
967029,0.500000,0.333333,0.386058,0.651163,0.200676,0.000000
