In [1]:
# ignore warnings
import warnings
warnings.filterwarnings("ignore")

from pathlib import Path

from env import host, user, password

import wrangle as w

# Wrangling
import pandas as pd
import numpy as np

# Exploring
import scipy.stats as stats
from scipy.stats import pearsonr
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error, r2_score, explained_variance_score 
from math import sqrt

# Visualizing
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns

# Modeling
from sklearn.cluster import KMeans
from sklearn.preprocessing import PolynomialFeatures

In [2]:
df=w.wrangle_zillow()

In [3]:
train, validate, test = w.split_data(df)

In [4]:
train, validate, test

(       parcelid  logerror  bathrooms  bedrooms  sq_feet    fips    latitude  \
 40409  14357041 -0.001358        3.5       3.0   2820.0  6059.0  33701815.0   
 23172  11185767  0.044035        4.0       4.0   2764.0  6037.0  34610159.0   
 49707  11471750  0.026075        3.0       4.0   2044.0  6037.0  33924794.0   
 36587  14234304 -0.051115        3.0       4.0   1622.0  6059.0  33795264.0   
 6405   12992951  0.110311        2.0       3.0   1050.0  6037.0  34048144.0   
 ...         ...       ...        ...       ...      ...     ...         ...   
 19982  17160255  0.007457        1.0       3.0   1000.0  6111.0  34205691.0   
 1766   11663538  0.039340        3.0       4.0   2570.0  6037.0  34043592.0   
 5102   12185674  1.191672        2.0       3.0   1292.0  6037.0  33988365.0   
 34399  10731267  0.005839        2.0       3.0   2291.0  6037.0  34181498.0   
 26237  14183578 -0.007056        2.0       4.0   1448.0  6059.0  33883759.0   
 
          longitude  lotsizesquarefeet

In [5]:
train_scaled, validate_scaled, test_scaled=w.scale_data(train, validate, test)

In [6]:
train_scaled.head

<bound method NDFrame.head of        parcelid  logerror  bathrooms  bedrooms   sq_feet    fips  latitude  \
40409  14357041  0.414580   0.416667  0.333333  0.275165  6059.0  0.245398   
23172  11185767  0.422406   0.500000  0.500000  0.269389  6037.0  0.863565   
49707  11471750  0.419310   0.333333  0.500000  0.195132  6037.0  0.397145   
36587  14234304  0.406003   0.333333  0.500000  0.151609  6059.0  0.308994   
6405   12992951  0.433831   0.166667  0.333333  0.092616  6037.0  0.481090   
...         ...       ...        ...       ...       ...     ...       ...   
19982  17160255  0.416100   0.000000  0.333333  0.087459  6111.0  0.588307   
1766   11663538  0.421596   0.333333  0.500000  0.249381  6037.0  0.477992   
5102   12185674  0.620247   0.166667  0.333333  0.117574  6037.0  0.440408   
34399  10731267  0.415821   0.166667  0.333333  0.220606  6037.0  0.571843   
26237  14183578  0.413598   0.166667  0.500000  0.133663  6059.0  0.369219   

       longitude  lotsizesquarefe

In [7]:
train_scaled.head()

Unnamed: 0,parcelid,logerror,bathrooms,bedrooms,sq_feet,fips,latitude,longitude,lotsizesquarefeet,regionidcity,...,taxrate,acres,acres_bin,sqft_bin,structure_dollar_per_sqft,structure_dollar_sqft_bin,land_dollar_per_sqft,lot_dollar_sqft_bin,bath_bed_ratio,cola
40409,14357041,0.41458,0.416667,0.333333,0.275165,6059.0,0.245398,0.923479,0.006859,52650.0,...,1.435365,0.006859,0.1,0.6,0.116026,0.5,0.063539,0.5,0.345238,0
23172,11185767,0.422406,0.5,0.5,0.269389,6037.0,0.863565,0.673858,0.008336,40227.0,...,1.756942,0.008336,0.2,0.6,0.025899,0.1,0.005032,0.2,0.285714,0
49707,11471750,0.41931,0.333333,0.5,0.195132,6037.0,0.397145,0.552532,0.006174,55753.0,...,1.118434,0.006174,0.1,0.5,0.117293,0.5,0.084231,0.5,0.196429,0
36587,14234304,0.406003,0.333333,0.5,0.151609,6059.0,0.308994,0.865966,0.009347,33252.0,...,1.110092,0.009347,0.2,0.4,0.044535,0.2,0.024892,0.3,0.196429,0
6405,12992951,0.433831,0.166667,0.333333,0.092616,6037.0,0.48109,0.782651,0.007353,118895.0,...,1.282477,0.007353,0.1,0.2,0.061962,0.3,0.025591,0.3,0.166667,0


In [8]:
X_train_scaled=train_scaled.drop(columns=['logerror'])
y_train=train_scaled.logerror

In [9]:
X_train_scaled.head()

Unnamed: 0,parcelid,bathrooms,bedrooms,sq_feet,fips,latitude,longitude,lotsizesquarefeet,regionidcity,regionidcounty,...,taxrate,acres,acres_bin,sqft_bin,structure_dollar_per_sqft,structure_dollar_sqft_bin,land_dollar_per_sqft,lot_dollar_sqft_bin,bath_bed_ratio,cola
40409,14357041,0.416667,0.333333,0.275165,6059.0,0.245398,0.923479,0.006859,52650.0,1286.0,...,1.435365,0.006859,0.1,0.6,0.116026,0.5,0.063539,0.5,0.345238,0
23172,11185767,0.5,0.5,0.269389,6037.0,0.863565,0.673858,0.008336,40227.0,3101.0,...,1.756942,0.008336,0.2,0.6,0.025899,0.1,0.005032,0.2,0.285714,0
49707,11471750,0.333333,0.5,0.195132,6037.0,0.397145,0.552532,0.006174,55753.0,3101.0,...,1.118434,0.006174,0.1,0.5,0.117293,0.5,0.084231,0.5,0.196429,0
36587,14234304,0.333333,0.5,0.151609,6059.0,0.308994,0.865966,0.009347,33252.0,1286.0,...,1.110092,0.009347,0.2,0.4,0.044535,0.2,0.024892,0.3,0.196429,0
6405,12992951,0.166667,0.333333,0.092616,6037.0,0.48109,0.782651,0.007353,118895.0,3101.0,...,1.282477,0.007353,0.1,0.2,0.061962,0.3,0.025591,0.3,0.166667,0


In [10]:
X_validate_scaled=validate_scaled.drop(columns=['logerror'])
y_validate=validate_scaled.logerror

In [11]:
X_test_scaled=test_scaled.drop(columns=['logerror'])
y_test=test_scaled.logerror

BASELINE

In [12]:
# Create a baseline prediction by creating a function.
def baseline(y_train, y_validate, y_test):
    # We need y_train and y_validate (and test) to be dataframes to append the new columns with predicted values. 
    y_train = pd.DataFrame(y_train)
    y_validate = pd.DataFrame(y_validate)
    y_test = pd.DataFrame(y_test)

    # 1. Predict home_value_pred_mean
    logerror_pred_mean = y_train['logerror'].mean()
    y_train['logerror_pred_mean'] = logerror_pred_mean
    y_validate['logerror_pred_mean'] = logerror_pred_mean

    # 2. compute home_value_pred_median
    logerror_pred_median = y_train['logerror'].median()
    y_train['logerror_pred_median'] = logerror_pred_median
    y_validate['logerror_pred_median'] = logerror_pred_median

    # 3. RMSE of home_value_pred_mean
    rmse_train = mean_squared_error(y_train.logerror, y_train.logerror_pred_mean)**(1/2)
    rmse_validate = mean_squared_error(y_validate.logerror, y_validate.logerror_pred_mean)**(1/2)

    print("RMSE using Mean\nTrain/In-Sample: ", round(rmse_train, 2), 
      "\nValidate/Out-of-Sample: ", round(rmse_validate, 2))

    # 4. RMSE of home_value_pred_median
    rmse_train = mean_squared_error(y_train.logerror, y_train.logerror_pred_median)**(1/2)
    rmse_validate = mean_squared_error(y_validate.logerror, y_validate.logerror_pred_median)**(1/2)

    print("RMSE using Median\nTrain/In-Sample: ", round(rmse_train, 2), 
      "\nValidate/Out-of-Sample: ", round(rmse_validate, 2))
    
    return y_train, y_validate, y_test

In [13]:
y_train, y_validate, y_test

(40409    0.414580
 23172    0.422406
 49707    0.419310
 36587    0.406003
 6405     0.433831
            ...   
 19982    0.416100
 1766     0.421596
 5102     0.620247
 34399    0.415821
 26237    0.413598
 Name: logerror, Length: 28378, dtype: float64,
 31502    0.416827
 30324    0.419311
 5349     0.416318
 14766    0.412996
 37693    0.396787
            ...   
 18312    0.384634
 11011    0.414373
 28978    0.425687
 31520    0.422469
 3202     0.413373
 Name: logerror, Length: 12163, dtype: float64,
 40229    0.409560
 36506    0.411431
 29675    0.423875
 33963    0.414215
 45945    0.420033
            ...   
 5079     0.409047
 27223    0.411558
 43577    0.419821
 15757    0.409310
 6864     0.408224
 Name: logerror, Length: 10136, dtype: float64)

In [17]:
lm = LinearRegression(normalize=True)

NameError: name 'LinearRegression' is not defined

In [14]:
# Create a function for the OLS regression model object
def ols(X_train_scaled, y_train, X_validate_scaled, y_validate):
    lm = LinearRegression(normalize=True)

    # fit the model to our training data. We must specify the column in y_train, 
    # since we have converted it to a dataframe from a series! 
    lm.fit(X_train_scaled, y_train.logerror)

    # predict train
    y_train['logerror_pred_lm'] = lm.predict(X_train_scaled)

    # evaluate: rmse
    rmse_train = mean_squared_error(y_train.logerror, y_train.logerror_pred_lm)**(1/2)

    # predict validate
    y_validate['logerror_pred_lm'] = lm.predict(X_validate_scaled)

    # evaluate: rmse
    rmse_validate = mean_squared_error(y_validate.logerror, y_validate.logerror_pred_lm)**(1/2)

    print("RMSE for OLS\nTraining/In-Sample: ", rmse_train, 
      "\nValidation/Out-of-Sample: ", rmse_validate)
    

In [16]:
ols(X_train_scaled, y_train, X_validate_scaled, y_validate)

NameError: name 'LinearRegression' is not defined

In [None]:

# Create a function for Polynomial Features(selecting the degrees)
# make the polynomial features to get a new set of features
#Change the pf degree to 2 or more)

def poly_features(X_train_scaled,X_validate_scaled,X_test_scaled, n):
    pf = PolynomialFeatures(degree=n)
    # fit and transform X_train_scaled
    X_train_degree= pf.fit_transform(X_train_scaled)

    # transform X_validate_scaled & X_test_scaled
    X_validate_degree = pf.transform(X_validate_scaled)
    X_test_degree = pf.transform(X_test_scaled)
    
    return X_train_degree, X_validate_degree, X_test_degree

In [None]:
# Create a function for the Polynomial regression model object
def polydeg(X_train_degree, y_train, X_validate_degree, y_validate, X_test_degree, y_test):
    # create the model object
    lm = LinearRegression(normalize=True)

    # fit the model to our training data. We must specify the column in y_train, 
    # since we have converted it to a dataframe from a series! 
    lm.fit(X_train_degree, y_train.logerror)

    # predict train
    y_train['logerror_pred_lm'] = lm.predict(X_train_degree)

    # evaluate: rmse
    rmse_train = mean_squared_error(y_train.logerror, y_train.logerror_pred_lm)**(1/2)

    # predict validate
    y_validate['logerror_pred_lm'] = lm.predict(X_validate_degree)

    # evaluate: rmse
    rmse_validate = mean_squared_error(y_validate.logerror, y_validate.logerror_pred_lm)**(1/2)

    # predict on test
    y_test['logerror_pred_lm'] = lm.predict(X_test_degree)

    # evaluate: rmse
    rmse_test = mean_squared_error(y_test.logerror, y_test.logerror_pred_lm)**(1/2)

    print("RMSE for Poly \nTraining/In-Sample: ", rmse_train, 
      "\nValidation/Out-of-Sample: ", rmse_validate,
      "\nTesting/Out-of-Sample Performance: ", rmse_test)

In [None]:
#Create a function for the LassoLars regression model object

def lasso_lars(X_train_scaled, X_validate_scaled, X_test_scaled, y_train, y_validate, y_test)
lars = LassoLars(alpha=1.0)

# fit the model to our training data. We must specify the column in y_train, 
# since we have converted it to a dataframe from a series! 
lars.fit(X_train_scaled, y_train.logerror)

# predict train
y_train['logerror_pred_lars'] = lars.predict(X_train_scaled)

# evaluate: rmse
rmse_train = mean_squared_error(y_train.logerror, y_train.logerror_pred_lars)**(1/2)

# predict validate
y_validate['logerror_pred_lars'] = lars.predict(X_validate_scaled)

# evaluate: rmse
rmse_validate = mean_squared_error(y_validate.logerror, y_validate.logerror_pred_lars)**(1/2)

# predict test
y_test['logerror_pred_lars'] = lars.predict(X_test_scaled)

# evaluate: rmse
rmse_test = mean_squared_error(y_test.logerror, y_test.logerror_pred_lars)**(1/2)

print("RMSE for Lasso + Lars\nTraining/In-Sample: ", rmse_train, 
      "\nValidation/Out-of-Sample: ", rmse_validate,
      "\nTesting/Out-of-Sample Performance: ", rmse_test)


In [None]:
#Create a function for the Tweedie regression model object
def Tweedie(X_train_scaled, X_validate_scaled, X_test_scaled, y_train, y_validate, y_test):
    glm = TweedieRegressor(power=0, alpha=1)

    # fit the model to our training data. We must specify the column in y_train, 
    # since we have converted it to a dataframe from a series! 
    glm.fit(X_train_scaled, y_train.logerror)

    # predict train
    y_train['logerror_pred_glm'] = glm.predict(X_train_scaled)

    # evaluate: rmse
    rmse_train = mean_squared_error(y_train.logerror, y_train.logerror_pred_glm)**(1/2)

    # predict validate
    y_validate['logerror_pred_glm'] = glm.predict(X_validate_scaled)

    # evaluate: rmse
    rmse_validate = mean_squared_error(y_validate.logerror, y_validate.logerror_pred_glm)**(1/2)

    # predict test
    y_test['logerror_pred_glm'] = glm.predict(X_test_scaled)
    
    # evaluate: rmse
    rmse_test = mean_squared_error(y_test.logerror, y_test.logerror_pred_glm)**(1/2)
    
    print("RMSE for Tweedie\nTraining/In-Sample: ", rmse_train,
          "\nValidation/Out-of-Sample: ", rmse_validate,
          "\nTesting/Out-of-Sample Performance: ", rmse_test)

In [None]:
# plot to visualize actual vs predicted. 
plt.figure(figsize=(16,8))
plt.hist(y_validate.logerror, color='red', alpha=.5, label="Absolute Value Log Error")
plt.hist(y_validate.logerror_pred_lm, color='blue', alpha=.5, label="Validate")
plt.hist(y_test.logerror_pred_lm, color='green', alpha=.5, label='Test')
plt.xlabel("Log Error")
plt.ylabel("Homes")
plt.title("Comparing the Distribution of Absolute Value Log Error to Predicted w/Validate and w/Test")
plt.legend()
plt.show()