In [1]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats as stats

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, StandardScaler, PolynomialFeatures
from sklearn.preprocessing import QuantileTransformer, PowerTransformer

from sklearn.feature_selection import SelectKBest, RFE, f_regression, SequentialFeatureSelector

from sklearn.metrics import mean_squared_error, r2_score, explained_variance_score
from sklearn.linear_model import LinearRegression, TweedieRegressor, LassoLars
from sklearn.metrics import explained_variance_score
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.svm import SVR
from sklearn.feature_selection import SelectKBest, RFE, f_regression, SequentialFeatureSelector
from sklearn.preprocessing import QuantileTransformer

from importlib import reload

import warnings
warnings.filterwarnings("ignore")
import modeling as mo
import wrangle as wr
sns.set(rc={'figure.facecolor':'fbf3e4','axes.facecolor':'fbf3e4'})
#sns.set(font_scale=1.5)
pd.options.display.float_format = '{:,.3f}'.format

In [2]:
df = wr.get_zillow()

wr.dummies(df)
X_train, X_validate, X_test, y_train, y_validate, y_test = wr.full_split_zillow(df)

In [3]:
# create 3 data sets that keep the values of the counties
la = df[(df.Orange == 0) & (df.Ventura == 0)] # LA county
ventura = df[df.Ventura == 1] # Ventura county
orange = df[df.Orange == 1] #Orange county
orange = df[df.Orange == 1] # Orange county

In [4]:
baseline = y_train.median()
baseline

361337.0

In [5]:
predictions_train = pd.DataFrame(y_train)
predictions_validate = pd.DataFrame(y_validate)
predictions_train['baseline'] = baseline
predictions_validate['baseline'] = baseline

predictions_validate['baseline'] = baseline

In [6]:
X1, X2, X3 = wr.standard_scale_zillow(X_train, X_validate, X_test)

In [7]:
from sklearn.preprocessing import QuantileTransformer

In [8]:
def scale_zillow_quantile(train, validate, test):
    '''
    accepts train, validate, test data sets
    scales the data in each of them
    returns transformed data sets
    '''
    #count_columns = ['bedroomcnt', 'bathroomcnt']
    
    #col = train.columns[1:-1]
    col = ['bedrooms', 'bathrooms', 'sq_feet', 'lot_sqft', 'house_age']
    
    # create scalers
    #min_max_scaler = MinMaxScaler()    
    qt = QuantileTransformer(output_distribution='normal')
    qt.fit(train[col])
    train[col] = qt.transform(train[col])
    validate[col] = qt.transform(validate[col])
    test[col] = qt.transform(test[col])
    
    return train, validate, test


In [9]:
XQ1, XQ2, XQ3 = scale_zillow_quantile(X_train, X_validate, X_test)

In [10]:
seed = 1349

In [11]:
models = {
    'Linear Regression': LinearRegression(),
    'Generalized Linear Model': TweedieRegressor(power=2, alpha = 0.5),
    'Gradient Boosting Regression': GradientBoostingRegressor(random_state=seed),
    'Decision Tree Regression': DecisionTreeRegressor(max_depth=4, random_state=seed),
    'Random Forest Regression':RandomForestRegressor(max_depth=4, random_state=seed),
    'LassoLars Regression':LassoLars(alpha=0.1)
    }


In [12]:
for key in models:
    print(models[key])

LinearRegression()
TweedieRegressor(alpha=0.5, power=2)
GradientBoostingRegressor(random_state=1349)
DecisionTreeRegressor(max_depth=4, random_state=1349)
RandomForestRegressor(max_depth=4, random_state=1349)
LassoLars(alpha=0.1)


In [13]:
scores = pd.DataFrame(columns=['model_name', 'features', 'scaling',
                               'RMSE_train', 'R2_train', 'RMSE_validate', 'R2_validate', 'RMSE_difference'])

In [14]:
def select_kbest(X, y, k):
    '''
    the function accepts the X_train data set, y_train array and k-number of features to select
    runs the SelectKBest algorithm and returns the list of features to be selected for the modeling
    !KBest doesn't depend on the model
    '''
    kbest = SelectKBest(f_regression, k=k)
    kbest.fit(X, y)
    return X.columns[kbest.get_support()].tolist()

In [15]:
# select features with K-Best algorithm 
select_kbest(X_train, y_train, 3)

['bedrooms', 'bathrooms', 'sq_feet']

In [16]:
f1 = ['bedrooms', 'bathrooms', 'sq_feet']
f2 = ['bedrooms', 'bathrooms']
f3 = ['bedrooms','bathrooms','sq_feet', 'pools']
f4 = ['bathrooms','sq_feet', 'pools']
f5 = ['bedrooms','bathrooms','sq_feet','house_age','pools','Orange','Ventura']
f6 = select_kbest(X_train, y_train, 4)
f7 = X_train.columns.tolist()

# create a dictionary with features
features = {
    'f1':f1,
    'f2':f2,
    'f3':f3,
    'f4':f4,
    'f5':f5,
    'f6':f6,
    'f7':f7
}

In [17]:
def run_model(X_train, X_validate, scaling):
    
    '''
    general function to run models with X_train and X_validate that were scaled
    '''

    for f in features:
        for key in models:
            # create a model
            model = models[key]
            # fit the model
            model.fit(X_train[features[f]], y_train)
            # predictions of the train set
            y_hat_train = model.predict(X_train[features[f]])
            # predictions of the validate set
            y_hat_validate = model.predict(X_validate[features[f]])
            # add train set predictions to the data frame
            predictions_train[key] = y_hat_train
            # add validate set predictions to the data frame
            predictions_validate[key] = y_hat_validate

            # calculate scores train set
            RMSE, R2 = regression_errors(y_train, y_hat_train)
            # calculate scores validation set
            RMSE_val, R2_val = regression_errors(y_validate, y_hat_validate)
            diff = np.abs(RMSE - RMSE_val)
            
            # add the score results to the scores Data Frame
            scores.loc[len(scores.index)] = [key, f, scaling, RMSE, R2, RMSE_val, R2_val, diff]


In [18]:
def run_model_standard():
    # runs regression models on the X_train scaled with StandardScaler()
    X1, X2, _ = wr.standard_scale_zillow(X_train, X_validate, X_test)
    run_model(X1, X2, 'standard')

In [19]:
def regression_errors(y_actual, y_predicted):
    '''
    this function accepts 
    y: actual results/array
    yhat: predictions/array
    k: feature size/integer
    calculates regression scores based on the baseline being median
    returns RMSE and adjacted R2
    '''
    # root mean squared error score
    RMSE = mean_squared_error(y_actual, y_predicted) ** .5
    # adjucted R^2 score
    ADJR2 = explained_variance_score(y_actual, y_predicted)
    return round(RMSE), round(ADJR2, 2)

In [20]:
run_model_standard()

In [21]:
scores.sort_values(by='RMSE_train')

Unnamed: 0,model_name,features,scaling,RMSE_train,R2_train,RMSE_validate,R2_validate,RMSE_difference
38,Gradient Boosting Regression,f7,standard,270198,0.41,271138,0.4,940
26,Gradient Boosting Regression,f5,standard,271892,0.4,272400,0.39,508
32,Gradient Boosting Regression,f6,standard,277369,0.38,277473,0.37,104
14,Gradient Boosting Regression,f3,standard,281376,0.36,281583,0.35,207
2,Gradient Boosting Regression,f1,standard,281701,0.36,281946,0.35,245
40,Random Forest Regression,f7,standard,282416,0.36,281395,0.35,1021
28,Random Forest Regression,f5,standard,283493,0.35,282556,0.35,937
34,Random Forest Regression,f6,standard,284015,0.35,282890,0.35,1125
39,Decision Tree Regression,f7,standard,284554,0.35,283892,0.34,662
20,Gradient Boosting Regression,f4,standard,284647,0.35,284750,0.34,103


In [22]:
mo.run_model_quantile()

In [23]:
scores.sort_values(by='RMSE_train').head(20)

Unnamed: 0,model_name,features,scaling,RMSE_train,R2_train,RMSE_validate,R2_validate,RMSE_difference
38,Gradient Boosting Regression,f7,standard,270198,0.41,271138,0.4,940
26,Gradient Boosting Regression,f5,standard,271892,0.4,272400,0.39,508
32,Gradient Boosting Regression,f6,standard,277369,0.38,277473,0.37,104
14,Gradient Boosting Regression,f3,standard,281376,0.36,281583,0.35,207
2,Gradient Boosting Regression,f1,standard,281701,0.36,281946,0.35,245
40,Random Forest Regression,f7,standard,282416,0.36,281395,0.35,1021
28,Random Forest Regression,f5,standard,283493,0.35,282556,0.35,937
34,Random Forest Regression,f6,standard,284015,0.35,282890,0.35,1125
39,Decision Tree Regression,f7,standard,284554,0.35,283892,0.34,662
20,Gradient Boosting Regression,f4,standard,284647,0.35,284750,0.34,103


In [24]:
X_train.columns.tolist()[:-3]

['bedrooms', 'bathrooms', 'sq_feet', 'lot_sqft', 'house_age']

In [25]:
def rfe(X, y, k):
    '''
    The function accepts the X_train data set, y_train array and k-number of features to select
    runs the RFE algorithm and returns the list of features to be selected for the modeling
    !RFE depends on the model.
    This function uses Linear regression
    '''
    model = LinearRegression()
    rfe = RFE(model, n_features_to_select=k)
    rfe.fit(X, y)
    return X.columns[rfe.get_support()].tolist()

In [26]:
scores[scores.model_name == 'Generalized Linear Model']


Unnamed: 0,model_name,features,scaling,RMSE_train,R2_train,RMSE_validate,R2_validate,RMSE_difference
1,Generalized Linear Model,f1,standard,305342,0.25,301841,0.26,3501
7,Generalized Linear Model,f2,standard,330775,0.12,327906,0.12,2869
13,Generalized Linear Model,f3,standard,304657,0.25,301136,0.26,3521
19,Generalized Linear Model,f4,standard,304659,0.25,301139,0.26,3520
25,Generalized Linear Model,f5,standard,304053,0.26,300502,0.26,3551
31,Generalized Linear Model,f6,standard,305328,0.25,301791,0.26,3537
37,Generalized Linear Model,f7,standard,304041,0.26,300490,0.26,3551


In [27]:
regression_errors(y_train, predictions_train.baseline)

(361531, 0.0)

In [28]:
single_corr = X1.iloc[:, :-3].columns.tolist()

In [29]:
single_corr

['bedrooms', 'bathrooms', 'sq_feet', 'lot_sqft', 'house_age']

In [30]:

def select_best_model(scores):
    # select top 20 models based on the RMSE score of the train set
    top_20 = scores.sort_values(by='RMSE_train').head(20)
    # select top 5 models based on the RMSE score of the validate set
    top_5 = top_20.sort_values(by=['RMSE_validate']).head(5)
    # display top 5 models
    display(top_5)
    # select the best model with the smallest difference in the RMSE scores
    best_model = top_5.sort_values(by='RMSE_difference').head(1)
    return best_model

In [31]:
best = select_best_model(scores)

Unnamed: 0,model_name,features,scaling,RMSE_train,R2_train,RMSE_validate,R2_validate,RMSE_difference
38,Gradient Boosting Regression,f7,standard,270198,0.41,271138,0.4,940
26,Gradient Boosting Regression,f5,standard,271892,0.4,272400,0.39,508
32,Gradient Boosting Regression,f6,standard,277369,0.38,277473,0.37,104
40,Random Forest Regression,f7,standard,282416,0.36,281395,0.35,1021
14,Gradient Boosting Regression,f3,standard,281376,0.36,281583,0.35,207


In [32]:
best

Unnamed: 0,model_name,features,scaling,RMSE_train,R2_train,RMSE_validate,R2_validate,RMSE_difference
32,Gradient Boosting Regression,f6,standard,277369,0.38,277473,0.37,104


In [33]:
top_20 = scores.sort_values(by='RMSE_train').head(20)

In [34]:
top_5 = top_20.sort_values(by=['RMSE_validate']).head(5)

In [35]:
top_5

Unnamed: 0,model_name,features,scaling,RMSE_train,R2_train,RMSE_validate,R2_validate,RMSE_difference
38,Gradient Boosting Regression,f7,standard,270198,0.41,271138,0.4,940
26,Gradient Boosting Regression,f5,standard,271892,0.4,272400,0.39,508
32,Gradient Boosting Regression,f6,standard,277369,0.38,277473,0.37,104
40,Random Forest Regression,f7,standard,282416,0.36,281395,0.35,1021
14,Gradient Boosting Regression,f3,standard,281376,0.36,281583,0.35,207


In [36]:
best_model = top_5.sort_values(by='RMSE_difference').head(1)

In [37]:
best_model


Unnamed: 0,model_name,features,scaling,RMSE_train,R2_train,RMSE_validate,R2_validate,RMSE_difference
32,Gradient Boosting Regression,f6,standard,277369,0.38,277473,0.37,104


In [38]:
def run_best_model():
    '''
    the function runs the best model on the train, test and validate data sets 
    and returns scores in the data frame
    '''
    # create a data frame for test set results
    predictions_test = pd.DataFrame(y_test)
    predictions_test['baseline'] = baseline

    f = f2
    poly = PolynomialFeatures(degree=2, include_bias=False, interaction_only=False)
    poly.fit(X1[f])

    # create a df with transformed features of the train set
    X1_poly = pd.DataFrame(
                poly.transform(X1[f]),
                columns=poly.get_feature_names(X1[f].columns),
                index=X1.index)
    X1_poly = pd.concat([X1_poly, X1.iloc[:, 2:]], axis=1)

    # create a df with transformed features for the validate set
    X2_poly = pd.DataFrame(
                poly.transform(X2[f]),
                columns=poly.get_feature_names(X2[f].columns),
                index=X2.index)
    X2_poly = pd.concat([X2_poly, X2.iloc[:, 2:]], axis=1)

    # create a df with transformed features for the validate set
    X2_poly = pd.DataFrame(
                poly.transform(X2[f]),
                columns=poly.get_feature_names(X2[f].columns),
                index=X2.index)
    X2_poly = pd.concat([X2_poly, X2.iloc[:, 2:]], axis=1)

    # create. df with transformed features for the test set
    X3_poly = pd.DataFrame(
                poly.transform(X3[f]),
                columns=poly.get_feature_names(X3[f].columns),
                index=X3.index)
    X3_poly = pd.concat([X3_poly, X3.iloc[:, 2:]], axis=1)

    # create a Gradient Boosting Regression model
    model = GradientBoostingRegressor()
    # fit the model
    model.fit(X1_poly, y_train)
    # predictions of the train set
    y_hat_train = model.predict(X1_poly)
    # predictions of the validate set
    y_hat_validate = model.predict(X2_poly)
    # add train set predictions to the data frame
    y_hat_test = model.predict(X3_poly)
    predictions_test['predictions'] = y_hat_test

    # calculate scores train set
    RMSE_train, R2_train = regression_errors(y_train, y_hat_train)
    # calculate scores validation set
    RMSE_val, R2_val = regression_errors(y_validate, y_hat_validate)
    # calculate scores test set
    RMSE_test, R2_test = regression_errors(y_test, y_hat_test)
    RMSE_bl, _ = regression_errors(y_test, predictions_test.baseline)
    
    # save final score into a dictionary
    res = {
        'Features': str(f),
        'RMSE Train Set': RMSE_train,
        'RMSE Validation Set':RMSE_val,
        'RMSE Test Set':RMSE_test,
        'R2 Train Set':R2_train,
        'R2 Validation Set':R2_val,
        'R2 Test':R2_test,
        'Beats a basline by:':str(f'{round((RMSE_bl - RMSE_test) / RMSE_bl * 100, 1)}%')
    }

    # add the score results to the scores Data Frame
    final_test = pd.DataFrame({'Gradient Bosting Regression': list(res.keys()), 'Scores': list(res.values())})

    return final_test

In [39]:
run_best_model()

Unnamed: 0,Gradient Bosting Regression,Scores
0,Features,"['bedrooms', 'bathrooms']"
1,RMSE Train Set,269920
2,RMSE Validation Set,270996
3,RMSE Test Set,273268
4,R2 Train Set,0.410
5,R2 Validation Set,0.400
6,R2 Test,0.380
7,Beats a basline by:,22.9%


In [40]:
run_best_model()

Unnamed: 0,Gradient Bosting Regression,Scores
0,Features,"['bedrooms', 'bathrooms']"
1,RMSE Train Set,269920
2,RMSE Validation Set,270989
3,RMSE Test Set,273263
4,R2 Train Set,0.410
5,R2 Validation Set,0.400
6,R2 Test,0.380
7,Beats a basline by:,22.9%
