In [52]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import OneHotEncoder, StandardScaler, LabelEncoder, OrdinalEncoder, PowerTransformer, QuantileTransformer
from sklearn.compose import make_column_selector as selector
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn import compose, pipeline
from sklearn.model_selection import KFold, cross_validate, GridSearchCV, cross_val_score
from sklearn.pipeline import Pipeline
import warnings
from sklearn.datasets import make_regression
from sklearn.ensemble import GradientBoostingRegressor
from sklearn import linear_model
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import GradientBoostingRegressor
# Silences every Python warning for the rest of the kernel session, which
# includes any warnings the scikit-learn estimators below would emit.
# NOTE(review): consider narrowing this to specific categories so that
# solver/convergence warnings stay visible.
warnings.filterwarnings('ignore')

### Import all data frames from preprocessing

In [69]:
# Every modelling dataset is a CSV written by the preprocessing step and is
# loaded in exactly the same way, so the shared logic lives in two helpers
# instead of ten copy-pasted stanzas.
DATA_DIR = 'dataframes'
TARGET_COLUMN = 'SalePrice'
# Columns that are never used as features: the target plus the PID column.
NON_FEATURE_COLUMNS = [TARGET_COLUMN, 'PID']


def load_frame(name):
    """Read ``<DATA_DIR>/<name>.csv`` using the first CSV column as the index.

    Parameters
    ----------
    name : str
        File stem of the CSV inside ``DATA_DIR`` (without the ``.csv`` suffix).

    Returns
    -------
    pandas.DataFrame
    """
    return pd.read_csv(f'{DATA_DIR}/{name}.csv', index_col=0)


def split_features_target(frame):
    """Split a loaded frame into ``(X, y)``.

    ``X`` is an independent copy of ``frame`` without the columns listed in
    ``NON_FEATURE_COLUMNS``; ``y`` is the ``SalePrice`` column of ``frame``.
    A ``KeyError`` is raised by pandas if any of those columns is missing.
    """
    features = frame.drop(columns=NON_FEATURE_COLUMNS).copy()
    return features, frame[TARGET_COLUMN]


# The comments below are the original notes on what preprocessing removed
# from each dataset; the filtering itself is not performed in this notebook.

# remove all outliers from data with OverallQual/KitchenQual/ExterQual scaled
df_all = load_frame('df_all')
X1, y1 = split_features_target(df_all)

# remove all non-normal sale with OverallQual/KitchenQual/ExterQual scaled
df_all_normal = load_frame('df_all_normal')
X2, y2 = split_features_target(df_all_normal)

# remove Family, AdjLand, and Alloca sales with OverallQual/KitchenQual/ExterQual scaled
df_faa = load_frame('df_faa')
X3, y3 = split_features_target(df_faa)

# remove all non-normal sales and outliers within each quality group with OverallQual/KitchenQual/ExterQual scaled
df_normal_quality = load_frame('df_normal_quality')
X4, y4 = split_features_target(df_normal_quality)

# remove all outliers with quality group with OverallQual/KitchenQual/ExterQual scaled
df_quality = load_frame('df_quality')
X5, y5 = split_features_target(df_quality)

# remove all outliers within quality groups then faa groups with OverallQual/KitchenQual/ExterQual scaled
df_some_quality = load_frame('df_some_quality')
X6, y6 = split_features_target(df_some_quality)

# only remove point over 4000 sf with OverallQual/KitchenQual/ExterQual scaled
df = load_frame('df')
X7, y7 = split_features_target(df)

# remove all non-normal sale
df_normal_unscaled = load_frame('df_normal_unscaled')
X8, y8 = split_features_target(df_normal_unscaled)

# remove all non-normal sales and outliers within each quality group
df_normal_quality_unscaled = load_frame('df_normal_quality_unscaled')
X9, y9 = split_features_target(df_normal_quality_unscaled)

# only remove point over 4000 sf with low sale price
df_unscaled = load_frame('df_unscaled')
X10, y10 = split_features_target(df_unscaled)

### Numerical & categorical column selectors used in all algorithms
### kfold cross-validation used in all models

In [17]:
# dtype-based column pickers: everything that is not `object` is treated as
# numerical, every `object` column as categorical.
numerical_columns_selector = selector(dtype_exclude=object)
categorical_columns_selector = selector(dtype_include=object)

# Column name lists are resolved once against X1.
numerical_columns, categorical_columns = (
    pick(X1) for pick in (numerical_columns_selector, categorical_columns_selector)
)

# Shuffled 5-fold splitter with a fixed seed, shared by every model below.
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# ML Algorithms 

### MLR

In [53]:
# Column lists for OLS are taken from X1 with three features removed. The
# ColumnTransformer below only receives these lists, so those three columns
# are never passed on to the regression (ColumnTransformer's default
# remainder is "drop").
X_ols = X1.drop(columns=['Bathrooms', 'GrLivArea', 'TotalBsmtSF']).copy()
numerical_columns_ols = numerical_columns_selector(X_ols)
categorical_columns_ols = categorical_columns_selector(X_ols)

# Numerical features are mean-centred only (with_std=False); categorical
# features are one-hot encoded with the first level of each feature dropped.
numerical_preprocessor_ols = StandardScaler(with_std=False)
categorical_preprocessor_ols = OneHotEncoder(drop='first', handle_unknown="ignore")

preprocessor_ols = ColumnTransformer(
    transformers=[
        ("standard_scaler", numerical_preprocessor_ols, numerical_columns_ols),
        ("one-hot-encoder", categorical_preprocessor_ols, categorical_columns_ols),
    ]
)

# Plain linear regression fitted on log(y); predictions are mapped back
# with exp by the TransformedTargetRegressor wrapper.
ols = linear_model.LinearRegression()
ols_pipe = Pipeline([("Preprocess", preprocessor_ols), ("Ols", ols)])
ols_regr = compose.TransformedTargetRegressor(
    regressor=ols_pipe,
    func=np.log,
    inverse_func=np.exp,
)

# Quick check on the X4/y4 dataset; any fold failure is raised immediately.
ols_scores = cross_val_score(ols_regr, X4, y4, cv=kf, error_score='raise')

### Scaling and encoding for Lasso & Ridge

In [45]:
# Column lists resolved against X1 (recomputed here so the cell is self-contained).
numerical_columns = numerical_columns_selector(X1)
categorical_columns = categorical_columns_selector(X1)

# Numerical features are standardised; categorical features are one-hot
# encoded in two flavours: keeping every level, or dropping the first level.
numerical_preprocessor = StandardScaler()
categorical_preprocessor = OneHotEncoder(handle_unknown="ignore")
categorical_preprocessor_drop = OneHotEncoder(drop='first', handle_unknown="ignore")

# Variant that keeps every dummy column.
preprocessor = ColumnTransformer(
    transformers=[
        ("standard_scaler", numerical_preprocessor, numerical_columns),
        ("one-hot-encoder", categorical_preprocessor, categorical_columns),
    ]
)

# Variant that drops the first dummy column of each categorical feature.
preprocessor_drop = ColumnTransformer(
    transformers=[
        ("standard_scaler", numerical_preprocessor, numerical_columns),
        ("one-hot-encoder", categorical_preprocessor_drop, categorical_columns),
    ]
)

### Lasso w/ drop and no drop 

In [63]:
# Lasso on log(SalePrice), once per one-hot flavour (all levels vs. first
# level dropped). Alpha is tuned by grid search over the shared `kf` folds.
# NOTE(review): warnings are silenced globally at the top of the notebook,
# so a failure to converge at the smallest alphas would go unnoticed — confirm.
lasso = linear_model.Lasso()

lasso_pipe = Pipeline([("Preprocess", preprocessor), ("Lasso", lasso)])
lasso_pipe_drop = Pipeline([("Preprocess", preprocessor_drop), ("Lasso", lasso)])

lasso_regr = compose.TransformedTargetRegressor(
    regressor=lasso_pipe,
    func=np.log,
    inverse_func=np.exp,
)
lasso_regr_drop = compose.TransformedTargetRegressor(
    regressor=lasso_pipe_drop,
    func=np.log,
    inverse_func=np.exp,
)

# The "regressor__" prefix reaches through TransformedTargetRegressor into
# the pipeline step named "Lasso".
param_gridL = {"regressor__Lasso__alpha": [1e-2, 1e-3, 1e-4, 1e-5]}

lasso_search = GridSearchCV(
    estimator=lasso_regr, param_grid=param_gridL, n_jobs=2, cv=kf
)
lasso_search_drop = GridSearchCV(
    estimator=lasso_regr_drop, param_grid=param_gridL, n_jobs=2, cv=kf
)


### Ridge w/ drop and no drop 

In [64]:

# Ridge on log(SalePrice), once per one-hot flavour (all levels vs. first
# level dropped). Alpha is tuned by grid search over the shared `kf` folds.
ridge = linear_model.Ridge()

ridge_pipe = Pipeline([("Preprocess", preprocessor), ("Ridge", ridge)])
ridge_pipe_drop = Pipeline([("Preprocess", preprocessor_drop), ("Ridge", ridge)])

ridge_regr = compose.TransformedTargetRegressor(
    regressor=ridge_pipe,
    func=np.log,
    inverse_func=np.exp,
)
ridge_regr_drop = compose.TransformedTargetRegressor(
    regressor=ridge_pipe_drop,
    func=np.log,
    inverse_func=np.exp,
)

# The "regressor__" prefix reaches through TransformedTargetRegressor into
# the pipeline step named "Ridge".
param_gridR = {"regressor__Ridge__alpha": [10, 15, 20, 25, 30]}

ridge_search = GridSearchCV(
    estimator=ridge_regr, param_grid=param_gridR, n_jobs=2, cv=kf
)
ridge_search_drop = GridSearchCV(
    estimator=ridge_regr_drop, param_grid=param_gridR, n_jobs=2, cv=kf
)

### Encoding and scaling for tree models

In [57]:
# Column lists resolved against X1 (recomputed here so the cell is self-contained).
numerical_columns = numerical_columns_selector(X1)
categorical_columns = categorical_columns_selector(X1)

# Tree models get integer-coded categories instead of one-hot columns.
# Categories not seen during fit are encoded as -1.
categorical_preprocessor_tree = OrdinalEncoder(
    handle_unknown="use_encoded_value",
    unknown_value=-1,
)
numerical_preprocessor = StandardScaler()

preprocessor_tree = ColumnTransformer(
    [
        ("standard_scaler", numerical_preprocessor, numerical_columns),
        # Step label now matches the transformer actually used (it was
        # previously labelled "one-hot-encoder" although it is an OrdinalEncoder).
        ("ordinal-encoder", categorical_preprocessor_tree, categorical_columns),
    ]
)

### Decision Tree

In [42]:
# Decision tree on log(SalePrice).
# random_state is fixed so that re-running the notebook reproduces the same
# cross-validation scores; the value matches the seed used for `kf`.
regressor = DecisionTreeRegressor(random_state=42)

tree_pipe = Pipeline(steps=[("Preprocess", preprocessor_tree), ("tree", regressor)])
tree_reg = compose.TransformedTargetRegressor(
    regressor=tree_pipe,
    func=np.log,
    inverse_func=np.exp,
)

# Quick check on the X4/y4 dataset; any fold failure is raised immediately.
tree_scores = cross_val_score(tree_reg, X4, y4, cv=kf, error_score='raise')

### Gradient boosted regressor  

In [41]:
# Gradient-boosted trees on log(SalePrice).
# random_state is fixed so that re-running the notebook reproduces the same
# grid-search scores; the value matches the seed used for `kf`.
gbr = GradientBoostingRegressor(random_state=42)

gbr_pipe = Pipeline(steps=[("Preprocess", preprocessor_tree), ("boost", gbr)])

gbr_reg = compose.TransformedTargetRegressor(
    regressor=gbr_pipe,
    func=np.log,
    inverse_func=np.exp,
)

# The "regressor__" prefix reaches through TransformedTargetRegressor into
# the pipeline step named "boost".
param_grid_gbr = {
    "regressor__boost__learning_rate": [.01, .1]
}

gbr_search = GridSearchCV(gbr_reg, param_grid_gbr, n_jobs=2, cv=kf)


# Loop through all models and datasets and collect the cross-validated R² scores in a dataframe

In [65]:
# Fit every model on every dataset variant and record its mean
# cross-validated score (the estimators' default score) over the `kf` folds.
#
# The OLS and gradient-boosting result lists are named `ols_r2` / `gbr_r2`
# so they no longer rebind `ols` and `gbr`, which are the estimator objects
# created in earlier cells.
# NOTE(review): grid-searched models report `best_score_` (the best mean CV
# score over the grid) while OLS and the decision tree report a plain CV
# mean, so the tuned columns are slightly favoured in this comparison.
Data = []
ols_r2 = []
Ridge = []
Ridge_drop = []
Lasso = []
Lasso_drop = []
Decision_tree = []
gbr_r2 = []


A = [X1, X2, X3, X4, X5, X6, X7, X8, X9, X10]
B = [y1, y2, y3, y4, y5, y6, y7, y8, y9, y10]
names = ['all outliers',
        'non-normal sales',
        'family, adjLand, alloca sales',
        'non-normal sales/qg outliers',
        'outliers within quality groups',
        'outliers within quality groups/family, adjLand, alloca',
        'large w/ low sale home',
        'unscaled/non-normal sales',
        'unscaled/non-normal sales/qg outliers',
        'unscaled/large w/ low sale home']

# zip() truncates silently, so make a length mismatch loud instead.
assert len(A) == len(B) == len(names), "A, B and names must have one entry per dataset"

# `label` (not `names`) is the loop variable so the `names` list is not
# overwritten by its own last element.
for a, b, label in zip(A, B, names):

    # error_score='raise' matches the other cross_val_score calls in this
    # notebook: a failing fold stops the run instead of yielding NaN.
    ols_scores = cross_val_score(ols_regr, a, b, cv=kf, error_score='raise')
    ridge_search.fit(a, b)
    ridge_search_drop.fit(a, b)
    lasso_search.fit(a, b)
    lasso_search_drop.fit(a, b)
    tree_scores = cross_val_score(tree_reg, a, b, cv=kf, error_score='raise')
    gbr_search.fit(a, b)

    Data.append(label)
    ols_r2.append(ols_scores.mean())
    Ridge.append(ridge_search.best_score_)
    Ridge_drop.append(ridge_search_drop.best_score_)
    Lasso.append(lasso_search.best_score_)
    Lasso_drop.append(lasso_search_drop.best_score_)
    Decision_tree.append(tree_scores.mean())
    gbr_r2.append(gbr_search.best_score_)


# One row per dataset variant, one column per model; column labels are
# unchanged from the original table.
scores = {
    'Removed': Data,
    'ols': ols_r2,
    'Ridge': Ridge,
    'Ridge drop': Ridge_drop,
    'Lasso': Lasso,
    'Lasso drop': Lasso_drop,
    'Decision Tree': Decision_tree,
    'gbr': gbr_r2,
}

scores_df = pd.DataFrame.from_dict(scores)
scores_df

Unnamed: 0,Removed,ols,Ridge,Ridge drop,Lasso,Lasso drop,Decision Tree,gbr
0,all outliers,0.908821,0.917755,0.917421,0.917349,0.917676,0.75738,0.913803
1,non-normal sales,0.920173,0.930983,0.930232,0.933159,0.932941,0.800426,0.930532
2,"family, adjLand, alloca sales",0.928429,0.935087,0.934058,0.933835,0.933441,0.788852,0.930271
3,non-normal sales/qg outliers,0.945772,0.949691,0.949525,0.949668,0.949628,0.814876,0.935781
4,outliers within quality groups,0.942541,0.946721,0.946525,0.946019,0.946154,0.811073,0.930535
5,"outliers within quality groups/family, adjLand...",0.944359,0.946989,0.946844,0.947096,0.946966,0.832396,0.930951
6,large w/ low sale home,0.923719,0.930288,0.92957,0.930265,0.930171,0.772058,0.92773
7,unscaled/non-normal sales,0.919396,0.929813,0.928854,0.93092,0.93075,0.804583,0.930744
8,unscaled/non-normal sales/qg outliers,0.924632,0.931939,0.932111,0.933936,0.933859,0.764645,0.931094
9,unscaled/large w/ low sale home,0.922351,0.929575,0.928799,0.929533,0.929419,0.790296,0.927981


In [66]:
# Transposed view of the score table: one row per model, one column per dataset variant.
scores_df.T

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
Removed,all outliers,non-normal sales,"family, adjLand, alloca sales",non-normal sales/qg outliers,outliers within quality groups,"outliers within quality groups/family, adjLand...",large w/ low sale home,unscaled/non-normal sales,unscaled/non-normal sales/qg outliers,unscaled/large w/ low sale home
ols,0.908821,0.920173,0.928429,0.945772,0.942541,0.944359,0.923719,0.919396,0.924632,0.922351
Ridge,0.917755,0.930983,0.935087,0.949691,0.946721,0.946989,0.930288,0.929813,0.931939,0.929575
Ridge drop,0.917421,0.930232,0.934058,0.949525,0.946525,0.946844,0.92957,0.928854,0.932111,0.928799
Lasso,0.917349,0.933159,0.933835,0.949668,0.946019,0.947096,0.930265,0.93092,0.933936,0.929533
Lasso drop,0.917676,0.932941,0.933441,0.949628,0.946154,0.946966,0.930171,0.93075,0.933859,0.929419
Decision Tree,0.75738,0.800426,0.788852,0.814876,0.811073,0.832396,0.772058,0.804583,0.764645,0.790296
gbr,0.913803,0.930532,0.930271,0.935781,0.930535,0.930951,0.92773,0.930744,0.931094,0.927981


In [67]:
#scores_df.to_csv('dataframes/initial_models.csv')