In [93]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px

from sklearn.linear_model import LinearRegression, Lasso, Ridge, ElasticNet
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import GradientBoostingRegressor, AdaBoostRegressor, RandomForestRegressor
from sklearn.svm import SVR
from xgboost import XGBRegressor

from skopt import BayesSearchCV
from skopt.space import Real, Categorical, Integer
from sklearn.compose import make_column_selector as selector
from sklearn.compose import ColumnTransformer, TransformedTargetRegressor
from sklearn.preprocessing import OneHotEncoder, StandardScaler, OrdinalEncoder
from sklearn.pipeline import Pipeline
from sklearn.model_selection import KFold, RandomizedSearchCV, cross_validate, GridSearchCV, train_test_split, cross_val_score


import warnings
warnings.filterwarnings('ignore')

kf = KFold(n_splits=5, shuffle=True, random_state=42)
numerical_columns_selector = selector(dtype_exclude=object)
categorical_columns_selector = selector(dtype_include=object)

In [129]:
# remove all outliers from data with OverallQual/KitchenQual/ExterQual scaled
df_all = pd.read_csv('df_all.csv', index_col=0)
y1 = df_all.SalePrice
X1 = df_all.drop(['SalePrice', 'PID'], axis =1).copy()

# remove all non-normal sale with OverallQual/KitchenQual/ExterQual scaled
df_all_normal = pd.read_csv('df_all_normal.csv', index_col=0)
y2 = df_all_normal.SalePrice
X2 = df_all_normal.drop(['SalePrice', 'PID'], axis =1).copy()

# remove Family, AdjLand, and Alloca sales with OverallQual/KitchenQual/ExterQual scaled 
df_faa = pd.read_csv('df_faa.csv', index_col=0)
y3 = df_faa.SalePrice
X3 = df_faa.drop(['SalePrice', 'PID'], axis =1).copy()

# remove all non-normal sales and outliers within each quality group with OverallQual/KitchenQual/ExterQual scaled
df_normal_quality = pd.read_csv('df_normal_quality.csv', index_col=0)
y4 = df_normal_quality.SalePrice
X4 = df_normal_quality.drop(['SalePrice', 'PID'], axis =1).copy()

# remove all outliers with quality group with OverallQual/KitchenQual/ExterQual scaled 
df_quality = pd.read_csv('df_quality.csv', index_col=0)
y5 = df_quality.SalePrice
X5 = df_quality.drop(['SalePrice', 'PID'], axis =1).copy()

# remove all outliers within quality groups then faa groups with OverallQual/KitchenQual/ExterQual scaled
df_some_quality = pd.read_csv('df_some_quality.csv', index_col=0)
y6 = df_some_quality.SalePrice
X6 = df_some_quality.drop(['SalePrice', 'PID'], axis =1).copy()

# only remove point over 4000 sf with OverallQual/KitchenQual/ExterQual scaled
df = pd.read_csv('df.csv', index_col=0)
y7 = df.SalePrice
X7 = df.drop(['SalePrice', 'PID'], axis =1).copy()

# remove all non-normal sale
df_normal_unscaled = pd.read_csv('df_normal_unscaled.csv', index_col= 0)
y8 = df_normal_unscaled.SalePrice
X8 = df_normal_unscaled.drop(['SalePrice', 'PID'], axis =1).copy()

# remove all non-normal sales and outliers within each quality group
df_normal_quality_unscaled = pd.read_csv('df_normal_quality_unscaled.csv', index_col = 0)
y9 = df_normal_quality_unscaled.SalePrice
X9 = df_normal_quality_unscaled.drop(['SalePrice', 'PID'], axis =1).copy()

# only remove point over 4000 sf
df_unscaled = pd.read_csv('df_unscaled.csv', index_col=0)
y10 = df_unscaled.SalePrice
X10 = df_unscaled.drop(['SalePrice', 'PID'], axis =1).copy()

## MLR

In [150]:
# need to drop colinear columns 
df_ols = X7.copy()
X = df_ols.drop(['Bathrooms', 'GrLivArea', 'TotalBsmtSF'], axis =1).copy()

numerical_columns = numerical_columns_selector(X)
categorical_columns = categorical_columns_selector(X)

categorical_preprocessor_drop = OneHotEncoder(handle_unknown="ignore", drop= 'first')
numerical_preprocessor = StandardScaler(with_std= False)

preprocessor_drop = ColumnTransformer(
    [
        ("standard_scaler", numerical_preprocessor, numerical_columns),
        ("one-hot-encoder", categorical_preprocessor_drop, categorical_columns),
    ]
)

# pipeline for ols
ols = LinearRegression()
ols_pipe = Pipeline(steps=[("Preprocess", preprocessor_drop), ("Ols", ols)])
ols_regr = TransformedTargetRegressor(regressor= ols_pipe,
                                                func=np.log, inverse_func=np.exp)

In [151]:
scores = cross_val_score(ols_regr, X, y7, cv = kf, error_score= 'raise')
scores.mean()

0.9237188186087124

# Lasso and Ridge

In [162]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler, MaxAbsScaler, RobustScaler

numerical_columns = numerical_columns_selector(X1)
categorical_columns = categorical_columns_selector(X1)

categorical_preprocessor_drop = OneHotEncoder(handle_unknown="ignore", drop= 'first')
categorical_preprocessor = OneHotEncoder(handle_unknown="ignore")
#categorical_preprocessor = OrdinalEncoder(handle_unknown="use_encoded_value", 
                                         # unknown_value= -1)

numerical_preprocessor = StandardScaler()

preprocessor_drop = ColumnTransformer(
    [
        ("standard_scaler", numerical_preprocessor, numerical_columns),
        ("one-hot-encoder", categorical_preprocessor_drop, categorical_columns),
    ]
)

preprocessor = ColumnTransformer(
    [
        ("standard_scaler", numerical_preprocessor, numerical_columns),
        ("one-hot-encoder", categorical_preprocessor, categorical_columns),
    ]
)


## Lasso

In [163]:
lasso = Lasso()

lasso_pipe_drop = Pipeline(steps=[("Preprocess", preprocessor_drop), ("Lasso", lasso)])
lasso_pipe = Pipeline(steps=[("Preprocess", preprocessor), ("Lasso", lasso)])

lasso_regr_drop = TransformedTargetRegressor(regressor= lasso_pipe_drop,
                                                func=np.log, inverse_func=np.exp)
lasso_regr = TransformedTargetRegressor(regressor= lasso_pipe,
                                                func=np.log, inverse_func=np.exp)
param_grid = {
    "regressor__Lasso__alpha": [.00028] 
}

lasso_search_drop = GridSearchCV(lasso_regr_drop, param_grid, n_jobs=2, cv = kf)
lasso_search = GridSearchCV(lasso_regr, param_grid, n_jobs=2, cv = kf)

In [187]:
lasso_search.fit(X4, y4)
#lasso_search_drop.fit(X4, y4)

In [188]:
print(lasso_search.best_score_)
print(lasso_search.best_params_)

0.9502966204079053
{'regressor__Lasso__alpha': 0.00028}


In [26]:
print(lasso_search_drop.best_score_)
print(lasso_search_drop.best_params_)

0.9500297725570256
{'regressor__Lasso__alpha': 0.00028}


In [200]:
# mean amount off from the actual price based on best model 
print((prediction - y7).sum())
print((prediction - y7).mean())
print((prediction - y7).median())
print((prediction - y7).std())

-1551924.4001282733
-601.9877424857538
-157.96995377908752
20825.837691241068


In [207]:
# mean amount off from the actual price based on best model 
print((prediction - y4).sum())
print((prediction - y4).mean())
print((prediction - y4).std())

-1151971.6865328336
-493.7726903269754
14285.700789066408


In [206]:
prediction = lasso_search.predict(X4)

In [201]:
import plotly.express as px


fig = px.scatter(x= y7, 
                 y = (prediction - y7))

fig.show()

In [204]:

fig = px.scatter(x= y7, 
                 y = (xgb_predictions - y7))

fig.show()

## Ridge

In [100]:
ridge = Ridge()

ridge_pipe_drop = Pipeline(steps=[("Preprocess", preprocessor_drop), ("Ridge", ridge)])

ridge_pipe = Pipeline(steps=[("Preprocess", preprocessor), ("Ridge", ridge)])


ridge_regr_drop = TransformedTargetRegressor(regressor= ridge_pipe_drop,
                                                func=np.log, inverse_func=np.exp)

ridge_regr = TransformedTargetRegressor(regressor= ridge_pipe,
                                                func=np.log, inverse_func=np.exp)

param_grid = {
    "regressor__Ridge__alpha": [0] 
}

ridge_search_drop = GridSearchCV(ridge_regr_drop, param_grid, n_jobs=2, cv = kf)

ridge_search = GridSearchCV(ridge_regr, param_grid, n_jobs=2, cv = kf)

In [101]:
ridge_search.fit(X1, y1)
ridge_search_drop.fit(X1, y1)

In [102]:
print(ridge_search.best_score_)
print(ridge_search.best_params_)

nan
{'regressor__Ridge__alpha': 0}


In [None]:
print(ridge_search_drop.best_score_)
print(ridge_search_drop.best_params_)

## DecisionTreeRegressor

In [8]:
categorical_preprocessor =  OrdinalEncoder(handle_unknown="use_encoded_value", 
                                          unknown_value= -1)
numerical_preprocessor = StandardScaler()

preprocessor_tree = ColumnTransformer(
    [
        ("standard_scaler", numerical_preprocessor, numerical_columns),
        ("one-hot-encoder", categorical_preprocessor, categorical_columns),
    ]
)

regressor = DecisionTreeRegressor()
tree_pipe = Pipeline(steps=[("Preprocess", preprocessor_tree), ("tree", regressor)])
tree_reg = TransformedTargetRegressor(regressor= tree_pipe,
                                                func=np.log, inverse_func=np.exp)

In [None]:
scores = cross_val_score(tree_reg, X1, y1, cv = kf, error_score= 'raise')
scores.mean()

## GradientBoostingRegressor

In [9]:
gbr = GradientBoostingRegressor()

gbr_pipe = Pipeline(steps=[("Preprocess", preprocessor_tree), ("boost", gbr)])

gbr_reg = TransformedTargetRegressor(regressor= gbr_pipe,
                                     func=np.log, inverse_func=np.exp)
param_grid = {
    'regressor__boost__n_estimators': [50, 100, 200],
    'regressor__boost__learning_rate': [0.01, 0.1, 0.2],
    'regressor__boost__max_depth': [3, 5, 7],
    }

gbr_search = GridSearchCV(gbr_reg, param_grid, n_jobs=2, cv = kf)

In [None]:
gbr_search.fit(X1, y1)

In [None]:
print(gbr_search.best_score_)
print(gbr_search.best_params_)

## AdaBoostRegressor 

In [10]:
ada = AdaBoostRegressor()

ada_pipe = Pipeline(steps=[("Preprocess", preprocessor_tree), ("AdaBoost", ada)])

ada_regr = TransformedTargetRegressor(regressor= ada_pipe,
                                     func=np.log, inverse_func=np.exp)
param_grid = {
    'regressor__AdaBoost__learning_rate': [.0001, .01, 1],
    'regressor__AdaBoost__loss': ['linear', 'square', 'expoential'],
    'regressor__AdaBoost__n_estimators': [10, 100, 500]
    }

ada_search = GridSearchCV(ada_regr, param_grid, n_jobs=2, cv = kf)

In [None]:
ada_search.fit(X1, y1)

## RandomForestRegressor

In [94]:
rf = RandomForestRegressor(max_depth=10, n_estimators = 100)

rf_pipe = Pipeline(steps=[("Preprocess", preprocessor_tree), ("RandomForest", rf)])

rf_regr = TransformedTargetRegressor(regressor= rf_pipe,
                                     func=np.log, inverse_func=np.exp)
# param_grid = {
#     'regressor__RandomForest__bootstrap': Categorical([True, False]),
#     'regressor__RandomForest__min_samples_leaf': Integer(2,4),
#     'regressor__RandomForest__min_samples_split': Integer(2,5),
#     'regressor__RandomForest__n_estimators': Integer(200,500)
#     }

param_grid = {
    'regressor__RandomForest__bootstrap': [True, False],
    'regressor__RandomForest__max_depth': [10, None],
    'regressor__RandomForest__max_features': [None, 'sqrt'],
    'regressor__RandomForest__min_samples_leaf': [2, 4],
    'regressor__RandomForest__min_samples_split': [2, 5],
    'regressor__RandomForest__n_estimators': [200, 600]
    }

#param_grid = {
#     'regressor__RandomForest__bootstrap': Categorical([True, False]),
#     'regressor__RandomForest__min_samples_leaf': Integer(2,4),
#     'regressor__RandomForest__min_samples_split': Integer(2,5),
#     'regressor__RandomForest__n_estimators': Integer(200,500)
#     }


rf_search = RandomizedSearchCV(rf_regr, param_grid, n_jobs=2, cv = kf, random_state = 42)

In [92]:
scores = cross_val_score(rf_regr, X1, y1, cv = kf, error_score= 'raise')
scores.mean()

0.8816206968392664

In [95]:
rf_search.fit(X4, y4)

In [96]:
print(rf_search.best_score_)
print(rf_search.best_params_)

0.9152144431751849
{'regressor__RandomForest__n_estimators': 200, 'regressor__RandomForest__min_samples_split': 5, 'regressor__RandomForest__min_samples_leaf': 2, 'regressor__RandomForest__max_features': 'sqrt', 'regressor__RandomForest__max_depth': None, 'regressor__RandomForest__bootstrap': False}


## SVR

## XGBRegressor

In [13]:
categorical_preprocessor = OneHotEncoder(handle_unknown="ignore", drop='first')

numerical_preprocessor = StandardScaler()

preprocessor = ColumnTransformer(
    [
        ("standard_scaler", numerical_preprocessor, numerical_columns),
        ("one-hot-encoder", categorical_preprocessor, categorical_columns),
    ]
)

xgb =  XGBRegressor()

xgb_pipe = Pipeline(steps=[("Preprocess", preprocessor), ("XGB", xgb)])

xgb_regr = TransformedTargetRegressor(regressor= xgb_pipe,
                                                 func=np.log, inverse_func=np.exp)

param_grid = {
    "regressor__XGB__max_depth": [3],
    "regressor__XGB__learning_rate": [0.03948737640519406],
    "regressor__XGB__subsample": [0.3203617673528298],
    "regressor__XGB__colsample_bytree": [0.9678010901635519],
    "regressor__XGB__n_estimators": [630, 1000],
    "regressor__XGB__min_child_weight": [2]
}

xgb_search = GridSearchCV(xgb_regr, param_grid, n_jobs=2, cv = kf)

In [192]:
xgb_search.fit(X4, y4)

In [193]:
print(xgb_search.best_score_)
print(xgb_search.best_params_)

0.9529154015288853
{'regressor__XGB__colsample_bytree': 0.9678010901635519, 'regressor__XGB__learning_rate': 0.03948737640519406, 'regressor__XGB__max_depth': 3, 'regressor__XGB__min_child_weight': 2, 'regressor__XGB__n_estimators': 1000, 'regressor__XGB__subsample': 0.3203617673528298}


In [205]:
xgb_predictions = xgb_search.predict(X4)
print((xgb_predictions - y4).sum())
print((xgb_predictions - y4).median())
print((xgb_predictions - y4).std())

-771029.3203125
-90.234375
8564.416625411048


In [198]:
list(xgb_predictions - y7)

[-1874.8125,
 -5801.96875,
 -7382.09375,
 -4318.375,
 -14198.71875,
 1717.203125,
 -3458.5234375,
 -19111.0,
 -3599.734375,
 3106.65625,
 -9266.953125,
 -1184.859375,
 7485.6875,
 10731.515625,
 4261.53125,
 18778.5,
 2434.890625,
 -2677.43359375,
 7401.03125,
 6857.390625,
 7877.546875,
 -14038.3359375,
 4662.0546875,
 -11695.21875,
 -6188.40625,
 -3933.515625,
 12928.15625,
 -5663.40625,
 16244.171875,
 11755.78125,
 3046.640625,
 4290.078125,
 -15142.1875,
 -4427.15625,
 -5779.234375,
 -5787.65625,
 11901.09375,
 4349.8125,
 -3409.703125,
 -4721.5703125,
 10968.5625,
 10099.828125,
 -4579.828125,
 -5090.15625,
 -8591.953125,
 -17924.5625,
 375.84375,
 896.65625,
 -7848.90625,
 5643.671875,
 -2383.59375,
 28936.234375,
 26469.140625,
 5348.28125,
 -41028.734375,
 -1395.453125,
 -98626.21875,
 -3081.65625,
 -33465.9375,
 -5994.296875,
 -3707.0234375,
 -3841.3125,
 6769.515625,
 65284.125,
 -1220.7890625,
 -28794.125,
 -2264.015625,
 -23593.375,
 4959.859375,
 -26474.046875,
 -652.75,


In [83]:
df = pd.read_csv('df.csv', index_col=0)
y = df.SalePrice
X = df.drop(['PID', 'SalePrice'], axis =1).copy()

In [84]:
xgb_search.fit(X, y)

In [70]:
# unscaled with normal and quality outliers removed
print(xgb_search.best_score_)
print(xgb_search.best_params_)

0.9489149361799243
{'regressor__XGB__colsample_bytree': 0.9678010901635519, 'regressor__XGB__learning_rate': 0.03948737640519406, 'regressor__XGB__max_depth': 3, 'regressor__XGB__min_child_weight': 2, 'regressor__XGB__n_estimators': 1000, 'regressor__XGB__subsample': 0.3203617673528298}


In [79]:
# unscaled with one outlier removed
print(xgb_search.best_score_)
print(xgb_search.best_params_)

0.9434544433357164
{'regressor__XGB__colsample_bytree': 0.9678010901635519, 'regressor__XGB__learning_rate': 0.03948737640519406, 'regressor__XGB__max_depth': 3, 'regressor__XGB__min_child_weight': 2, 'regressor__XGB__n_estimators': 1000, 'regressor__XGB__subsample': 0.3203617673528298}


In [82]:
# all outliers removed and scaled
print(xgb_search.best_score_)
print(xgb_search.best_params_)

0.9292738626029937
{'regressor__XGB__colsample_bytree': 0.9678010901635519, 'regressor__XGB__learning_rate': 0.03948737640519406, 'regressor__XGB__max_depth': 3, 'regressor__XGB__min_child_weight': 2, 'regressor__XGB__n_estimators': 1000, 'regressor__XGB__subsample': 0.3203617673528298}


In [85]:
# one outlier removed and scaled 
print(xgb_search.best_score_)
print(xgb_search.best_params_)

0.9437478132426733
{'regressor__XGB__colsample_bytree': 0.9678010901635519, 'regressor__XGB__learning_rate': 0.03948737640519406, 'regressor__XGB__max_depth': 3, 'regressor__XGB__min_child_weight': 2, 'regressor__XGB__n_estimators': 1000, 'regressor__XGB__subsample': 0.3203617673528298}


## Catboost