In [8]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px

from sklearn.linear_model import LinearRegression, Lasso, Ridge, ElasticNet
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import GradientBoostingRegressor, AdaBoostRegressor, RandomForestRegressor
from sklearn.svm import SVR
from xgboost import XGBRegressor


from sklearn.compose import make_column_selector as selector
from sklearn.compose import ColumnTransformer, TransformedTargetRegressor
from sklearn.preprocessing import OneHotEncoder, StandardScaler, OrdinalEncoder
from sklearn.pipeline import Pipeline
from sklearn.model_selection import KFold, cross_validate, GridSearchCV, train_test_split, cross_val_score


import warnings
warnings.filterwarnings('ignore')

# Column pickers by dtype: object columns are treated as categorical,
# everything else as numerical.
categorical_columns_selector = selector(dtype_include=object)
numerical_columns_selector = selector(dtype_exclude=object)

# One shared, seeded 5-fold splitter so every model is scored on the same folds.
kf = KFold(
    n_splits=5,
    random_state=42,
    shuffle=True,
)

In [6]:
def load_dataset(csv_path, target='SalePrice', id_column='PID'):
    """Read one prepared CSV and split it into frame, features and target.

    Parameters
    ----------
    csv_path : str
        Path of the CSV file; its first column is used as the index.
    target : str
        Name of the column to predict.
    id_column : str
        Identifier column that is dropped from the features.

    Returns
    -------
    tuple
        ``(frame, features, target_series)`` where ``features`` is ``frame``
        without the ``target`` and ``id_column`` columns.
    """
    frame = pd.read_csv(csv_path, index_col=0)
    # DataFrame.drop already returns a new frame, so no extra .copy() is needed.
    features = frame.drop(columns=[target, id_column])
    return frame, features, frame[target]


# Each file is a differently filtered variant of the same data set.

# all outliers removed
df_all, X1, y1 = load_dataset('df_all.csv')

# all non-normal sales removed
df_all_normal, X2, y2 = load_dataset('df_all_normal.csv')

# Family, AdjLand, and Alloca sales removed
df_faa, X3, y3 = load_dataset('df_faa.csv')

# all non-normal sales and outliers within each quality group removed
df_normal_quality, X4, y4 = load_dataset('df_normal_quality.csv')

# all outliers within each quality group removed
df_quality, X5, y5 = load_dataset('df_quality.csv')

# outliers removed within quality groups, then within faa groups
df_some_quality, X6, y6 = load_dataset('df_some_quality.csv')

# only the points over 4000 sf removed
df, X7, y7 = load_dataset('df.csv')

## MLR (multiple linear regression)

In [18]:
# Disabled experiment, kept for reference:
#   X = X1.drop(['Bathrooms', 'GrLivArea', 'TotalBsmtSF'], axis=1).copy()

# Resolve the column names of each kind against the full X1 feature set.
categorical_columns = categorical_columns_selector(X1)
numerical_columns = numerical_columns_selector(X1)

# Numerical features are only centred (no scaling to unit variance);
# categorical features are one-hot encoded with the first level dropped.
numerical_preprocessor = StandardScaler(with_std=False)
categorical_preprocessor_drop = OneHotEncoder(drop='first', handle_unknown="ignore")

preprocessor_drop = ColumnTransformer(
    transformers=[
        ("standard_scaler", numerical_preprocessor, numerical_columns),
        ("one-hot-encoder", categorical_preprocessor_drop, categorical_columns),
    ]
)

# OLS pipeline, fitted on log(y) and mapped back with exp() at predict time.
ols = LinearRegression()
ols_pipe = Pipeline(
    steps=[
        ("Preprocess", preprocessor_drop),
        ("Ols", ols),
    ]
)
ols_regr = TransformedTargetRegressor(
    regressor=ols_pipe,
    func=np.log,
    inverse_func=np.exp,
)

In [14]:
# Cross-validate the log-target OLS pipeline on the full X1 feature set using
# the shared folds; scoring is left at its default. Any fit error is re-raised.
scores = cross_val_score(
    estimator=ols_regr,
    X=X1,
    y=y1,
    cv=kf,
    error_score='raise',
)
scores.mean()

0.9009596243609421

In [17]:
# slightly better with dropped collinear columns
#
# `X` used to be defined only by a commented-out line, so this cell raised
# NameError on a fresh kernel. Define the reduced feature set here explicitly.
X = X1.drop(['Bathrooms', 'GrLivArea', 'TotalBsmtSF'], axis=1)

# `ols_regr` holds column lists resolved against X1, so it cannot be fitted on
# a frame that lacks those columns. Build an equivalent pipeline (same settings
# as the MLR cell) that hands the selector callables to the ColumnTransformer,
# so the columns are resolved against whatever frame is being fitted.
ols_reduced_preprocessor = ColumnTransformer(
    [
        ("standard_scaler", StandardScaler(with_std=False), numerical_columns_selector),
        ("one-hot-encoder", OneHotEncoder(handle_unknown="ignore", drop='first'),
         categorical_columns_selector),
    ]
)
ols_reduced_pipe = Pipeline(
    steps=[("Preprocess", ols_reduced_preprocessor), ("Ols", LinearRegression())]
)
ols_reduced_regr = TransformedTargetRegressor(
    regressor=ols_reduced_pipe, func=np.log, inverse_func=np.exp
)

scores = cross_val_score(ols_reduced_regr, X, y1, cv=kf, error_score='raise')
scores.mean()

0.9088206525671824

# Lasso and Ridge

In [21]:
# Column names of each kind, resolved against the full X1 feature set.
categorical_columns = categorical_columns_selector(X1)
numerical_columns = numerical_columns_selector(X1)

# Numerical features are standardised; categorical features are one-hot
# encoded in two flavours: all levels kept, or the first level dropped.
numerical_preprocessor = StandardScaler()
categorical_preprocessor = OneHotEncoder(handle_unknown="ignore")
categorical_preprocessor_drop = OneHotEncoder(drop='first', handle_unknown="ignore")

# Variant that drops the first dummy level of every categorical feature.
preprocessor_drop = ColumnTransformer(
    transformers=[
        ("standard_scaler", numerical_preprocessor, numerical_columns),
        ("one-hot-encoder", categorical_preprocessor_drop, categorical_columns),
    ]
)

# Variant that keeps every dummy level.
preprocessor = ColumnTransformer(
    transformers=[
        ("standard_scaler", numerical_preprocessor, numerical_columns),
        ("one-hot-encoder", categorical_preprocessor, categorical_columns),
    ]
)


## Lasso

In [23]:
lasso = Lasso()

# Two otherwise identical pipelines that differ only in the one-hot variant.
lasso_pipe = Pipeline(
    steps=[
        ("Preprocess", preprocessor),
        ("Lasso", lasso),
    ]
)
lasso_pipe_drop = Pipeline(
    steps=[
        ("Preprocess", preprocessor_drop),
        ("Lasso", lasso),
    ]
)

# Fit on log(y); predictions are mapped back with exp().
lasso_regr = TransformedTargetRegressor(
    regressor=lasso_pipe,
    func=np.log,
    inverse_func=np.exp,
)
lasso_regr_drop = TransformedTargetRegressor(
    regressor=lasso_pipe_drop,
    func=np.log,
    inverse_func=np.exp,
)

# Candidate regularisation strengths; the "regressor__" prefix reaches through
# the TransformedTargetRegressor into the pipeline's "Lasso" step.
param_grid = {
    "regressor__Lasso__alpha": [.0001, .00015, .0002, .00025, .0003, .00035],
}

lasso_search = GridSearchCV(estimator=lasso_regr, param_grid=param_grid, n_jobs=2, cv=kf)
lasso_search_drop = GridSearchCV(estimator=lasso_regr_drop, param_grid=param_grid, n_jobs=2, cv=kf)

In [25]:
# Run both Lasso grid searches on the full X1 feature set
# (all dummy levels kept first, then the drop-first variant).
lasso_search.fit(X1, y1)
lasso_search_drop.fit(X1, y1)

In [28]:
# Best mean CV score and the alpha that achieved it (all dummy levels kept).
print(lasso_search.best_score_, lasso_search.best_params_, sep="\n")

0.9187821471679719
{'regressor__Lasso__alpha': 0.0003}


In [29]:
# Best mean CV score and the alpha that achieved it (drop-first encoding).
print(lasso_search_drop.best_score_, lasso_search_drop.best_params_, sep="\n")

0.9188401986871284
{'regressor__Lasso__alpha': 0.0003}


## Ridge

In [30]:
ridge = Ridge()

# Two otherwise identical pipelines that differ only in the one-hot variant.
ridge_pipe = Pipeline(
    steps=[
        ("Preprocess", preprocessor),
        ("Ridge", ridge),
    ]
)
ridge_pipe_drop = Pipeline(
    steps=[
        ("Preprocess", preprocessor_drop),
        ("Ridge", ridge),
    ]
)

# Fit on log(y); predictions are mapped back with exp().
ridge_regr = TransformedTargetRegressor(
    regressor=ridge_pipe,
    func=np.log,
    inverse_func=np.exp,
)
ridge_regr_drop = TransformedTargetRegressor(
    regressor=ridge_pipe_drop,
    func=np.log,
    inverse_func=np.exp,
)

# Candidate regularisation strengths: every integer from 15 to 27 inclusive.
param_grid = {
    "regressor__Ridge__alpha": list(range(15, 28)),
}

ridge_search = GridSearchCV(estimator=ridge_regr, param_grid=param_grid, n_jobs=2, cv=kf)
ridge_search_drop = GridSearchCV(estimator=ridge_regr_drop, param_grid=param_grid, n_jobs=2, cv=kf)

In [31]:
# Run both Ridge grid searches on the full X1 feature set
# (all dummy levels kept first, then the drop-first variant).
ridge_search.fit(X1, y1)
ridge_search_drop.fit(X1, y1)

In [32]:
# Best mean CV score and the alpha that achieved it (all dummy levels kept).
print(ridge_search.best_score_, ridge_search.best_params_, sep="\n")

0.9177670821603956
{'regressor__Ridge__alpha': 17}


In [33]:
# Best mean CV score and the alpha that achieved it (drop-first encoding).
print(ridge_search_drop.best_score_, ridge_search_drop.best_params_, sep="\n")

0.9174213832651763
{'regressor__Ridge__alpha': 15}


## DecisionTreeRegressor

In [39]:
# Preprocessing for tree-based models: categorical features are mapped to
# integer codes, and categories unseen during fit are encoded as -1.
categorical_preprocessor = OrdinalEncoder(handle_unknown="use_encoded_value",
                                          unknown_value=-1)
numerical_preprocessor = StandardScaler()

# NOTE: the second step keeps its historical name "one-hot-encoder" so any
# parameter path that refers to it stays valid, but it wraps the
# OrdinalEncoder defined above, not a one-hot encoder.
preprocessor_tree = ColumnTransformer(
    [
        ("standard_scaler", numerical_preprocessor, numerical_columns),
        ("one-hot-encoder", categorical_preprocessor, categorical_columns),
    ]
)

# Seed the tree so the cross-validated score is reproducible between runs;
# without a random_state the fitted tree can differ from run to run.
regressor = DecisionTreeRegressor(random_state=42)
tree_pipe = Pipeline(steps=[("Preprocess", preprocessor_tree), ("tree", regressor)])

# Fit on log(y); predictions are mapped back with exp().
tree_reg = TransformedTargetRegressor(regressor=tree_pipe,
                                      func=np.log, inverse_func=np.exp)

In [40]:
# Cross-validate the log-target decision tree pipeline on the full X1 feature
# set using the shared folds; scoring is left at its default.
scores = cross_val_score(
    estimator=tree_reg,
    X=X1,
    y=y1,
    cv=kf,
    error_score='raise',
)
scores.mean()

0.753804950262715

## GradientBoostingRegressor

In [44]:
# Seed the booster so the grid-search scores are reproducible between runs.
gbr = GradientBoostingRegressor(random_state=42)

# Reuses the tree preprocessing (ordinal-encoded categoricals).
gbr_pipe = Pipeline(steps=[("Preprocess", preprocessor_tree), ("boost", gbr)])

# Fit on log(y); predictions are mapped back with exp().
gbr_reg = TransformedTargetRegressor(regressor=gbr_pipe,
                                     func=np.log, inverse_func=np.exp)

# 3 x 3 x 3 = 27 candidate settings; the "regressor__" prefix reaches through
# the TransformedTargetRegressor into the pipeline's "boost" step.
param_grid = {
    'regressor__boost__n_estimators': [50, 100, 200],
    'regressor__boost__learning_rate': [0.01, 0.1, 0.2],
    'regressor__boost__max_depth': [3, 5, 7],
}

gbr_search = GridSearchCV(gbr_reg, param_grid, n_jobs=2, cv=kf)

In [45]:
# Run the gradient-boosting grid search on the full X1 feature set.
gbr_search.fit(X1, y1)

In [46]:
# Best mean CV score and the hyper-parameter combination that achieved it.
print(gbr_search.best_score_, gbr_search.best_params_, sep="\n")

0.9185986036051178
{'regressor__boost__learning_rate': 0.1, 'regressor__boost__max_depth': 3, 'regressor__boost__n_estimators': 200}


## AdaBoostRegressor 

## RandomForestRegressor

## SVR

## XGBRegressor

## Catboost