# Data Preprocessing

In [None]:
# !pip install pycaret
# !pip install catboost xgboost optuna

# !pip install --force-reinstall threadpoolctl 
# ^ Force-reinstalling threadpoolctl fixed the error raised by knn_impute()

In [None]:
import numpy as np 
import pandas as pd 
pd.options.display.max_columns = 500 
pd.options.display.max_rows = 10

import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style('darkgrid') 

from sklearn.neighbors import KNeighborsRegressor 
import scipy.stats
from sklearn.preprocessing import StandardScaler 
from pycaret.regression import setup, compare_models 
from sklearn.model_selection import GridSearchCV, KFold, cross_val_score 
from sklearn.metrics import mean_squared_error
from catboost import CatBoostRegressor
from sklearn.linear_model import BayesianRidge, HuberRegressor
from lightgbm import LGBMRegressor
from sklearn.ensemble import GradientBoostingRegressor, ExtraTreesRegressor
from xgboost import XGBRegressor

In [None]:
# Directory that holds the Kaggle CSV files. Change this single constant to
# run the notebook on another machine; the three reads below all derive from it.
DATA_DIR = '/Users/joshuaconde/GitHub-REPOSITORIES/Kaggle-HousePricePredictions/HousePricePredictions'

train0 = pd.read_csv(f'{DATA_DIR}/train.csv')
test0 = pd.read_csv(f'{DATA_DIR}/test.csv')

sample_submission = pd.read_csv(f'{DATA_DIR}/sample_submission.csv')

In [None]:
# Preview the raw training frame (display is capped by the pd.options set in the imports cell).
train0

In [None]:
# Preview the raw test frame.
test0

In [None]:
# Preview the sample submission file.
sample_submission

## 1. DataFrame Combination

In [None]:
# Set the prediction target and the test ids aside before merging the frames.
target = train0['SalePrice']
test_ids = test0['Id']

# Remove the identifier from both frames and the target from the training frame.
train1 = train0.drop(columns=['Id', 'SalePrice'])
test1 = test0.drop(columns='Id')

# Stack train on top of test with a fresh 0..n-1 index so that every
# preprocessing step below is applied to both at once.
data0 = pd.concat([train1, test1], axis=0, ignore_index=True)

data0

## 2. Data Cleaning

In [None]:
# Work on a copy so the combined frame data0 is left untouched by the cleaning steps.
data1 = data0.copy()

### 2A. Ensure Proper Data Types

In [None]:
# Store MSSubClass as strings so that later steps treat it as a categorical
# column (it is skipped by select_dtypes(np.number) and one-hot encoded by get_dummies).
data1['MSSubClass'] = data1['MSSubClass'].astype(str) 

### 2B. Fill Ordinal AND Regular Categorical Missing Values

In [None]:
# Columns whose missing entries are replaced by the placeholder category "N".
placeholder_columns = [
    'Alley',
    'BsmtQual',
    'BsmtCond',
    'BsmtExposure',
    'BsmtFinType1',
    'BsmtFinType2',
    'FireplaceQu',
    'GarageType',
    'GarageFinish',
    'GarageQual',
    'GarageCond',
    'PoolQC',
    'Fence',
    'MiscFeature',
]
data1[placeholder_columns] = data1[placeholder_columns].fillna("N")

# Columns whose missing entries are replaced by the column's most frequent value.
mode_columns = [
    'MSZoning',
    'Utilities',
    'Exterior1st',
    'Exterior2nd',
    'MasVnrType',
    'Electrical',
    'KitchenQual',
    'Functional',
    'SaleType',
]
for column in mode_columns:
    most_frequent = data1[column].mode()[0]
    data1[column] = data1[column].fillna(most_frequent)

### 2C. Fill Numeric Missing Values

In [None]:
def knn_impute(df, na_target):
    """Fill the missing values of one numeric column with KNN predictions.

    A ``KNeighborsRegressor`` with default settings is fit on the rows where
    ``na_target`` is present, using every numeric column that has no missing
    values as a feature. Its predictions replace the missing entries.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame. It is not modified.
    na_target : str
        Name of the numeric column to impute.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` in which the missing values of ``na_target`` are
        filled. If the column has no missing values the copy is returned
        unchanged.
    """
    df = df.copy()

    missing = df[na_target].isna()
    # Nothing to impute: return early. Predicting on zero rows makes
    # scikit-learn raise, which used to break a re-run of the imputation cell.
    if not missing.any():
        return df

    numeric_df = df.select_dtypes(np.number)
    # Features are the numeric columns that are complete in every row; the
    # target itself is excluded because it contains missing values.
    complete_columns = numeric_df.columns[numeric_df.notna().all().to_numpy()]

    X_train = numeric_df.loc[~missing, complete_columns]
    y_train = numeric_df.loc[~missing, na_target]
    X_missing = numeric_df.loc[missing, complete_columns]

    knn = KNeighborsRegressor()
    knn.fit(X_train, y_train)

    df.loc[missing, na_target] = knn.predict(X_missing)

    return df

In [None]:
# Numeric columns to impute, in order. Once a column has been filled it has no
# missing values left, so knn_impute can use it as a feature for later columns.
numeric_columns_to_impute = [
    'LotFrontage',
    'MasVnrArea',
    'BsmtFinSF1',
    'BsmtFinSF2',
    'BsmtUnfSF',
    'TotalBsmtSF',
    'BsmtFullBath',
    'BsmtHalfBath',
    'GarageYrBlt',
    'GarageCars',
    'GarageArea',
]

for column in numeric_columns_to_impute:
    data1 = knn_impute(data1, column)

In [None]:
# Count the remaining missing values per column.
data1.isna().sum()

## 3. Feature Engineering

In [None]:
data2 = data1.copy()

# Above-ground living area divided by a room count that also includes
# bathrooms and kitchens.
room_count = (data2["TotRmsAbvGrd"] +
              data2["FullBath"] +
              data2["HalfBath"] +
              data2["KitchenAbvGr"])
data2["SqFtPerRoom"] = data2["GrLivArea"] / room_count

# Built from data2 like every other feature in this cell (the original read
# from data1, which only worked because data2 is an unmodified copy of it).
data2['Total_Home_Quality'] = data2['OverallQual'] + data2['OverallCond']

# Half baths count as 0.5; basement baths are included.
data2['Total_Bathrooms'] = (data2['FullBath'] + (0.5 * data2['HalfBath']) +
                            data2['BsmtFullBath'] + (0.5 * data2['BsmtHalfBath']))

# Combined first- and second-floor area.
data2["HighQualSF"] = data2["1stFlrSF"] + data2["2ndFlrSF"]

data2.columns

## 4. Feature Transformations -> .log1p()

### 4A. Log Transform(ation) for Skewed Features

In [None]:
data3 = data2.copy()

# One row per numeric feature with its skewness; a feature is flagged as
# skewed when the absolute skewness is at least 0.5.
numeric_features = data3.select_dtypes(np.number).columns

skewed_df = pd.DataFrame({'Feature': numeric_features})
skewed_df['Skew'] = [scipy.stats.skew(data3[feature]) for feature in numeric_features]
skewed_df['Absolute Skew'] = skewed_df['Skew'].abs()
skewed_df['Skewed'] = skewed_df['Absolute Skew'] >= 0.5

skewed_df

In [None]:
# Apply log(1 + x) to every feature flagged as skewed, all columns at once.
skewed_features = skewed_df.loc[skewed_df['Skewed'], 'Feature'].tolist()
data3[skewed_features] = np.log1p(data3[skewed_features])

### 4B. Cosine Transform(ation) for Cyclical Features

In [None]:
# Encode the sale month on a cosine wave with a 12-month period, so that
# month 12 and month 1 end up close together (assumes MoSold is 1..12 -- TODO confirm).
#
# The result is stored in data3, the frame that feeds the encoding and scaling
# steps. The original wrote it to data2, which is not used again, so the
# transform was silently dropped. The raw month is read from data2 because
# data3['MoSold'] may already have been log-transformed above; this also makes
# the cell safe to re-run.
data3['MoSold'] = -np.cos(2 * np.pi / 12 * data2['MoSold'])

data3['MoSold']

## 5. Categorical Encoding

In [None]:
# One-hot encode the categorical columns. get_dummies returns a new frame,
# so data3 is left unchanged.
data4 = pd.get_dummies(data3)

data4

## 6. Numeric Scaling

In [None]:
# Standardise every column, with statistics computed over the combined
# train + test frame, then put the scaled values back into a labelled frame.
scaler = StandardScaler()
scaled_values = scaler.fit_transform(data4)

data5 = pd.DataFrame(scaled_values, index=data4.index, columns=data4.columns)

data5

## 7. Target Transformation -> .log()

In [None]:
# Side-by-side comparison of the target distribution before and after the
# log transform, each with a fitted normal curve overlaid.
# NOTE(review): sns.distplot is deprecated in recent seaborn releases -- confirm
# it is still available in the installed version.
fig, (ax_raw, ax_log) = plt.subplots(1, 2, figsize=(20, 10))

sns.distplot(target, kde=True, fit=scipy.stats.norm, ax=ax_raw)
ax_raw.set_title("Without Log Transform")

sns.distplot(np.log(target), kde=True, fit=scipy.stats.norm, ax=ax_log)
ax_log.set_xlabel("Log SalePrice")
ax_log.set_title("With Log Transform")

plt.show()

In [None]:
# Models are trained on the natural log of SalePrice; predictions are mapped
# back with np.exp before they are blended.
log_target = np.log(target)

## 8. Data Splitting

In [None]:
# data5 holds the training rows first, followed by the test rows, on a
# 0..n-1 index, so the split point is simply the number of training rows.
n_train_rows = len(train0)

train_final = data5.iloc[:n_train_rows].copy()
test_final = data5.iloc[n_train_rows:].reset_index(drop=True).copy()

In [None]:
# Preview the preprocessed training features.
train_final

In [None]:
# Preview the preprocessed test features.
test_final

# Building an Ensemble Model using Weighted Averaging (Blending)

In [None]:
# Settings unpacked into CatBoostRegressor when the models are built.
# NOTE(review): 'early_stopping_rounds' presumably only takes effect when fit()
# receives an eval_set, which the training loop in this notebook does not
# pass -- confirm.
catboost_params = {
    'iterations': 6000,
    'learning_rate': 0.005,
    'depth': 4,
    'l2_leaf_reg': 1,
    'eval_metric':'RMSE',
    'early_stopping_rounds': 200,
    'random_seed': 42
}

# Settings unpacked into BayesianRidge.
# NOTE(review): newer scikit-learn releases rename 'n_iter' to 'max_iter' --
# confirm against the installed version.
br_params = {
    'n_iter': 304,
    'tol': 0.16864712769300896,
    'alpha_1': 5.589616542154059e-07,
    'alpha_2': 9.799343618469923,
    'lambda_1': 1.7735725582463822,
    'lambda_2': 3.616928181181732e-06
}

# Settings unpacked into LGBMRegressor.
# NOTE(review): with 'max_depth': 2 a tree can have at most 4 leaves, so
# 'num_leaves': 39 is presumably never reached -- confirm.
lightgbm_params = {
    'num_leaves': 39,
    'max_depth': 2,
    'learning_rate': 0.13705339989856127,
    'n_estimators': 273,
}

# Disabled code: the string literal below is never assigned or executed. It
# keeps earlier parameter sets for GradientBoosting (gbr_params) and
# ExtraTrees (et_params) for reference only.
"""
gbr_params = {
    'loss': 'absolute_error',
    'learning_rate': 0.11054616145386358,
    'n_estimators': 266,
    'subsample': 0.5902941839375372,
    'criterion': 'friedman_mse',
    'min_samples_split': 20,
    'min_samples_leaf': 10,
    'min_weight_fraction_leaf': 0.0175323040448155,
    'max_depth': 10,
    'min_impurity_decrease': 0.06678068552552889,
    'init': 'zero',
    'random_state': 2022,
    'max_features': 'sqrt',
    'alpha': 0.23585140032470903,
    'verbose': 2,
    'max_leaf_nodes': 15,
    'warm_start': False,
    'validation_fraction': 0.21488662066191244,
    'n_iter_no_change': 8,
    'tol': 0.0005401308587534491,
    'ccp_alpha': 0.00151895861887127
}

et_params = {
    'n_estimators': 531,
    'criterion': 'friedman_mse',
    'max_depth': 20,
    'min_samples_split': 3,
    'min_samples_leaf': 20,
    'min_weight_fraction_leaf': 0.09315416304656772,
    'max_features': 'sqrt',
    'max_leaf_nodes': 88,
    'min_impurity_decrease': 0.41028883629550056,
    'bootstrap': True,
    'oob_score': True,
    'n_jobs': -1,
    'random_state': 2022,
    'verbose': 1,
    'warm_start': True,
    'ccp_alpha': 0.0023371871861925567,
    'max_samples': 0.13295442632197021
}
"""

In [None]:
# Disabled code: the string literal below is never assigned or executed. It
# keeps candidate search grids for CatBoost, BayesianRidge and LightGBM for
# reference only.
"""
catboost_param_grid = {
    'learning_rate': [0.01, 0.05, 0.1],
    'depth': [4, 6, 8],
    'l2_leaf_reg': [1, 3, 5, 7, 9],
    'iterations': [500, 1000, 1500],
    'random_strength': [0.1, 0.5, 1],
    'bagging_temperature': [0.5, 1, 1.5],
    'border_count': [32, 64, 128],
    'subsample': [0.5, 0.8, 1],
    'grow_policy': ['SymmetricTree', 'Depthwise', 'Lossguide'],
    'loss_function': ['RMSE', 'MAE', 'Quantile:alpha=0.95']
}

bayesian_ridge_param_grid = {
    'n_iter': [100, 300, 500],
    'tol': [1e-3, 1e-4, 1e-5],
    'alpha_1': [1e-6, 1e-7, 1e-8],
    'alpha_2': [1e-6, 1e-7, 1e-8],
    'lambda_1': [1e-6, 1e-7, 1e-8],
    'lambda_2': [1e-6, 1e-7, 1e-8]
}

lgbm_param_grid = {
    'learning_rate': [0.01, 0.05, 0.1],
    'num_leaves': [31, 63, 127],
    'max_depth': [4, 6, 8],
    'min_child_samples': [20, 50, 100],
    'subsample': [0.5, 0.8, 1],
    'colsample_bytree': [0.5, 0.8, 1]
}
"""

# Search grid for GradientBoostingRegressor (used by the GridSearchCV loop).
# max_features uses None (consider all features) instead of the legacy 'auto'
# alias, which current scikit-learn releases reject; None is accepted by old
# and new versions alike.
gradient_boosting_param_grid = {
    'learning_rate': [0.01, 0.05, 0.1],
    'n_estimators': [100, 200, 300],
    'max_depth': [3, 4, 5],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'subsample': [0.5, 0.8, 1],
    'max_features': [None, 'sqrt', 'log2']
}

# Search grid for ExtraTreesRegressor; same max_features note as above.
extra_trees_param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': [None, 'sqrt', 'log2']
}

In [None]:
# 10-fold splitter (referenced by the cross-validation code kept in the string below).
kf = KFold(n_splits=10)

# Base models of the ensemble, keyed by a short name; each is built from its
# parameter dict and fit on the full training set against the log target.
ms = dict(
    catboost=CatBoostRegressor(**catboost_params),
    br=BayesianRidge(**br_params),
    lightgbm=LGBMRegressor(**lightgbm_params),
)

for name, estimator in ms.items():
    estimator.fit(train_final, log_target)
    print(name + " trained")

# Disabled code: the string literal below is never executed. It holds the
# cross-validation of the base models in `ms`, scoring each as
# exp(RMSE of the log target).
"""
# Define rs
rs = {}

for name, m in ms.items():
    r = np.exp(np.sqrt(-cross_val_score(m, train_final, log_target, scoring='neg_mean_squared_error', cv=kf)))
    rs[name] = r

print(rs)

# Evaluate rs
for name, r in rs.items():
    print("----------\n" + name)
    print(np.mean(r))
    print(np.std(r))
"""

In [None]:
""" ^^^
[RESULTS; cv = 10]:
----------
catboost - (#1)
1.122340304394616
0.019942095909698854
----------
br - (#3)
1.1351015144106285
0.02565664199126963
----------
lightgbm - (#2)
1.1323401113019387
0.022377581009135113
"""

kf = KFold(n_splits=5)

# (name, base estimator, search grid). The estimators are seeded (same seed as
# CatBoost's 'random_seed') so the search is reproducible across runs.
models = [
    ('GradientBoosting', GradientBoostingRegressor(random_state=42), gradient_boosting_param_grid),
    ('ExtraTrees', ExtraTreesRegressor(random_state=42), extra_trees_param_grid)
]

# Per-model array of cross-validated scores, each exp(RMSE of the log target).
results = {}

for name, model, param_grid in models:
    print(f"Running GridSearchCV for {name}...")
    grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=kf, scoring='neg_mean_squared_error', n_jobs=-1)
    grid_search.fit(train_final, log_target)

    # In-sample error of the refit best estimator, in the original price units.
    y_pred = np.exp(grid_search.predict(train_final))
    y_true = np.exp(log_target)

    rmse_score = np.sqrt(mean_squared_error(y_true, y_pred))

    best_params = grid_search.best_params_
    # best_score_ is the mean *negative MSE* on the log target (see `scoring`);
    # convert it to an RMSE so the printed label is accurate.
    best_score = grid_search.best_score_
    best_cv_rmse = np.sqrt(-best_score)

    print(f"Best params for {name}: {best_params}")
    print(f"Best CV RMSE (log target) for {name}: {best_cv_rmse}")
    print(f"RMSE score on training data for {name}: {rmse_score}\n")

    # Cross-validate the tuned estimator. The original passed the untuned
    # default `model`, so the stored scores ignored the grid search entirely.
    rmse_scores = np.exp(np.sqrt(-cross_val_score(grid_search.best_estimator_, train_final, log_target, scoring='neg_mean_squared_error', cv=kf)))
    results[name] = rmse_scores

In [None]:
""" ^^^
[RESULTS; cv = 3]: 
Best params for GradientBoosting: {'learning_rate': 0.05, 'max_depth': 4, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 300, 'subsample': 1}

Best RMSE score for GradientBoosting: -0.014732626317137123
RMSE score on training data for GradientBoosting: 11524.806061580774

Best RMSE score for ExtraTrees: -0.021956603868079138
RMSE score on training data for ExtraTrees: 1.1290949852996799e-08
"""

""" ^^^
[RESULTS; cv = 5]:
Best params for ExtraTrees: {'max_depth': None, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 300}

Best RMSE score for GradientBoosting: -0.014181648421777618
RMSE score on training data for GradientBoosting: 12051.717705119197

Best RMSE score for ExtraTrees: -0.021397126017031944
RMSE score on training data for ExtraTrees: 1.1290949852996799e-08
"""

# Print a four-line summary of the cross-validated scores for each model.
for model_name, fold_scores in results.items():
    summary_lines = [
        f"Model: {model_name}",
        f"RMSE scores: {fold_scores}",
        f"Mean RMSE: {np.mean(fold_scores)}",
        f"Standard Deviation of RMSE: {np.std(fold_scores)}",
    ]
    print("\n".join(summary_lines))

In [None]:
""" ^^^
[RESULTS; cv = 5]:
Model: GradientBoosting
RMSE scores: [1.12752062 1.15025893 1.14513066 1.12399715 1.13833037]
Mean RMSE: 1.1370475473530295
Standard Deviation of RMSE: 0.010025948679718477
"""

""" ^^^
Model: ExtraTrees
RMSE scores: [1.12584935 1.1645177  1.16647413 1.12917204 1.14595987]
Mean RMSE: 1.146394619350057
Standard Deviation of RMSE: 0.017032598649793843
"""

from sklearn.ensemble import GradientBoostingRegressor, ExtraTreesRegressor

# Best GradientBoosting parameters, as recorded from the grid-search output.
gb_params = {
    'learning_rate': 0.05,
    'max_depth': 4,
    'max_features': 'sqrt',
    'min_samples_leaf': 1,
    'min_samples_split': 5,
    'n_estimators': 300,
    'subsample': 1
}

# Best ExtraTrees parameters, as recorded from the grid-search output.
et_params = {
    'max_depth': None,
    'max_features': 'sqrt',
    'min_samples_leaf': 1,
    'min_samples_split': 2,
    'n_estimators': 300
}

# Both estimators draw random feature subsets ('max_features': 'sqrt'), so they
# are seeded (same seed as CatBoost's 'random_seed') to make the fitted models,
# and therefore the submission, reproducible.
best_gb_model = GradientBoostingRegressor(random_state=42, **gb_params)
best_et_model = ExtraTreesRegressor(random_state=42, **et_params)

best_gb_model.fit(train_final, log_target)
best_et_model.fit(train_final, log_target)

## Prediction(s) Submission(s)

In [None]:
# Weighted blend of the five fitted models. Each model predicts log prices,
# which are mapped back with np.exp before weighting; the weights sum to 1.
weighted_models = [
    (0.4, ms['catboost']),
    (0.2, ms['br']),
    (0.2, ms['lightgbm']),
    (0.1, best_gb_model),
    (0.1, best_et_model),
]

final_predictions = sum(
    weight * np.exp(fitted_model.predict(test_final))
    for weight, fitted_model in weighted_models
)

final_predictions

In [None]:
# Pair each test id with its blended prediction and write the Kaggle submission file.
submission = pd.DataFrame({'Id': test_ids, 'SalePrice': final_predictions})

submission.to_csv('./submission.csv', index=False, header=True)

submission

In [None]:
"""
Submission #1. Score: 0.12312
Submission #2. Score: 0.12813
Submission #3. Score: 0.13826
Submission #4. Score: 0.12272 
Submission #5. Score: 0.12154 (BEST SCORE; LEADERBOARD POSITION: 282)
"""