# Data Preprocessing

In [None]:
# !pip install pycaret
# !pip install catboost xgboost optuna

# !pip install --force-reinstall threadpoolctl 
# ^ This WORKED in fixing the "knn_impute()" - related error

In [None]:
import numpy as np # Linear algebra
import pandas as pd # DataFrame manipulation
pd.options.display.max_columns = 500 
pd.options.display.max_rows = 10

import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style('darkgrid') # This makes our plots pretty, but it's completely unnecessary 

from sklearn.neighbors import KNeighborsRegressor # KNeighborsRegressor is used for numeric imputation purposes
import scipy.stats
from sklearn.preprocessing import StandardScaler # StandardScaler is used for numeric scaling purposes
from pycaret.regression import setup, compare_models # Pycaret is a lovely "low/no-code" tool to aid us in our model selection(s)
from sklearn.model_selection import KFold, cross_val_score # cross_val_score measures a model's performance

from catboost import CatBoostRegressor
from sklearn.linear_model import BayesianRidge, HuberRegressor
from lightgbm import LGBMRegressor
from sklearn.ensemble import GradientBoostingRegressor, ExtraTreesRegressor
from xgboost import XGBRegressor

import optuna # Optuna is a quicker alternative to GridSearchCV; an impatient person's dream!

In [None]:
# Directory holding train.csv, test.csv and sample_submission.csv. Edit this single
# constant when running the notebook on another machine instead of touching every
# read_csv call below.
DATA_DIR = '/Users/joshuaconde/GitHub-REPOSITORIES/Kaggle-HousePricePredictions/HousePricePredictions'

train0 = pd.read_csv(f'{DATA_DIR}/train.csv')
test0 = pd.read_csv(f'{DATA_DIR}/test.csv')

sample_submission = pd.read_csv(f'{DATA_DIR}/sample_submission.csv')

In [None]:
# train0

In [None]:
# I understand test0's not having the target variable as its final column, but why is there (in addition to this) one less row?

# test0

In [None]:
# Due to SalePrice's continuous nature, we need to derive (from memory) some kind of a regression model

# sample_submission

In [None]:
###
# DATA PREPROCESSING (CLEANING) PIPELINE:
###
# NUMERIC IMPUTATION (By means of ".mean()" and/or the "KNN" strategy) 
# CATEGORICAL IMPUTATION (Ordinal(s) = "N"; Regular Categorical(s) = ".mode()")

# NUMERIC SCALING ("StandardScaler()")
# CATEGORICAL ENCODING ("One hot encoding")

# FEATURE TRANSFORMATION (".log1p()")
# TARGET TRANSFORMATION (".log()")
# ^ Don't forget about the finishing ".exp()"

# FEATURE SELECTION - There's no need to perform this step unless the column count exceeds ~1k
# FEATURE ENGINEERING - We'll perform this following our testing the performance of at least 1 model

## 1. DataFrame Combination

In [None]:
# During preprocessing the training and test rows are handled together so that both receive
# identical treatment; they are separated once again before any model is ".fit()"ted.

# Set aside the columns that must not travel through the feature pipeline
target = train0['SalePrice']
test_ids = test0['Id']  # re-attached to the predictions when the submission is written

# Feature-only frames: the training side loses its row identifier and the target,
# the test side only its row identifier
train1 = train0.drop(columns=['Id', 'SalePrice'])
test1 = test0.drop(columns='Id')

# "data0" is the COMBINED DataFrame: training rows first, test rows after, renumbered from 0
data0 = pd.concat([train1, test1], axis=0, ignore_index=True)

data0

## 2. Data Cleaning

In [None]:
# We'll, for goals related to consistency, re-initialize a copy of our data set every time the 
# number in bold is incremented

data1 = data0.copy()

### 2A. Ensure Proper Data Types

In [None]:
data1['MSSubClass'] = data1['MSSubClass'].astype(str) # This feature is totally nominal

### 2B. Fill Ordinal AND Regular Categorical Missing Values

In [None]:
# Columns the notebook treats as ordinal: every gap is filled with the constant placeholder "N"
constant_fill_columns = [
    'Alley',
    'BsmtQual',
    'BsmtCond',
    'BsmtExposure',
    'BsmtFinType1',
    'BsmtFinType2',
    'FireplaceQu',
    'GarageType',
    'GarageFinish',
    'GarageQual',
    'GarageCond',
    'PoolQC',
    'Fence',
    'MiscFeature',
]
for column in constant_fill_columns:
    data1[column] = data1[column].fillna("N")

# Regular categoricals: every gap is filled with the column's most frequent value
mode_fill_columns = [
    'MSZoning',
    'Utilities',
    'Exterior1st',
    'Exterior2nd',
    'MasVnrType',
    'Electrical',
    'KitchenQual',
    'Functional',
    'SaleType',
]
for column in mode_fill_columns:
    most_common = data1[column].mode()[0]
    data1[column] = data1[column].fillna(most_common)

### 2C. Fill Numeric Missing Values

In [None]:
def knn_impute(df, na_target):
    """Fill the missing values of one numeric column with k-nearest-neighbours predictions.

    A KNeighborsRegressor (default settings) is fitted on the rows where ``na_target`` is
    present, using as predictors every numeric column of ``df`` that has no missing values
    at all. Its predictions are then written into the rows where ``na_target`` is missing.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to impute. It is not modified; the work is done on a copy.
    na_target : str
        Name of the numeric column whose missing values should be filled.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` in which ``na_target`` has no missing values left.
    """
    df = df.copy()

    missing = df[na_target].isna()
    if not missing.any():
        # Nothing to impute. Without this guard the regressor would be asked to predict on
        # an empty frame (which raises), and ``na_target`` would be among its own predictors.
        return df

    numeric_df = df.select_dtypes(np.number)
    # Predictors: numeric columns that are complete (this automatically excludes na_target)
    non_na_columns = numeric_df.loc[:, numeric_df.notna().all()].columns

    # Rows with a known target train the model ...
    X_train = numeric_df.loc[~missing, non_na_columns]
    y_train = numeric_df.loc[~missing, na_target]
    # ... rows with a missing target are the ones to predict
    X_test = numeric_df.loc[missing, non_na_columns]

    knn = KNeighborsRegressor()
    knn.fit(X_train, y_train)

    df.loc[missing, na_target] = knn.predict(X_test)

    return df

In [None]:
# Numeric columns to run through knn_impute, one after another. Because knn_impute picks its
# predictors from the columns that are complete at call time, a column filled earlier in
# this list becomes available as a predictor for the ones that follow.
knn_fill_columns = [
    'LotFrontage',
    'MasVnrArea',
    'BsmtFinSF1',
    'BsmtFinSF2',
    'BsmtUnfSF',
    'TotalBsmtSF',
    'BsmtFullBath',
    'BsmtHalfBath',
    'GarageYrBlt',
    'GarageCars',
    'GarageArea',
]
for column in knn_fill_columns:
    data1 = knn_impute(data1, column)

In [None]:
data1.isna().sum()

## 3. Feature Engineering

In [None]:
data2 = data1.copy()

# Every engineered feature below is derived from data2 itself, so this stage depends on a
# single input frame.

# Above-grade living area divided by a room count (rooms above grade + baths + kitchens)
rooms_counted = (data2["TotRmsAbvGrd"] +
                 data2["FullBath"] +
                 data2["HalfBath"] +
                 data2["KitchenAbvGr"])
data2["SqFtPerRoom"] = data2["GrLivArea"] / rooms_counted

# Overall quality and overall condition ratings added together
data2['Total_Home_Quality'] = data2['OverallQual'] + data2['OverallCond']

# Bathroom count in which each half bath (above grade or basement) counts as 0.5
data2['Total_Bathrooms'] = (data2['FullBath'] + (0.5 * data2['HalfBath']) +
                            data2['BsmtFullBath'] + (0.5 * data2['BsmtHalfBath']))

# First-floor plus second-floor area
data2["HighQualSF"] = data2["1stFlrSF"] + data2["2ndFlrSF"]

data2.columns

## 4. Feature Transformations -> .log1p()

### 4A. Log Transform(ation) for Skewed Features

In [None]:
data3 = data2.copy()

# One row per numeric column: its sample skewness, the magnitude of that skewness, and
# whether the magnitude reaches 0.5 (the cut-off used here to call a feature "skewed")
numeric_features = data3.select_dtypes(np.number).columns
skewed_df = pd.DataFrame({'Feature': numeric_features})
skewed_df['Skew'] = [scipy.stats.skew(data3[feature]) for feature in skewed_df['Feature']]
skewed_df['Absolute Skew'] = skewed_df['Skew'].abs()
skewed_df['Skewed'] = skewed_df['Absolute Skew'] >= 0.5

skewed_df

In [None]:
# Replace every feature flagged as skewed by log(1 + x)
skewed_features = skewed_df.loc[skewed_df['Skewed'], 'Feature']
for column in skewed_features:
    data3[column] = np.log1p(data3[column])

### 4B. Cosine Transform(ation) for Cyclical Features

In [None]:
# Encode the month of sale on a 12-month cycle. The result must be stored on data3: the
# encoding stage copies data3, so a value written to data2 would never reach the models.
# The month is read from data2, which still holds the untransformed MoSold; this keeps the
# encoding independent of the log transform above and makes the cell safe to re-run.
data3['MoSold'] = -np.cos((2 * np.pi / 12) * data2['MoSold'])

data3['MoSold']

## 5. Categorical Encoding

In [None]:
# One-hot encode the categorical columns of data3; get_dummies returns a new frame,
# so data3 itself is left untouched
data4 = pd.get_dummies(data3)

data4

## 6. Numeric Scaling

In [None]:
# Standardise every column of the encoded frame (dummy columns included). The scaler's
# statistics are computed over all rows of data4; index and column labels are carried over.
scaler = StandardScaler()

data5 = pd.DataFrame(
    scaler.fit_transform(data4),
    index=data4.index,
    columns=data4.columns,
)

data5

## 7. Target Transformation -> .log()

In [None]:
# Side-by-side view of the target before and after the log transform, each with a fitted
# normal curve for reference. sns.distplot is deprecated, so the same picture is drawn with
# sns.histplot (histogram + KDE on a density scale) and an explicit normal fit on top.
fig, axes = plt.subplots(1, 2, figsize=(20, 10))

for ax, values, title in (
    (axes[0], target, "Without Log Transform"),
    (axes[1], np.log(target), "With Log Transform"),
):
    sns.histplot(values, kde=True, stat="density", ax=ax)

    # Normal distribution with the maximum-likelihood mean/std of the plotted values
    mu, sigma = scipy.stats.norm.fit(values)
    grid = np.linspace(values.min(), values.max(), 200)
    ax.plot(grid, scipy.stats.norm.pdf(grid, mu, sigma), color="black")

    ax.set_title(title)

axes[1].set_xlabel("Log SalePrice")

plt.show()

In [None]:
log_target = np.log(target)

## 8. Data Splitting

In [None]:
# Undo the train/test combination before any model is ".fit()"ted: rows up to and including
# the last training index label form train_final, everything after it forms test_final
# (renumbered from 0).
last_train_label = train0.index.max()

train_final = data5.loc[:last_train_label].copy()
test_final = data5.loc[last_train_label + 1:].reset_index(drop=True).copy()

In [None]:
train_final

In [None]:
test_final

# Pycaret's Model Selection

In [None]:
# _ = setup(data=pd.concat([train_final, log_target], axis=1), target='SalePrice')

In [None]:
# compare_models()

In [None]:
""" 
RESULTS:
1. catboost regressor (SAME)
2. gradient boosting regressor
3. light gradient boosting machine (SAME)
4. bayesian ridge (SAME)
5. extra trees regressor
-
6. extreme gradient boosting
7. random forest regressor
"""

## DELETED: Baseline Model

In [None]:
# baseline_model = CatBoostRegressor(verbose=0)

# baseline_model.fit(train_final, log_target)

## DELETED: Evaluate Baseline Model

In [None]:
# NOTE: this section is marked "DELETED", but `kf` is still required — the
# "Evaluate Models" cell passes it to cross_val_score as `cv`.
kf = KFold(n_splits=10) # -> 9/10 = Train(ing); 1/10 = Test(ing)

# results = cross_val_score(baseline_model, train_final, log_target, scoring="neg_mean_squared_error", cv=kf) # estimator, X, target

# -results

In [None]:
# np.exp(np.sqrt(np.mean(-results))) # the ".exp()" is here because the target was transformed with ".log()" earlier

In [None]:
# target.describe()

In [None]:
# log_target.describe()

## Bayesian Ridge Hyper-parameter Optimization 

In [None]:
# Disabled Optuna objective for BayesianRidge, kept as a string literal for reference.
# `compute_score` is sampled from real booleans: the strings 'True' and 'False' are both
# truthy, so with string choices the option could never actually be switched off.
"""
def br_objective(trial):
    n_iter = trial.suggest_int('n_iter', 50, 600)
    tol = trial.suggest_loguniform('tol', 1e-8, 10.0)
    alpha_1 = trial.suggest_loguniform('alpha_1', 1e-8, 10.0)
    alpha_2 = trial.suggest_loguniform('alpha_2', 1e-8, 10.0)
    lambda_1 = trial.suggest_loguniform('lambda_1', 1e-8, 10.0)
    lambda_2 = trial.suggest_loguniform('lambda_2', 1e-8, 10.0)
    alpha_init = trial.suggest_loguniform('alpha_init', 1e-8, 10.0)
    lambda_init = trial.suggest_loguniform('lambda_init', 1e-8, 10.0)
    compute_score = trial.suggest_categorical('compute_score', [True, False])
    fit_intercept = trial.suggest_categorical('fit_intercept', [True, False])
    copy_X = trial.suggest_categorical('copy_X', [True, False])
    verbose = trial.suggest_categorical('verbose', [True, False])
    normalize = trial.suggest_categorical('normalize', [True, False])

    model = BayesianRidge(
        n_iter=n_iter,
        tol=tol,
        alpha_1=alpha_1,
        alpha_2=alpha_2,
        lambda_1=lambda_1,
        lambda_2=lambda_2,
        alpha_init=alpha_init,
        lambda_init=lambda_init,
        compute_score=compute_score,
        fit_intercept=fit_intercept,
        copy_X=copy_X,
        verbose=verbose,
        normalize=normalize
    )

    model.fit(train_final, log_target)

    cv_scores = np.exp(np.sqrt(-cross_val_score(model, train_final, log_target, scoring='neg_mean_squared_error', cv=kf)))
    # the ".exp()" is here because the target was transformed with ".log()" earlier

    return np.mean(cv_scores)
"""

In [None]:
#study = optuna.create_study(direction='minimize')
#study.optimize(br_objective, n_trials=100)

In [None]:
#study.best_params

## Gradient Boosting Regressor Hyper-parameter Optimization

In [None]:
"""
def gbr_objective(trial):
    loss = trial.suggest_categorical('loss', ['squared_error', 'absolute_error', 'huber', 'quantile'])
    learning_rate = trial.suggest_loguniform('learning_rate', 0.01, 1.0)
    n_estimators = trial.suggest_int('n_estimators', 50, 600)
    subsample = trial.suggest_uniform('subsample', 0.1, 1.0)
    criterion = trial.suggest_categorical('criterion', ['friedman_mse', 'squared_error'])
    min_samples_split = trial.suggest_int('min_samples_split', 2, 20)
    min_samples_leaf = trial.suggest_int('min_samples_leaf', 1, 20)
    min_weight_fraction_leaf = trial.suggest_uniform('min_weight_fraction_leaf', 0.0, 0.5)
    max_depth = trial.suggest_int('max_depth', 3, 10)
    min_impurity_decrease = trial.suggest_uniform('min_impurity_decrease', 0.0, 0.5)
    init = trial.suggest_categorical('init', [None, 'zero'])
    random_state = trial.suggest_categorical('random_state', [None, 42, 2022])
    max_features = trial.suggest_categorical('max_features', [None, 'sqrt', 'log2'])
    alpha = trial.suggest_uniform('alpha', 0.0, 0.99)
    verbose = trial.suggest_categorical('verbose', [0, 1, 2])
    max_leaf_nodes = trial.suggest_int('max_leaf_nodes', 10, 100)
    warm_start = trial.suggest_categorical('warm_start', [True, False])
    validation_fraction = trial.suggest_uniform('validation_fraction', 0.1, 0.5)
    n_iter_no_change = trial.suggest_int('n_iter_no_change', 5, 20)
    tol = trial.suggest_loguniform('tol', 1e-5, 1e-2)
    ccp_alpha = trial.suggest_uniform('ccp_alpha', 0.0, 0.5)

    model = GradientBoostingRegressor(
        loss=loss,
        learning_rate=learning_rate,
        n_estimators=n_estimators,
        subsample=subsample,
        criterion=criterion,
        min_samples_split=min_samples_split,
        min_samples_leaf=min_samples_leaf,
        min_weight_fraction_leaf=min_weight_fraction_leaf,
        max_depth=max_depth,
        min_impurity_decrease=min_impurity_decrease,
        init=init,
        random_state=random_state,
        max_features=max_features,
        alpha=alpha,
        verbose=verbose,
        max_leaf_nodes=max_leaf_nodes,
        warm_start=warm_start,
        validation_fraction=validation_fraction,
        n_iter_no_change=n_iter_no_change,
        tol=tol,
        ccp_alpha=ccp_alpha
    )

    model.fit(train_final, log_target)

    cv_scores = np.exp(np.sqrt(-cross_val_score(model, train_final, log_target, scoring='neg_mean_squared_error', cv=kf)))

    return np.mean(cv_scores)
"""

In [None]:
# study = optuna.create_study(direction='minimize')
# study.optimize(gbr_objective, n_trials=100)

In [None]:
# study.best_params

## Extra Trees Regressor Hyper-parameter Optimization

In [None]:
"""
def et_objective(trial):
    n_estimators = trial.suggest_int('n_estimators', 50, 600)
    criterion = trial.suggest_categorical('criterion', ['poisson', 'absolute_error', 'squared_error', 'friedman_mse'])
    max_depth = trial.suggest_int('max_depth', 3, 20)
    min_samples_split = trial.suggest_int('min_samples_split', 2, 20)
    min_samples_leaf = trial.suggest_int('min_samples_leaf', 1, 20)
    min_weight_fraction_leaf = trial.suggest_uniform('min_weight_fraction_leaf', 0.0, 0.5)
    max_features = trial.suggest_categorical('max_features', [None, 'sqrt', 'log2'])
    max_leaf_nodes = trial.suggest_int('max_leaf_nodes', 10, 100)
    min_impurity_decrease = trial.suggest_uniform('min_impurity_decrease', 0.0, 0.5)
    bootstrap = trial.suggest_categorical('bootstrap', [True, False])
    oob_score = trial.suggest_categorical('oob_score', [True, False]) if bootstrap else False
    n_jobs = trial.suggest_categorical('n_jobs', [-1, 1, 2, 4])
    random_state = trial.suggest_categorical('random_state', [None, 42, 2022])
    verbose = trial.suggest_categorical('verbose', [0, 1, 2])
    warm_start = trial.suggest_categorical('warm_start', [True, False])
    ccp_alpha = trial.suggest_uniform('ccp_alpha', 0.0, 0.5)
    max_samples = trial.suggest_uniform('max_samples', 0.1, 1.0) if bootstrap else None
    # monotonic_cst = trial.suggest_categorical('monotonic_cst', [None, 'increasing', 'decreasing'])

    model = ExtraTreesRegressor(
        n_estimators=n_estimators,
        criterion=criterion,
        max_depth=max_depth,
        min_samples_split=min_samples_split,
        min_samples_leaf=min_samples_leaf,
        min_weight_fraction_leaf=min_weight_fraction_leaf,
        max_features=max_features,
        max_leaf_nodes=max_leaf_nodes,
        min_impurity_decrease=min_impurity_decrease,
        bootstrap=bootstrap,
        oob_score=oob_score,
        n_jobs=n_jobs,
        random_state=random_state,
        verbose=verbose,
        warm_start=warm_start,
        ccp_alpha=ccp_alpha,
        max_samples=max_samples,
        # monotonic_cst=monotonic_cst
    )

    model.fit(train_final, log_target)

    cv_scores = np.exp(np.sqrt(-cross_val_score(model, train_final, log_target, scoring='neg_mean_squared_error', cv=kf)))

    return np.mean(cv_scores)
"""

In [None]:
# study = optuna.create_study(direction='minimize')
# study.optimize(et_objective, n_trials=100)

In [None]:
# study.best_params

## Bagging Ensemble

In [None]:
# Optuna allegedly defines for us the ideal(?) hyper-parameters, shown below
# Each dict is unpacked as keyword arguments into the matching estimator when the `models`
# dict is built. Only the BayesianRidge, GradientBoosting and ExtraTrees studies appear
# (disabled) in this notebook; the origin of the CatBoost and LightGBM values is not shown.

# Keyword arguments for CatBoostRegressor
catboost_params = {
    'iterations': 6000,
    'learning_rate': 0.005,
    'depth': 4,
    'l2_leaf_reg': 1,
    'eval_metric':'RMSE',
    'early_stopping_rounds': 200,
    'random_seed': 42
}

# Keyword arguments for BayesianRidge
# NOTE(review): newer scikit-learn releases rename `n_iter` to `max_iter` — confirm against
# the installed version before upgrading.
br_params = {
    'n_iter': 304,
    'tol': 0.16864712769300896,
    'alpha_1': 5.589616542154059e-07,
    'alpha_2': 9.799343618469923,
    'lambda_1': 1.7735725582463822,
    'lambda_2': 3.616928181181732e-06
}

# Keyword arguments for LGBMRegressor
lightgbm_params = {
    'num_leaves': 39,
    'max_depth': 2,
    'learning_rate': 0.13705339989856127,
    'n_estimators': 273,
}

# Keyword arguments for GradientBoostingRegressor
# 'verbose': 2 makes every fit (including each cross-validation fold) print progress output.
gbr_params = {
    'loss': 'absolute_error',
    'learning_rate': 0.11054616145386358,
    'n_estimators': 266,
    'subsample': 0.5902941839375372,
    'criterion': 'friedman_mse',
    'min_samples_split': 20,
    'min_samples_leaf': 10,
    'min_weight_fraction_leaf': 0.0175323040448155,
    'max_depth': 10,
    'min_impurity_decrease': 0.06678068552552889,
    'init': 'zero',
    'random_state': 2022,
    'max_features': 'sqrt',
    'alpha': 0.23585140032470903,
    'verbose': 2,
    'max_leaf_nodes': 15,
    'warm_start': False,
    'validation_fraction': 0.21488662066191244,
    'n_iter_no_change': 8,
    'tol': 0.0005401308587534491,
    'ccp_alpha': 0.00151895861887127
}

# Keyword arguments for ExtraTreesRegressor
et_params = {
    'n_estimators': 531,
    'criterion': 'friedman_mse',
    'max_depth': 20,
    'min_samples_split': 3,
    'min_samples_leaf': 20,
    'min_weight_fraction_leaf': 0.09315416304656772,
    'max_features': 'sqrt',
    'max_leaf_nodes': 88,
    'min_impurity_decrease': 0.41028883629550056,
    'bootstrap': True,
    'oob_score': True,
    'n_jobs': -1,
    'random_state': 2022,
    'verbose': 1,
    'warm_start': True,
    'ccp_alpha': 0.0023371871861925567,
    'max_samples': 0.13295442632197021
}

In [None]:
# Unfitted ensemble members, keyed by the short names the later cells use to look them up
models = dict(
    catboost=CatBoostRegressor(**catboost_params, verbose=0),
    br=BayesianRidge(**br_params),
    lightgbm=LGBMRegressor(**lightgbm_params),
    gbr=GradientBoostingRegressor(**gbr_params),
    et=ExtraTreesRegressor(**et_params),
)

In [None]:
# Fit every ensemble member on the full training set against the log-transformed target;
# these fitted instances are the ones that predict on the test set later
for name, model in models.items():
    model.fit(train_final, log_target)
    print(f"{name} trained")

## Evaluate Models

In [None]:
# Per-fold score of each model over the folds of `kf`: cross_val_score reports the negated
# MSE on the log target, which is turned into exp(RMSE)
results = {
    name: np.exp(np.sqrt(-cross_val_score(
        model, train_final, log_target, scoring='neg_mean_squared_error', cv=kf
    )))
    for name, model in models.items()
}

results

In [None]:
# For each model: a separator and its name, then mean and standard deviation of its fold scores
for name, result in results.items():
    print(f"----------\n{name}")
    print(result.mean())
    print(result.std())

## Combine Predictions

In [None]:
# Equal-weight blend of the five fitted models. Each model was trained on log(SalePrice),
# so its prediction goes through exp() before the weighted prices are added together.
blend_weights = {'catboost': 0.2, 'br': 0.2, 'lightgbm': 0.2, 'gbr': 0.2, 'et': 0.2}

final_predictions = sum(
    weight * np.exp(models[member].predict(test_final))
    for member, weight in blend_weights.items()
)

final_predictions

## Prediction(s) Submission(s)

In [None]:
# Two-column submission: the test-set ids saved before preprocessing, next to the blended prices
predicted_prices = pd.Series(final_predictions, name='SalePrice')
submission = pd.concat([test_ids, predicted_prices], axis='columns')

submission.to_csv('./submission.csv', index=False, header=True)

submission

In [None]:
# 1. Score: 0.12312
# 2. Score: 0.12813
# 3. Score: 0.13826