# Imports and Exploration

In [43]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler

import optuna
import xgboost as xgb
from catboost import CatBoostRegressor
import lightgbm as lgb

import pickle

import warnings
warnings.filterwarnings('ignore')

In [44]:
data = pd.read_csv("../../extracts/raw/train.csv").set_index('Id')

selected_columns = ['MSSubClass', 'OverallQual', 'GrLivArea', 'GarageCars', 
                    'BldgType', 'YearBuilt', 'TotalBsmtSF', 'Fireplaces',
                    'LotArea', 'PoolQC', 'SalePrice']

data = data[selected_columns]
data.head()

Unnamed: 0_level_0,MSSubClass,OverallQual,GrLivArea,GarageCars,BldgType,YearBuilt,TotalBsmtSF,Fireplaces,LotArea,PoolQC,SalePrice
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,60,7,1710,2,1Fam,2003,856,0,8450,,208500
2,20,6,1262,2,1Fam,1976,1262,1,9600,,181500
3,60,7,1786,2,1Fam,2001,920,1,11250,,223500
4,70,7,1717,3,1Fam,1915,756,1,9550,,140000
5,60,8,2198,3,1Fam,2000,1145,1,14260,,250000


In [45]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1460 entries, 1 to 1460
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   MSSubClass   1460 non-null   int64 
 1   OverallQual  1460 non-null   int64 
 2   GrLivArea    1460 non-null   int64 
 3   GarageCars   1460 non-null   int64 
 4   BldgType     1460 non-null   object
 5   YearBuilt    1460 non-null   int64 
 6   TotalBsmtSF  1460 non-null   int64 
 7   Fireplaces   1460 non-null   int64 
 8   LotArea      1460 non-null   int64 
 9   PoolQC       7 non-null      object
 10  SalePrice    1460 non-null   int64 
dtypes: int64(9), object(2)
memory usage: 136.9+ KB


## Handling NaN values

In [46]:
data.isna().sum()

MSSubClass        0
OverallQual       0
GrLivArea         0
GarageCars        0
BldgType          0
YearBuilt         0
TotalBsmtSF       0
Fireplaces        0
LotArea           0
PoolQC         1453
SalePrice         0
dtype: int64

In [47]:
data['PoolQC'].fillna('None', inplace=True)
data.isna().sum()

MSSubClass     0
OverallQual    0
GrLivArea      0
GarageCars     0
BldgType       0
YearBuilt      0
TotalBsmtSF    0
Fireplaces     0
LotArea        0
PoolQC         0
SalePrice      0
dtype: int64

In [48]:
data['BldgType'].value_counts()

BldgType
1Fam      1220
TwnhsE     114
Duplex      52
Twnhs       43
2fmCon      31
Name: count, dtype: int64

In [49]:
data['PoolQC'].value_counts()

PoolQC
None    1453
Gd         3
Ex         2
Fa         2
Name: count, dtype: int64

In [50]:
print("Unique Values....")
for col in data.columns:
    print(f'{col} ====> {data[col].nunique()}')

Unique Values....
MSSubClass ====> 15
OverallQual ====> 10
GrLivArea ====> 861
GarageCars ====> 5
BldgType ====> 5
YearBuilt ====> 112
TotalBsmtSF ====> 721
Fireplaces ====> 4
LotArea ====> 1073
PoolQC ====> 4
SalePrice ====> 663


## Train Test Split

In [51]:
X_train, X_test, y_train, y_test = train_test_split(data.drop('SalePrice', axis=1), data['SalePrice'],
                                                    test_size=0.2)

## Label Encoding and Standard Scaling

In [62]:
numerical_continous_cols = ['GrLivArea', 'TotalBsmtSF', 'LotArea']
object_cols = ['BldgType', 'PoolQC']

In [63]:
for col in object_cols:
    le = LabelEncoder()
    X_train[col] = le.fit_transform(X_train[col])
    X_test[col] = le.transform(X_test[col])
    print(f"{col} ====> {le.classes_}")

BldgType ====> ['1Fam' '2fmCon' 'Duplex' 'Twnhs' 'TwnhsE']
PoolQC ====> ['Ex' 'Fa' 'Gd' 'None']


In [13]:
scaler = StandardScaler()
X_train[numerical_continous_cols] = scaler.fit_transform(X_train[numerical_continous_cols])
X_test[numerical_continous_cols] = scaler.fit_transform(X_test[numerical_continous_cols])

print(scaler.mean_, scaler.var_)

[1488.55821918 1043.85273973 9769.50342466] [  280607.1438708    183964.33790345 29927006.05820744]


In [14]:
X_train.head()

Unnamed: 0_level_0,MSSubClass,OverallQual,GrLivArea,GarageCars,BldgType,YearBuilt,TotalBsmtSF,Fireplaces,LotArea,PoolQC
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1395,120,7,-0.04235,3,4,2006,0.669535,1,-0.616145,3
741,70,5,-0.172127,2,0,1910,-1.807398,0,-0.102125,3
74,20,5,-0.832467,2,0,1954,0.057106,0,-0.046605,3
972,160,7,0.198121,2,3,2003,-0.673271,0,-0.780576,3
222,60,6,1.421466,2,0,2002,-0.115281,1,-0.243885,3


In [15]:
import pandas as pd
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.svm import SVR
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
import xgboost as xgb
import lightgbm as lgb
from catboost import CatBoostRegressor
import pickle


models = {
    'Linear Regression': LinearRegression(),
    'Ridge Regression': Ridge(),
    'Lasso Regression': Lasso(),
    'Random Forest': RandomForestRegressor(),
    'Gradient Boosting': GradientBoostingRegressor(),
    'Support Vector Regression': SVR(),
    'XGBoost': xgb.XGBRegressor(),
    'LightGBM': lgb.LGBMRegressor(verbose=0),
    'CatBoost': CatBoostRegressor(silent=True),
}

results_list = []

def adjusted_r2(r2, n, p):
    return 1 - (1 - r2) * (n - 1) / (n - p - 1)

for model_name, model in models.items():
    model.fit(X_train, y_train)

    y_pred = model.predict(X_test)

    r2 = r2_score(y_test, y_pred)
    mse = mean_squared_error(y_test, y_pred)
    rmse = mean_squared_error(y_test, y_pred, squared=False)  
    mae = mean_absolute_error(y_test, y_pred)
    n = len(y_test)
    p = X_test.shape[1]  

    r2_adj = adjusted_r2(r2, n, p)

    results_list.append({
        'Model': model_name,
        'R^2': r2,
        'Adjusted R^2': r2_adj,
        'MSE': mse,
        'RMSE': rmse,
        'MAE': mae
    })

results_df = pd.DataFrame(results_list)

results_df

Unnamed: 0,Model,R^2,Adjusted R^2,MSE,RMSE,MAE
0,Linear Regression,0.788356,0.780825,1462392000.0,38241.235052,23890.628136
1,Ridge Regression,0.788384,0.780853,1462201000.0,38238.732271,23887.399496
2,Lasso Regression,0.788355,0.780823,1462402000.0,38241.370064,23890.489635
3,Random Forest,0.803864,0.796884,1355238000.0,36813.556524,20371.042286
4,Gradient Boosting,0.809408,0.802625,1316933000.0,36289.568941,19588.43965
5,Support Vector Regression,-0.049313,-0.086655,7250430000.0,85149.457142,59710.532172
6,XGBoost,0.796676,0.789441,1404904000.0,37482.043774,22232.03731
7,LightGBM,0.825122,0.818898,1208356000.0,34761.415094,20266.311419
8,CatBoost,0.842323,0.836712,1089497000.0,33007.523292,18785.003922


## Optuna for CatBoost

In [18]:
import optuna
from catboost import CatBoostRegressor
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
import numpy as np

def adjusted_r2(r2, n, p):
    return 1 - (1 - r2) * (n - 1) / (n - p - 1)

def objective(trial):
    params = {
        'iterations': trial.suggest_int('iterations', 100, 1000),
        'depth': trial.suggest_int('depth', 4, 10),
        'learning_rate': trial.suggest_loguniform('learning_rate', 1e-3, 1e-1),
        'l2_leaf_reg': trial.suggest_loguniform('l2_leaf_reg', 1e-5, 1e-1),
        'bagging_temperature': trial.suggest_float('bagging_temperature', 0.0, 1.0),
        'random_strength': trial.suggest_float('random_strength', 0.0, 1.0),
        'border_count': trial.suggest_int('border_count', 1, 255),
        'verbose': False  
    }
    
    model = CatBoostRegressor(**params, eval_metric='RMSE')
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    r2 = r2_score(y_test, y_pred)
    mae = mean_absolute_error(y_test, y_pred)
    adj_r2 = adjusted_r2(r2, len(y_test), X_test.shape[1])

    print(f"Trial RMSE: {rmse}, R2: {r2}, Adjusted R2: {adj_r2}, MAE: {mae}")
    return rmse

study = optuna.create_study(direction='minimize')  # RMSE is minimized

study.optimize(objective, n_trials=200, show_progress_bar=True)

print("Best hyperparameters:", study.best_params)
print("Best RMSE:", study.best_value)

[I 2024-10-14 14:32:54,582] A new study created in memory with name: no-name-b0f589bc-7d87-43c5-bc52-06b7fbdc22fc


  0%|          | 0/200 [00:00<?, ?it/s]

Trial RMSE: 32757.99438460459, R2: 0.8446984498774963, Adjusted R2: 0.8391717043215353, MAE: 19381.083897053075
[I 2024-10-14 14:32:55,188] Trial 0 finished with value: 32757.99438460459 and parameters: {'iterations': 331, 'depth': 6, 'learning_rate': 0.013429113728766066, 'l2_leaf_reg': 1.7914614696009115e-05, 'bagging_temperature': 0.5288571853402873, 'random_strength': 0.651733958062307, 'border_count': 104}. Best is trial 0 with value: 32757.99438460459.
Trial RMSE: 55516.39537790925, R2: 0.5539498369151838, Adjusted R2: 0.5380761656310267, MAE: 40059.20480803664
[I 2024-10-14 14:32:56,509] Trial 1 finished with value: 55516.39537790925 and parameters: {'iterations': 229, 'depth': 9, 'learning_rate': 0.0024757712428806955, 'l2_leaf_reg': 0.0010048759393603062, 'bagging_temperature': 0.15540152205724633, 'random_strength': 0.4698400665364314, 'border_count': 137}. Best is trial 0 with value: 32757.99438460459.
Trial RMSE: 33122.77085173372, R2: 0.8412204734812555, Adjusted R2: 0.835

## Optuna for XGBoost

In [19]:
import optuna
import xgboost as xgb
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
import numpy as np

def adjusted_r2(r2, n, p):
    return 1 - (1 - r2) * (n - 1) / (n - p - 1)

def objective(trial):
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 100, 1000),
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'learning_rate': trial.suggest_loguniform('learning_rate', 1e-3, 1e-1),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'alpha': trial.suggest_loguniform('alpha', 1e-5, 1e-1),
        'lambda': trial.suggest_loguniform('lambda', 1e-5, 1e-1),
        'random_state': 42,
        'objective': 'reg:squarederror',
        'eval_metric': 'rmse'
    }

    model = xgb.XGBRegressor(**params)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    r2 = r2_score(y_test, y_pred)
    mae = mean_absolute_error(y_test, y_pred)
    adj_r2 = adjusted_r2(r2, len(y_test), X_test.shape[1])

    print(f"Trial RMSE: {rmse}, R2: {r2}, Adjusted R2: {adj_r2}, MAE: {mae}")
    return rmse

study = optuna.create_study(direction='minimize')  # RMSE is minimized

study.optimize(objective, n_trials=200, show_progress_bar=True)

print("Best hyperparameters:", study.best_params)
print("Best RMSE:", study.best_value)


[I 2024-10-14 14:37:33,529] A new study created in memory with name: no-name-d7a315b9-aecf-4a4f-a492-53f1199e7494


  0%|          | 0/200 [00:00<?, ?it/s]

Trial RMSE: 38173.09015772206, R2: 0.7891100645065308, Adjusted R2: 0.7816050845957312, MAE: 19965.152223351884
[I 2024-10-14 14:37:34,987] Trial 0 finished with value: 38173.09015772206 and parameters: {'n_estimators': 832, 'max_depth': 7, 'learning_rate': 0.007816826239373943, 'colsample_bytree': 0.6835141316263169, 'subsample': 0.6260139312494548, 'alpha': 1.242888663835556e-05, 'lambda': 0.008058438745840618}. Best is trial 0 with value: 38173.09015772206.
Trial RMSE: 37720.34383695465, R2: 0.7940828800201416, Adjusted R2: 0.786754868632958, MAE: 21027.030420590752
[I 2024-10-14 14:37:36,058] Trial 1 finished with value: 37720.34383695465 and parameters: {'n_estimators': 542, 'max_depth': 7, 'learning_rate': 0.004027653134979593, 'colsample_bytree': 0.6164343765214741, 'subsample': 0.9501905504366712, 'alpha': 0.00048706219927365104, 'lambda': 0.004451156742973349}. Best is trial 1 with value: 37720.34383695465.
Trial RMSE: 47077.240440725276, R2: 0.679252564907074, Adjusted R2: 0.

## Optuna for LightGBM

In [17]:
import optuna
import lightgbm as lgb
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error

def adjusted_r2(r2, n, p):
    return 1 - (1 - r2) * (n - p - 1) / (n - 1)

def objective(trial):
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 100, 1000),
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'learning_rate': trial.suggest_loguniform('learning_rate', 1e-3, 1e-1),
        'num_leaves': trial.suggest_int('num_leaves', 20, 150),
        'feature_fraction': trial.suggest_float('feature_fraction', 0.5, 1.0),
        'bagging_fraction': trial.suggest_float('bagging_fraction', 0.5, 1.0),
        'lambda_l1': trial.suggest_loguniform('lambda_l1', 1e-5, 1e-1),
        'lambda_l2': trial.suggest_loguniform('lambda_l2', 1e-5, 1e-1),
        'random_state': 42,  
        'metric': 'rmse',
        'objective': 'regression',
        'boosting_type': 'gbdt',
        'verbose': -1  
    }

    model = lgb.LGBMRegressor(**params)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    r2 = r2_score(y_test, y_pred)
    mae = mean_absolute_error(y_test, y_pred)
    adj_r2 = adjusted_r2(r2, len(y_test), X_test.shape[1])

    print(f"Trial RMSE: {rmse}, R2: {r2}, Adjusted R2: {adj_r2}, MAE: {mae}")
    return rmse

study = optuna.create_study(direction='minimize')  # RMSE is minimized
study.optimize(objective, n_trials=300, show_progress_bar=True)

print("Best hyperparameters:", study.best_params)
print("Best RMSE:", study.best_value)

[I 2024-10-14 14:31:13,085] A new study created in memory with name: no-name-fb434bcd-de3c-4bb1-85de-76e179a5143e


  0%|          | 0/300 [00:00<?, ?it/s]

Trial RMSE: 34334.394076679535, R2: 0.8293917778572745, Adjusted R2: 0.8352546033604609, MAE: 19475.558077716167
[I 2024-10-14 14:31:13,524] Trial 0 finished with value: 34334.394076679535 and parameters: {'n_estimators': 479, 'max_depth': 9, 'learning_rate': 0.020720648177621384, 'num_leaves': 27, 'feature_fraction': 0.5372923314845762, 'bagging_fraction': 0.6688363052404774, 'lambda_l1': 0.03687336490383464, 'lambda_l2': 0.0017088936770895621}. Best is trial 0 with value: 34334.394076679535.
Trial RMSE: 35241.46098189171, R2: 0.8202582395107073, Adjusted R2: 0.8264349323110266, MAE: 21542.99840922937
[I 2024-10-14 14:31:13,976] Trial 1 finished with value: 35241.46098189171 and parameters: {'n_estimators': 869, 'max_depth': 10, 'learning_rate': 0.07248352057779874, 'num_leaves': 37, 'feature_fraction': 0.9757103318346138, 'bagging_fraction': 0.6938071846583317, 'lambda_l1': 0.00038540702548113596, 'lambda_l2': 1.2998769460591099e-05}. Best is trial 0 with value: 34334.394076679535.
T

# Conclusion from Optuna

In [22]:
xgb_params = {'n_estimators': 240, 'max_depth': 5, 'learning_rate': 0.08030782869869577, 
              'colsample_bytree': 0.6148908352692782, 'subsample': 0.6711125958498432, 
              'alpha': 0.0012000540026731172, 'lambda': 2.3895400723627096e-05}


cat_params = {'iterations': 498, 'depth': 6, 'learning_rate': 0.03863680881477325, 
              'l2_leaf_reg': 0.007248118937221678, 'bagging_temperature': 0.18895058761442268, 
              'random_strength': 0.050188845180987596, 'border_count': 223, 'verbose':False}


lgb_params = {'n_estimators': 924, 'max_depth': 4, 'learning_rate': 0.09291685685329489, 
              'num_leaves': 78, 'feature_fraction': 0.959770133644152, 'bagging_fraction': 0.522901977120909, 
              'lambda_l1': 3.1601367255069994e-05, 'lambda_l2': 1.1861832195919464e-05, 'verbose':-1}

In [24]:
xgb_model = xgb.XGBRegressor(**xgb_params)
xgb_model.fit(X_train, y_train, verbose=0)
print("Done XGB!")
cat_model = CatBoostRegressor(**cat_params)
cat_model.fit(X_train, y_train, verbose=0)
print("Done CAT!")
lgb_model = lgb.LGBMRegressor(**lgb_params)
lgb_model.fit(X_train, y_train)
print("Done LGBM!")

Done XGB!
Done CAT!
Done LGBM!


In [28]:
print("****_Mean Absolute Errors after OPTUNA_****\n")

xgb_preds = xgb_model.predict(X_test)
cat_preds = cat_model.predict(X_test)
lgb_preds = lgb_model.predict(X_test)

print(f"XGBoost Optuna Model : {mean_absolute_error(y_test, xgb_preds)}")
print(f"CatBoost Optuna Model : {mean_absolute_error(y_test, cat_preds)}")
print(f"LightGBM Optuna Model : {mean_absolute_error(y_test, lgb_preds)}")

print("****_R2 Scores after OPTUNA_****\n")

print(f"XGBoost Optuna Model : {r2_score(y_test, xgb_preds)}")
print(f"CatBoost Optuna Model : {r2_score(y_test, cat_preds)}")
print(f"LightGBM Optuna Model : {r2_score(y_test, lgb_preds)}")

****_Mean Absolute Errors after OPTUNA_****

XGBoost Optuna Model : 20714.080840646406
CatBoost Optuna Model : 18458.503447713963
LightGBM Optuna Model : 20063.658566667524
****_R2 Scores after OPTUNA_****

XGBoost Optuna Model : 0.7955193519592285
CatBoost Optuna Model : 0.8734246962987209
LightGBM Optuna Model : 0.8525740325194261


In [30]:
# CatBoost Performs best. So would deploy that on Streamlit app.

with open('catboost_optuna.pkl', 'wb') as file:
    pickle.dump(cat_model, file)

print("Dumped Pickle!")

Dumped Pickle!
