<div style="text-align: center; background-color: #0A6EBD; font-family: 'Trebuchet MS', Arial, sans-serif; color: white; padding: 20px; font-size: 40px; font-weight: bold; border-radius: 0 0 0 0; box-shadow: 0px 6px 8px rgba(0, 0, 0, 0.2);">
  Kaggle Competitions: House Prices - Advanced Regression Techniques  @FIT-HCMUS, VNU-HCM 📌
</div>

<div style="text-align: center; background-color: #b1d1ff; font-family: 'Trebuchet MS', Arial, sans-serif; color: white; padding: 20px; font-size: 40px; font-weight: bold; border-radius: 0 0 0 0; box-shadow: 0px 6px 8px rgba(0, 0, 0, 0.2);">
  Stage 03 - Implement model 📌
</div>

In [3]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import random

import warnings
warnings.filterwarnings('ignore')

## Load the dataset

In [4]:
df_train = pd.read_csv('../data/processed/train_processed.csv')
df_train

Unnamed: 0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,60,RL,65.0,8450,Pave,No,Reg,Lvl,AllPub,Inside,...,0,No,No,No,0,2,2008,WD,Normal,208500
1,20,RL,80.0,9600,Pave,No,Reg,Lvl,AllPub,FR2,...,0,No,No,No,0,5,2007,WD,Normal,181500
2,60,RL,68.0,11250,Pave,No,IR1,Lvl,AllPub,Inside,...,0,No,No,No,0,9,2008,WD,Normal,223500
3,70,RL,60.0,9550,Pave,No,IR1,Lvl,AllPub,Corner,...,0,No,No,No,0,2,2006,WD,Abnorml,140000
4,60,RL,84.0,14260,Pave,No,IR1,Lvl,AllPub,FR2,...,0,No,No,No,0,12,2008,WD,Normal,250000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1400,20,FV,62.0,7500,Pave,Pave,Reg,Lvl,AllPub,Inside,...,0,No,No,No,0,10,2009,WD,Normal,185000
1401,60,RL,62.0,7917,Pave,No,Reg,Lvl,AllPub,Inside,...,0,No,No,No,0,8,2007,WD,Normal,175000
1402,20,RL,85.0,13175,Pave,No,Reg,Lvl,AllPub,Inside,...,0,No,MnPrv,No,0,2,2010,WD,Normal,210000
1403,70,RL,66.0,9042,Pave,No,Reg,Lvl,AllPub,Inside,...,0,No,GdPrv,Shed,2500,5,2010,WD,Normal,266500


In [5]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder


# Define transformers for numerical and categorical columns
numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])
    
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore', sparse = False))
])

In [6]:
# Update categorical and numerical columns
categorical_columns = df_train.select_dtypes(include=['object', 'category']).columns
numerical_columns = df_train.select_dtypes(include=['int64', 'float64']).columns

# Remove target variable from numerical columns
numerical_columns = numerical_columns.drop('SalePrice')

# Combine transformers using ColumnTransformer
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_columns),
        ('cat', categorical_transformer, categorical_columns)
    ],remainder = 'passthrough')

# Create a pipeline with the preprocessor
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor)])

# Apply the pipeline to your dataset
X = df_train.drop('SalePrice', axis=1)
y = np.log(df_train['SalePrice']) #normalize dependent variable 
X_preprocessed = pipeline.fit_transform(X)

In [7]:
# import sys
# !{sys.executable} -m pip install xgboost

In [24]:
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from xgboost import XGBRegressor
from sklearn.model_selection import GridSearchCV, KFold, cross_val_score
from sklearn.ensemble import BaggingRegressor
from sklearn.svm import SVR

# Split the data into training and testing sets
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X_preprocessed, y, test_size=0.2, random_state=42)

# Define the models
models = {
    #'LinearRegression': LinearRegression(),
    #'RandomForest': RandomForestRegressor(random_state=42),
    'XGBoost': XGBRegressor(random_state=42),
    #'BaggingRegression': BaggingRegressor(random_state=0),
    'GradientBoostingRegressor': GradientBoostingRegressor(random_state = 42)
}

# Define the hyperparameter grids for each model
param_grids = {
    #'LinearRegression': {},
    # 'RandomForest': {
    #     'n_estimators': [100, 200, 500],
    #     'max_depth': [None, 10, 30],
    #     'min_samples_split': [2, 5, 10],
    # },

    # Dạng Boosting state-of-the-art
    'XGBoost': {
        'n_estimators': [100, 300],
        'learning_rate': [0.1, 0.3],
        'max_depth': [6, 9],
    },
    
    # Bagging 0.36502402745818685 khá mid
    # 'BaggingRegression': {
    #     'n_estimators': [10, 20, 30],
    #     'max_samples': [1, 2, 3],
    #     'max_features': [1, 2, 3],
    # }

    # GradientBoostingRegressor: 0.11971435285550537 -> Khá tốt do XGBoost là phiên bản kế thừa tốt hơn từ GBR
    'GradientBoostingRegressor': {
        'n_estimators': [100, 300],
        'learning_rate': [0.1, 0.3],
        'max_depth': [6, 9],
    }
}

# 3-fold cross-validation
cv = KFold(n_splits=3, shuffle=True, random_state=42)

# Train and tune the models
grids = {}
for model_name, model in models.items():
    #print(f'Training and tuning {model_name}...')
    grids[model_name] = GridSearchCV(estimator=model, param_grid=param_grids[model_name], cv=cv, scoring='neg_mean_squared_error', n_jobs=-1, verbose=2)
    grids[model_name].fit(X_train, y_train)
    best_params = grids[model_name].best_params_
    best_score = np.sqrt(-1 * grids[model_name].best_score_)
    
    print(f'Best parameters for {model_name}: {best_params}')
    print(f'Best RMSE for {model_name}: {best_score}\n')

Fitting 3 folds for each of 1 candidates, totalling 3 fits
Best parameters for LinearRegression: {}
Best RMSE for LinearRegression: 563803578.1277817

Fitting 3 folds for each of 8 candidates, totalling 24 fits
Best parameters for XGBoost: {'learning_rate': 0.1, 'max_depth': 6, 'n_estimators': 300}
Best RMSE for XGBoost: 0.13215399254451285

Fitting 3 folds for each of 8 candidates, totalling 24 fits
Best parameters for GradientBoostingRegressor: {'learning_rate': 0.1, 'max_depth': 6, 'n_estimators': 300}
Best RMSE for GradientBoostingRegressor: 0.13086242943106377



In [12]:
from sklearn.metrics import mean_squared_error
for i in grids.keys():
    print (i + ': ' + str(np.sqrt(mean_squared_error(grids[i].predict(X_test), y_test))))

LinearRegression: 146856413.17207763
XGBoost: 0.12162723233276393
GradientBoostingRegressor: 0.12313718096841858


In [13]:
from sklearn.preprocessing import FunctionTransformer

# feature engineering functions
# Tạo thêm các features giúp cho model đã dùng dự đoán tốt hơn
# Giúp cải thiện độ chính xác và tốt độ của mô hình Từ đó cho chúng ta kết quả tốt hơn so với việc không dụng các features custom

def custom_features(df):
    df_out = df.copy()
    df_out['PropertyAge'] = df_out['YrSold'] - df_out['YearBuilt']
    df_out['TotalSF'] = df_out['TotalBsmtSF'] + df_out['1stFlrSF'] + df_out['2ndFlrSF']
    df_out['TotalBath'] = df_out['FullBath'] + 0.5 * df_out['HalfBath'] + df_out['BsmtFullBath'] + 0.5 * df['BsmtHalfBath']
    df_out['HasRemodeled'] = (df_out['YearRemodAdd'] != df_out['YearBuilt']).astype(object)
    df_out['Has2ndFloor'] = (df_out['2ndFlrSF'] > 0).astype(object)
    df_out['HasGarage'] = (df_out['GarageArea'] > 0).astype(object)
    df_out['YrSold_cat'] = df_out['YrSold'].astype(object)
    df_out['MoSold_cat'] = df_out['MoSold'].astype(object)
    df_out['YearBuilt_cat'] = df_out['YearBuilt'].astype(object)
    df_out['MSSubClass_cat'] = df_out['MSSubClass'].astype(object)
    
    return df_out

feature_engineering_transformer = FunctionTransformer(custom_features)


In [14]:
# Identify categorical and numerical columns
new_cols_categorical = pd.Index(['HasRemodeled', 'Has2ndFloor', 'HasGarage'])
new_cols_numeric = pd.Index(['PropertyAge', 'TotalSF', 'TotalBath', 'YrSold_cat', 'MoSold_cat', 'YearBuilt_cat', 'MSSubClass_cat'])

# Update categorical and numerical columns
categorical_columns = df_train.select_dtypes(include=['object', 'category']).columns.append(new_cols_categorical)
numerical_columns = df_train.select_dtypes(include=['int64', 'float64']).columns.append(new_cols_numeric)

# Remove target variable from numerical columns
numerical_columns = numerical_columns.drop('SalePrice')

# Combine transformers using ColumnTransformer
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_columns),
        ('cat', categorical_transformer, categorical_columns)
    ],remainder = 'passthrough')

# Create a pipeline with the preprocessor
pipeline_fe = Pipeline(steps=[
    ('fe', feature_engineering_transformer),
    ('preprocessor', preprocessor)])

# Apply the pipeline to your dataset
X = df_train.drop('SalePrice', axis=1)
y = np.log(df_train['SalePrice'])
X_preprocessed_fe = pipeline_fe.fit_transform(X)

In [22]:
X_preprocessed_fe

array([[ 0.07374384, -0.21878353, -0.28491195, ...,  1.        ,
         0.        ,  1.        ],
       [-0.8768075 ,  0.54027626, -0.03262818, ...,  0.        ,
         0.        ,  1.        ],
       [ 0.07374384, -0.06697157,  0.32934419, ...,  1.        ,
         0.        ,  1.        ],
       ...,
       [-0.8768075 ,  0.79329619,  0.75164529, ...,  0.        ,
         0.        ,  1.        ],
       [ 0.31138168, -0.16817954, -0.15504066, ...,  1.        ,
         0.        ,  1.        ],
       [-0.8768075 , -0.06697157, -0.00696105, ...,  0.        ,
         0.        ,  1.        ]])

In [26]:
X_train_fe, X_test_fe, y_train_fe, y_test_fe = train_test_split(X_preprocessed_fe, y, test_size=0.2, random_state=42)

# Define the models
models = {
    #'LinearRegression': LinearRegression(),
    #'RandomForest': RandomForestRegressor(random_state=42),
    'XGBoost': XGBRegressor(random_state=42)
}

# Define the hyperparameter grids for each model
param_grids = {
    #'LinearRegression': {},
    # 'RandomForest': {
    #     'n_estimators': [100, 200, 500],
    #     'max_depth': [None, 10, 30],
    #     'min_samples_split': [2, 5, 10],
    # },
    'XGBoost': {
        'n_estimators': [200, 500],
        'learning_rate': [0.2, 0.5],
        'max_depth': [6, 9],
    }
}

# 3-fold cross-validation
cv = KFold(n_splits=3, shuffle=True, random_state=42)

# Train and tune the models
grids_fe = {}
for model_name, model in models.items():
    #print(f'Training and tuning {model_name}...')
    grids_fe[model_name] = GridSearchCV(estimator=model, param_grid=param_grids[model_name], cv=cv, scoring='neg_mean_squared_error', n_jobs=-1, verbose=2)
    grids_fe[model_name].fit(X_train_fe, y_train_fe)
    best_params = grids_fe[model_name].best_params_
    best_score = np.sqrt(-1 * grids_fe[model_name].best_score_)
    
    print(f'Best parameters for {model_name}: {best_params}')
    print(f'Best RMSE for {model_name}: {best_score}\n')

Fitting 3 folds for each of 8 candidates, totalling 24 fits
Best parameters for XGBoost: {'learning_rate': 0.2, 'max_depth': 6, 'n_estimators': 200}
Best RMSE for XGBoost: 0.1328984359933282



In [18]:
df_test = pd.read_csv('../data/processed/test_processed.csv')
df_test

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,1461,20,RH,80.0,11622,Pave,No,Reg,Lvl,AllPub,...,120,0,No,MnPrv,No,0,6,2010,WD,Normal
1,1462,20,RL,81.0,14267,Pave,No,IR1,Lvl,AllPub,...,0,0,No,No,Gar2,12500,6,2010,WD,Normal
2,1463,60,RL,74.0,13830,Pave,No,IR1,Lvl,AllPub,...,0,0,No,MnPrv,No,0,3,2010,WD,Normal
3,1464,60,RL,78.0,9978,Pave,No,IR1,Lvl,AllPub,...,0,0,No,No,No,0,6,2010,WD,Normal
4,1465,120,RL,43.0,5005,Pave,No,IR1,HLS,AllPub,...,144,0,No,No,No,0,1,2010,WD,Normal
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1454,2915,160,RM,21.0,1936,Pave,No,Reg,Lvl,AllPub,...,0,0,No,No,No,0,6,2006,WD,Normal
1455,2916,160,RM,21.0,1894,Pave,No,Reg,Lvl,AllPub,...,0,0,No,No,No,0,4,2006,WD,Abnorml
1456,2917,20,RL,160.0,20000,Pave,No,Reg,Lvl,AllPub,...,0,0,No,No,No,0,9,2006,WD,Abnorml
1457,2918,85,RL,62.0,10441,Pave,No,Reg,Lvl,AllPub,...,0,0,No,MnPrv,Shed,700,7,2006,WD,Normal


In [19]:
df_test_preprocessed = pipeline_fe.transform(df_test)

In [None]:
# from sklearn.metrics import accuracy_score

# # accuracy
# y_train_pred = grids_fe['XGBoost'].predict(X_train_fe)
# print(f"Accuracy on train: {accuracy_score(list(y_train_fe), list(y_train_pred)):.2f}")

# y_pred = grids_fe['XGBoost'].predict(X_test_fe)
# print(f"Accuracy on test: {accuracy_score(list(y_test_fe), list(y_pred)):.2f}")

In [20]:
#xgboost submission
y_xgboost = np.exp(grids_fe['XGBoost'].predict(df_test_preprocessed))

df_xgboost_out = df_test[['Id']].copy()
df_xgboost_out['SalePrice'] = y_xgboost

#
df_xgboost_out.to_csv('submission.csv', index=False)