# Project - Machine Learning - Gradient Boosting Regression with XGBoost

## Import libraries

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import re, math
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from lightgbm import LGBMRegressor
from lightgbm import LGBMClassifier
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import OneHotEncoder, StandardScaler

## Data loading

In [2]:
df_train = pd.read_csv('train.csv')

excluded_columns = ['Utilities', 'Condition1', 'Condition2', 'HouseStyle', 'YearBuilt',
       'YearRemodAdd', 'Exterior1st', 'Exterior2nd', 'ExterCond', 'Foundation',
       'BsmtCond', 'BsmtFinType2', 'BsmtUnfSF', 'HeatingQC', 'CentralAir',
       '2ndFlrSF', 'LowQualFinSF', 'BsmtHalfBath', 'HalfBath', 'KitchenAbvGr',
       'GarageYrBlt', 'GarageFinish', 'PavedDrive', 'EnclosedPorch',
       '3SsnPorch', 'Fence', 'MiscFeature', 'MoSold', 'YrSold', 'SaleType',
       'SaleCondition']

for element in excluded_columns:
  df_train=df_train.drop(columns=element)


y = df_train['LotArea']
X = df_train.drop(columns='LotArea')

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

X_test = pd.read_csv('test.csv')
X_test_id = X_test['ID']
X_test = X_test.drop(columns='ID')

X_train

Unnamed: 0,MSSubClass,MSZoning,LotFrontage,Street,Alley,LotShape,LandContour,LotConfig,LandSlope,Neighborhood,...,GarageCars,GarageArea,GarageQual,GarageCond,WoodDeckSF,OpenPorchSF,ScreenPorch,PoolArea,PoolQC,MiscVal
254,20,RL,,Pave,,Reg,Lvl,Inside,Gtl,Sawyer,...,0.0,0.0,,,0,24,0,0,,0
1065,190,RL,60.0,Pave,,Reg,Lvl,Inside,Gtl,SWISU,...,0.0,0.0,,,0,0,0,0,,0
864,50,RL,50.0,Pave,,Reg,Lvl,Inside,Gtl,Edwards,...,1.0,280.0,TA,TA,0,0,0,0,,0
798,20,RL,70.0,Pave,,Reg,Lvl,Inside,Gtl,NAmes,...,1.0,308.0,TA,TA,0,0,0,0,,0
380,30,RM,60.0,Pave,,IR1,Lvl,Inside,Gtl,BrkSide,...,1.0,240.0,Fa,TA,49,0,0,0,,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1095,20,RL,50.0,Pave,,IR1,HLS,CulDSac,Gtl,Gilbert,...,3.0,690.0,TA,TA,144,60,0,0,,0
1130,20,RL,80.0,Pave,,Reg,Lvl,Inside,Gtl,Edwards,...,2.0,569.0,TA,TA,0,189,348,0,,0
1294,70,RM,60.0,Pave,Grvl,Reg,Lvl,Inside,Gtl,OldTown,...,0.0,0.0,,,344,0,168,0,,0
860,60,RL,,Pave,,IR2,Lvl,CulDSac,Gtl,SawyerW,...,2.0,453.0,TA,TA,168,98,0,0,,0


## Preprocessing

In [3]:
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

numeric_features = X_train.select_dtypes(include=['int64', 'float64']).columns
categorical_features = X_train.select_dtypes(include=['object']).columns

print(numeric_features)
print(categorical_features)

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ])


Index(['MSSubClass', 'LotFrontage', 'OverallQual', 'OverallCond', 'MasVnrArea',
       'BsmtFinSF1', 'BsmtFinSF2', 'TotalBsmtSF', '1stFlrSF', 'GrLivArea',
       'BsmtFullBath', 'FullBath', 'BedroomAbvGr', 'TotRmsAbvGrd',
       'Fireplaces', 'GarageCars', 'GarageArea', 'WoodDeckSF', 'OpenPorchSF',
       'ScreenPorch', 'PoolArea', 'MiscVal'],
      dtype='object')
Index(['MSZoning', 'Street', 'Alley', 'LotShape', 'LandContour', 'LotConfig',
       'LandSlope', 'Neighborhood', 'BldgType', 'RoofStyle', 'RoofMatl',
       'MasVnrType', 'ExterQual', 'BsmtQual', 'BsmtExposure', 'BsmtFinType1',
       'Heating', 'Electrical', 'KitchenQual', 'Functional', 'FireplaceQu',
       'GarageType', 'GarageQual', 'GarageCond', 'PoolQC'],
      dtype='object')


##Adding param to model

In [5]:
params = {
    'task' : 'train',
    'objective' : 'regression',
    'subsample' : 0.8,
    'max_depth' : 7
}
final_model = Pipeline(steps=[('preprocessor', preprocessor), ('regressor', LGBMRegressor(**params, verbose=-1))])


final_model.fit(X_train, y_train)

# Prédiction sur l'ensemble de validation
y_pred = final_model.predict(X_val)

# Calcul du RMSE
rmse = mean_squared_error(np.log(y_val), np.log(y_pred), squared=False)
print("Root Mean Squared Error (RMSE) on validation set:", rmse)

Root Mean Squared Error (RMSE) on validation set: 0.27617250293350876


##Fitting model

In [None]:
final_model.fit(X, y)
y_test = final_model.predict(X_test)

##Submission

In [None]:
submission = pd.DataFrame({
    'ID': X_test_id,
    'LotArea': y_test
})
submission.to_csv('submissionLGBM.csv', index=False)
submission