<a href="https://colab.research.google.com/github/LeRoiBof/ML-Project/blob/main/ML1_Project_T4.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Project - Machine Learning - Gradient Boosting Regression with XGBoost

## Import libraries

In [27]:
import pandas as pd
import numpy as np
import seaborn as sns
import re, math
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_squared_error
from xgboost import XGBRegressor

## Data loading


In [28]:
# Read the training table, then separate the regression target ('LotArea')
# from the remaining predictor columns.
df_train = pd.read_csv('train.csv')
X = df_train.drop(columns='LotArea')
y = df_train['LotArea']

# Hold out 20% of the training rows for validation; the fixed random_state
# keeps the split identical across runs.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Read the test table. Its 'ID' column is set aside for the submission file
# and removed from the features handed to the model.
test_raw = pd.read_csv('test.csv')
X_test_id = test_raw['ID']
X_test = test_raw.drop(columns='ID')

# Display the column dtypes of the training split as the cell output.
X_train.dtypes

MSSubClass         int64
MSZoning          object
LotFrontage      float64
Street            object
Alley             object
                  ...   
MiscVal            int64
MoSold             int64
YrSold             int64
SaleType          object
SaleCondition     object
Length: 78, dtype: object

## Preprocessing

### Retirer ces paramètres : Street, Alley, LandContour, Utilities, LandSlope, Condition1, Condition2, OverallQual, OverallCond, YearBuilt, YearRemodAdd, RoofMatl, Exterior1st, Exterior2nd, MasVnrType, ExterQual, ExterCond, BsmtQual, BsmtCond, BsmtExposure, BsmtFinType1, BsmtFinType2, HeatingQC, KitchenQual, Functional, FireplaceQu, GarageYrBlt, GarageFinish, GarageQual, GarageCond, PavedDrive, PoolQC, Fence, MoSold, YrSold, SaleType, SaleCondition

In [29]:
# Columns earmarked for removal.
# NOTE(review): this list is defined but never applied below -- the feature
# selection uses every column of X_train. Confirm whether the exclusion is
# still intended before wiring it in, since doing so changes the fitted model.
excluded_columns = [
    'Street', 'Alley', 'LandContour', 'Utilities', 'LandSlope',
    'Condition1', 'Condition2', 'OverallQual', 'OverallCond', 'YearBuilt',
    'YearRemodAdd', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType',
    'ExterQual', 'ExterCond', 'BsmtQual', 'BsmtCond', 'BsmtExposure',
    'BsmtFinType1', 'BsmtFinType2', 'HeatingQC', 'KitchenQual', 'Functional',
    'FireplaceQu', 'GarageYrBlt', 'GarageFinish', 'GarageQual', 'GarageCond',
    'PavedDrive', 'PoolQC', 'Fence', 'MoSold', 'YrSold',
    'SaleType', 'SaleCondition',
]

# Split the columns by dtype: int64/float64 columns are treated as numeric,
# object columns as categorical.
numeric_features = X_train.select_dtypes(include=['int64', 'float64']).columns
categorical_features = X_train.select_dtypes(include=['object']).columns

print(numeric_features)
print(categorical_features)

# Numeric columns: fill missing values with the column mean, then standardize.
numeric_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
])

# Categorical columns: fill missing values with the literal 'missing', then
# one-hot encode; categories unseen during fit are ignored at transform time.
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# Route each column group through its own transformer.
preprocessor = ColumnTransformer([
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features),
])

# Full model: preprocessing followed by a gradient boosting regressor.
# NOTE(review): the notebook title mentions XGBoost, but the estimator used
# here is scikit-learn's GradientBoostingRegressor -- confirm which is intended.
model = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', GradientBoostingRegressor(random_state=42)),
])

Index(['MSSubClass', 'LotFrontage', 'OverallQual', 'OverallCond', 'YearBuilt',
       'YearRemodAdd', 'MasVnrArea', 'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF',
       'TotalBsmtSF', '1stFlrSF', '2ndFlrSF', 'LowQualFinSF', 'GrLivArea',
       'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath', 'BedroomAbvGr',
       'KitchenAbvGr', 'TotRmsAbvGrd', 'Fireplaces', 'GarageYrBlt',
       'GarageCars', 'GarageArea', 'WoodDeckSF', 'OpenPorchSF',
       'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'PoolArea', 'MiscVal',
       'MoSold', 'YrSold'],
      dtype='object')
Index(['MSZoning', 'Street', 'Alley', 'LotShape', 'LandContour', 'Utilities',
       'LotConfig', 'LandSlope', 'Neighborhood', 'Condition1', 'Condition2',
       'BldgType', 'HouseStyle', 'RoofStyle', 'RoofMatl', 'Exterior1st',
       'Exterior2nd', 'MasVnrType', 'ExterQual', 'ExterCond', 'Foundation',
       'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2',
       'Heating', 'HeatingQC', 'CentralAir', 'Ele

## Searching for the best hyperparameters

In [30]:
# Hyperparameter grid to search. The 'regressor__' prefix routes each entry to
# the 'regressor' step of the `model` pipeline.
param_grid = {
    'regressor__n_estimators': [100, 500, 1000],
    'regressor__learning_rate': [0.01, 0.05, 0.1],
    'regressor__max_depth': [3, 5, 7],
    'regressor__min_samples_split': [2, 5, 10],
    'regressor__min_samples_leaf': [1, 2, 4]
}

# Exhaustive search scored by negative MSE with 5-fold cross-validation.
# The grid holds 3**5 = 243 candidates, i.e. 1215 model fits, so this cell is
# expensive; n_jobs=-1 spreads the fits over all available cores.
grid_search = GridSearchCV(model, param_grid, cv=5, scoring='neg_mean_squared_error', n_jobs=-1)
grid_search.fit(X_train, y_train)

# Report the best hyperparameter combination found.
print("Best parameters found:", grid_search.best_params_)

# Predict on the held-out validation split with the best estimator.
y_pred = grid_search.predict(X_val)

# RMSE on the validation split. The square root is taken explicitly because
# the `squared=False` argument of mean_squared_error was deprecated in
# scikit-learn 1.4 and removed in 1.6.
rmse = np.sqrt(mean_squared_error(y_val, y_pred))
print("Root Mean Squared Error (RMSE) on validation set:", rmse)

KeyboardInterrupt: 

## Fitting and prediction

In [25]:
# Take the best pipeline found by the grid search, refit it on the complete
# training table (train + validation rows), and predict the test set.
# Note: `final_model` is the same object as grid_search.best_estimator_, so
# the refit happens in place on that estimator.
final_model = grid_search.best_estimator_
y_test = final_model.fit(X, y).predict(X_test)


## Submission

In [26]:
# Pair each test ID with its predicted 'LotArea', write the table to disk
# without the index column, and display it as the cell output.
submission = X_test_id.to_frame(name='ID').assign(LotArea=y_test)
submission.to_csv('submissionXGB.csv', index=False)
submission

Unnamed: 0,ID,LotArea
0,0,8266.592143
1,1,12325.641589
2,2,13580.340337
3,3,16106.726190
4,4,13445.293933
...,...,...
1455,1455,7923.923953
1456,1456,6779.906421
1457,1457,13965.096628
1458,1458,8741.057152
