In [1]:
import numpy as np
import pandas as pd

from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from sklearn.dummy import DummyRegressor
from sklearn.linear_model import LinearRegression, SGDRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from xgboost import XGBRegressor


df = pd.read_csv("data_model.csv")
df = df.drop(columns=df.columns.difference( ['Carrosserie', 'masse_ordma_min', 'masse_ordma_max', 'co2'] ))
df.columns

Index(['Carrosserie', 'co2', 'masse_ordma_min', 'masse_ordma_max'], dtype='object')

In [2]:
le = LabelEncoder()
df['Carrosserie'] = le.fit_transform(df['Carrosserie'])
df['Carrosserie'].unique()

array([ 0,  1,  5,  2, 10,  3,  7,  9,  8,  6,  4])

In [3]:
scaler = StandardScaler()

df[['masse_ordma_min', 'masse_ordma_max']] = scaler.fit_transform(df[['masse_ordma_min', 'masse_ordma_max']])

In [4]:
X = df[['Carrosserie', 'masse_ordma_min', 'masse_ordma_max']]
y = df['co2']


models = [
    DummyRegressor(),  # A model that always predicts the mean of the target variable
    LinearRegression(),  # A simple linear regression model
    SGDRegressor(),  # A linear model with stochastic gradient descent optimization
    RandomForestRegressor(),  # A random forest regression model
    GradientBoostingRegressor(),  # A gradient boosting regression model
    XGBRegressor()  # An optimized gradient boosting regression model
]


params = [
    {'strategy': ['mean'], 'constant': [0]},  # DummyRegressor hyperparameters
    {'fit_intercept': [True, False]},  # LinearRegression hyperparameters
    {'alpha': [0.01, 0.001], 'penalty': ['l1', 'l2']},  # SGDRegressor hyperparameters
    {'n_estimators': [150, 250], 'max_depth': [None, 2], 'min_samples_split': [7, 9]},  # RandomForestRegressor hyperparameters
    {'n_estimators': [50, 100], 'learning_rate': [0.1, 0.01], 'max_depth': [3, 5]},  # GradientBoostingRegressor hyperparameters
    {'learning_rate': [0.1, 0.01], 'max_depth': [5, 7], 'n_estimators': [50, 100]}  # XGBRegressor hyperparameters
]

In [5]:
X.sample(6)

Unnamed: 0,Carrosserie,masse_ordma_min,masse_ordma_max
20158,6,0.093972,-0.368863
26457,6,1.642771,0.610469
37458,6,0.284175,0.032309
37087,6,-0.429088,-0.628445
45791,6,-0.25247,1.084795
1126,5,-0.44607,-0.774755


In [6]:
for i, (model_grid, param_grid) in enumerate(zip(models, params)):
    grid_search = GridSearchCV(model_grid, param_grid, cv=5, scoring='neg_mean_squared_error')
    grid_search.fit(X, y)
    print("--------------------------------------------------")
    print(f"Model {i+1}: {type(model_grid).__name__}")
    print(f"Best parameters: {grid_search.best_params_}")
    print(f"Best score: {-grid_search.best_score_}")
    print(f"RMSE: {np.sqrt(mean_squared_error(y, grid_search.predict(X)))}")
    print(f"R2 score: {grid_search.best_estimator_.score(X, y)}")
    print("--------------------------------------------------\n")

--------------------------------------------------
Model 1: DummyRegressor
Best parameters: {'constant': 0, 'strategy': 'mean'}
Best score: 1239.004282728954
RMSE: 34.01397602132309
R2 score: 0.0
--------------------------------------------------

--------------------------------------------------
Model 2: LinearRegression
Best parameters: {'fit_intercept': True}
Best score: 682.418832385313
RMSE: 25.3132976528585
R2 score: 0.44616212864330007
--------------------------------------------------

--------------------------------------------------
Model 3: SGDRegressor
Best parameters: {'alpha': 0.01, 'penalty': 'l2'}
Best score: 675.1066053489965
RMSE: 25.394123339179064
R2 score: 0.4426196591307764
--------------------------------------------------

--------------------------------------------------
Model 4: RandomForestRegressor
Best parameters: {'max_depth': None, 'min_samples_split': 9, 'n_estimators': 250}
Best score: 617.5528558762076
RMSE: 17.887651173635533
R2 score: 0.7234384300

In [7]:
best_model_name = type(grid_search.best_estimator_).__name__
print(f"The best model is {best_model_name}")

best_params = grid_search.best_params_
print("Hyperparameters used by the best model:")
for param, value in best_params.items():
    print(f"{param}: {value}")

The best model is XGBRegressor
Hyperparameters used by the best model:
learning_rate: 0.1
max_depth: 5
n_estimators: 50


Based on the results of the grid search, the `XGBRegressor` model performed the best with a root mean squared error (RMSE) of 20.38 and an R2 score of 0.64. The `RandomForestRegressor` model also performed well with an RMSE of 17.88 and an R2 score of 0.72.

The `XGBRegressor` model is an optimized gradient boosting regression model that uses a combination of decision trees and gradient boosting to make predictions. It performed well in this case because it was able to capture the non-linear relationships between the input features and the target variable.

The `RandomForestRegressor` model is a random forest regression model that uses an ensemble of decision trees to make predictions. It performed well in this case because it was able to capture the complex interactions between the input features and the target variable.

Overall, both models performed well and could be used for making predictions on new data.