# Import Data + Packages

In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import KFold, train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.ensemble import RandomForestRegressor, HistGradientBoostingRegressor,BaggingRegressor,AdaBoostRegressor
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, OrdinalEncoder, RobustScaler, PolynomialFeatures, TargetEncoder
from sklearn.metrics import mean_absolute_error
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from category_encoders import LeaveOneOutEncoder
import joblib

#from catboost import CatBoostRegressor, Pool
#import lightgbm as lgb
#import xgboost as xgb
#from pytorch_tabnet.tab_model import TabNetRegressor

pd.set_option('display.max_columns', None)
pd.set_option('display.float_format', '{:.2f}'.format)
sns.set_theme(style="ticks", palette="pastel")


In [2]:
data_train= pd.read_csv("data/train.csv",sep=",",low_memory=False)
data_test = pd.read_csv("data/test.csv",sep=",",low_memory=False)

# Preprocessing



## Récupération d'observations

Lorsque la voiture est électrique: on peut se permettre de set `ec(cm3)`, `Fuel consumption `, `z (Wh/km)` à 0

lorsque la voiture n'est pas hybride / électrique : on peut mettre `Electric range (km)` à 0

A voir si on choisir de prendre le traitement.

In [5]:
def group_fuel_types(category: str):
    if category in ['PETROL/ELECTRIC', 'DIESEL/ELECTRIC']:
        return "HYBRID"
    elif category in ['NG-BIOMETHANE', 'HYDROGEN', 'NG','E85']:
        return "BIO-FUEL"
    elif category in ['PETROL','LPG'] :
        return 'PETROL'
    else:
        return category   

def recup_electric(df):
    #  ec (cm3)
    df.loc[(df["Ft"].apply(group_fuel_types)=="ELECTRIC") & (df["ec (cm3)"].isna()),"ec (cm3)"] = 0
    #  Fm
    df.loc[(df["Ft"].apply(group_fuel_types) =="ELECTRIC") & (df["Fm"].isna()),"Fm"] = "E"
    df.loc[(df["Ft"].apply(group_fuel_types) =="HYBRID") & (df["Fm"].isna()),"Fm"] = "P"

    # Electric range (km)
    df.loc[~(df["Ft"].apply(group_fuel_types).isin(["ELECTRIC", "HYBRID"])) & (df["Electric range (km)"].isna()),"Electric range (km)"] = 0

    #  Fuel consumption 
    df.loc[(df["Ft"].apply(group_fuel_types) =="ELECTRIC") & (df["Fuel consumption "].isna()),"Fuel consumption "] = 0

    #  z (Wh/km)
    df.loc[~(df["Ft"].apply(group_fuel_types).isin(["ELECTRIC", "HYBRID"])) & (df["z (Wh/km)"].isna()),"z (Wh/km)"] = 0
    pass

recup_electric(data_train)


## Delete columns

Supprimer les colonnes avec 1 seul valeur unique (aucune info) ou 0 valeur unique (que des NaN)

In [3]:
valeurs_uniques = {}
nombre_val_unique={}
for col in data_train.columns:
    valeurs_uniques[col]=data_train[col].unique().tolist()
    nombre_val_unique[col]=data_train[col].nunique()

for element in nombre_val_unique:
    if nombre_val_unique[element]<=1:
        print(f"colonne supprimée: {element}")
        data_train.drop(columns=element, inplace=True)
        data_test.drop(columns=element, inplace=True)

colonne supprimée: MMS
colonne supprimée: r
colonne supprimée: Ernedc (g/km)
colonne supprimée: De
colonne supprimée: Vf
colonne supprimée: Status


Supprimer les colonnes avec **+ de 50%** de NaN

In [4]:
for col in data_train.columns:
    if (data_train[col].isna().sum()/data_train.shape[0] > 0.5):
        print(f"colonne supprimée: {col}")
        data_train.drop(columns=col, inplace=True)
        data_test.drop(columns=col, inplace=True)


colonne supprimée: Enedc (g/km)
colonne supprimée: z (Wh/km)
colonne supprimée: Electric range (km)


Supprimer les `Date` et `ID` (seulement pour train)

In [5]:
data_train.drop(columns=['Date of registration','ID'], inplace=True)
data_test.drop(columns='Date of registration', inplace=True)

print(f"colonne supprimée pour data_train: Date of registration, ID")
print(f"colonne supprimée pour data_test: Date of registration")

colonne supprimée pour data_train: Date of registration, ID
colonne supprimée pour data_test: Date of registration


In [6]:
col_categoricals = data_test.select_dtypes(include="object").columns.tolist()
col_numericals = [col for col in data_test.columns if col not in col_categoricals]
col_numericals.remove("ID")

## Transformer les doublons en moyenne de Y

En considérant qu'on a drop: `Date of registration`,`ID` on va prendre la moyenne des Y pour lesquelles les caractéristiques sont des doublons.



In [7]:
colus=list(data_train.columns)
colus.remove('Ewltp (g/km)')
duplicates = data_train.drop(columns="Ewltp (g/km)").duplicated(keep=False)

mean_target = data_train[duplicates].groupby(colus)['Ewltp (g/km)'].mean()

# Supprimer les lignes en double et réindexer le DataFrame
data_train = pd.concat([data_train[~duplicates], mean_target.reset_index()], ignore_index=True)
data_train.shape


(1163819, 26)

## Outliers 

Utilisation de l'écart interquartile pour identifier les valeurs aberrantes.

Imputation des outliers:

Fixer les valeurs aberrantes à un certain pourcentage (par exemple, 5e et 95e percentiles).

**on pourrait aussi tenter d'imputer par la médiane si cela n'aboutit pas** 

### windorization of outliers

In [7]:
quantiles={}

def winsorize_outliers(data, column_name, lower_percentile=5, upper_percentile=95,train=True):
    """
    Detects and imputes outliers using winsorizing for a specific column in a DataFrame.

    Parameters:
    - data: Pandas DataFrame, input data
    - column_name: str, name of the column to be winsorized
    - lower_percentile: int, lower percentile for winsorizing (default: 5)
    - upper_percentile: int, upper percentile for winsorizing (default: 95)

    Returns:
    - winsorized_data: Pandas DataFrame, data with outliers winsorized for the specified column
    """

    column_data = data[column_name]
    if train:
        quantiles["q1"] = np.percentile(column_data, lower_percentile)
        quantiles["q3"] = np.percentile(column_data, upper_percentile)
        iqr = quantiles["q3"] - quantiles["q1"]
        quantiles["lower_bound"] = quantiles["q1"] - 1.5 * iqr
        quantiles["upper_bound"] = quantiles["q3"] + 1.5 * iqr

    data[column_name] = np.clip(column_data, quantiles["lower_bound"], quantiles["upper_bound"])

    return data

for col in col_numericals:
    data_train=winsorize_outliers(data_train,col)
    data_test =winsorize_outliers(data_test,col,train=False)

### Median imputing

In [9]:
quantiles={}

def replace_outliers_with_median(data, column_name,train=True):
    """
    Detects and replaces outliers with the median for a specific column in a DataFrame.

    Parameters:
    - data: Pandas DataFrame, input data
    - column_name: str, name of the column to be processed
    - train: bool

    Returns:
    - data_with_median: Pandas DataFrame, data with outliers replaced by the median for the specified column
    """

    column_data = data[column_name]

    if train:
        q1 = np.percentile(column_data,25)
        q3 = np.percentile(column_data, 75)
        iqr = q3 - q1
        quantiles["lower_bound"] = q1 - 1.5 * iqr
        quantiles["upper_bound"] = q3 + 1.5 * iqr
        quantiles[column_name]=column_data.median()


    outliers_mask = (column_data < quantiles["lower_bound"]) | (column_data > quantiles["upper_bound"])

    data.loc[outliers_mask, column_name] = quantiles[column_name]
    return data


for col in col_numericals:
    data_train = replace_outliers_with_median(data_train,col)
    data_train =replace_outliers_with_median(data_test,col,train=False)

## Impute NaN by median/mode



In [9]:
imputers={}
_coefficient_variation= lambda series : series.std()/series.mean()

def fill_missing_values(colname : str,data:pd.DataFrame) -> None:
    
    if data[colname].dtype in ["float64"]:
        if _coefficient_variation(data[colname]) > 0.15 :
            imputers[colname]=SimpleImputer(missing_values=np.nan,strategy="median")
        else:
            imputers[colname]=SimpleImputer(missing_values=np.nan,strategy="mean")
    else:
        imputers[colname]=SimpleImputer(missing_values=np.nan, strategy='most_frequent')
    imputers[colname].fit(data[colname].to_numpy().reshape(-1,1))
    pass

for col in data_test.columns[1:]:
    fill_missing_values(col,data_train)
    data_train[col]=pd.Series(imputers[col].transform(data_train[col].to_numpy().reshape(-1,1)).flatten())
    data_test[col]=pd.Series(imputers[col].transform(data_test[col].to_numpy().reshape(-1,1)).flatten())


## Encode categorical columns

Many choices:
- Label/Ordinal encoding
- Target encoding
- Impact encoding


Label/Ordinal encoding

In [10]:
encoders={}
def ordinal_encoder(colname:str,data:pd.DataFrame,train=True):
    if train:
        encoders[colname]=OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)
        data[colname]=encoders[colname].fit_transform(data[[colname]])
        pass
    else:
        data[colname]=encoders[colname].transform(data[[colname]])
        pass

for col in col_categoricals:
    ordinal_encoder(col,data_train)
    ordinal_encoder(col,data_test,False)

Target Encoding

In [10]:
encoders={}
def target_encoder(colname:str,data:pd.DataFrame,train=True):
    if train:
        encoders[colname]=TargetEncoder(target_type='continuous', smooth='auto',random_state=42)
        data[colname]=encoders[colname].fit_transform(data[[colname]],data['Ewltp (g/km)'])
        pass
    else:
        data[colname]=encoders[colname].transform(data[[colname]])
        pass

for col in col_categoricals:
    target_encoder(col,data_train)
    target_encoder(col,data_test,False)

Impact Encoding

Proposé par Sam B. J'ai utilisé Chat GPT pour l'implémenter honnêtement.

In [10]:

train, test = train_test_split(data_train, test_size=0.33, random_state=42)

train.reset_index(drop=True, inplace=True)
test.reset_index(drop=True, inplace=True)

X_train, y_train = train.drop(columns=["Ewltp (g/km)"]), train["Ewltp (g/km)"]
X_test, y_test = test.drop(columns=["Ewltp (g/km)"]), test["Ewltp (g/km)"]

encoders={}
def impact_encoder(colname:str,data:pd.DataFrame,target=None,train=True):
    if train:
        encoders[colname]=LeaveOneOutEncoder(handle_unknown="value")
        data[colname]=encoders[colname].fit_transform(data[[colname]],target)
        pass
    else:
        data[colname]=encoders[colname].transform(data[[colname]])
        pass

for col in col_categoricals:
    impact_encoder(col,X_train,target=y_train)
    impact_encoder(col,X_test,train=False)
    impact_encoder(col,data_test,train=False)

## Feature Engineering

Doit-on utiliser ces variables ?

In [9]:
def create_conforme(df):
    df['conforme'] = df['Tan'].isna()
    df['conforme'] = df['conforme'].apply(lambda x: 1 if x==False else 0)
    pass

create_conforme(data_train)
create_conforme(data_test)


In [12]:
def compute_surface(obs):
    max_largeur= max(obs['At1 (mm)'], obs['At2 (mm)'])
    return obs['W (mm)']*obs['At1 (mm)'] if max_largeur == obs['At1 (mm)'] else obs['W (mm)'] * obs['At2 (mm)']

def create_surface(df):
    df['surface']= df.apply(compute_surface, axis=1)
    pass

create_surface(data_train)
create_surface(data_test)


## Split 

In [11]:
train, test = train_test_split(data_train, test_size=0.33, random_state=42)

train.reset_index(drop=True, inplace=True)
test.reset_index(drop=True, inplace=True)

X_train, y_train = train.drop(columns=["Ewltp (g/km)"]), train["Ewltp (g/km)"]
X_test, y_test = test.drop(columns=["Ewltp (g/km)"]), test["Ewltp (g/km)"]

# Model Testing

## Random Forest 1

Use all columns except those dropped (MMS, r, Ernedc (g/km), De,Vf,StatusEnedc (g/km),z (Wh/km),Electric range (km),Date of registration). 

ordinal encoding. Strandard fill NaN. Random forest. No feature Engineering.

Computation time: 41min

In [12]:
random_forest = RandomForestRegressor(random_state=42, n_jobs=-1)

random_forest.fit(X_train, y_train)

y_pred = random_forest.predict(X_test)

mae = mean_absolute_error(y_test, y_pred)
print("Mean Absolute Error (MAE):", mae)

Mean Absolute Error (MAE): 2.8986731992878796


In [13]:
data_test["Ewltp (g/km)"] = random_forest.predict(data_test.drop(columns='ID'))
data_test[["ID","Ewltp (g/km)"]].to_csv("data/new_simple_rf.csv", index=False)

In [15]:
joblib.dump(random_forest, 'models/random_forest_simple_model.joblib')

['random_forest_simple_model.joblib']

## Random Forest 2

Use all columns except those dropped (MMS, r, Ernedc (g/km), De,Vf,StatusEnedc (g/km),z (Wh/km),Electric range (km),Date of registration). 

**target** encoding. Strandard fill NaN. Random forest. No feature Engineering.

Computation time: 23min (Home)

In [12]:
random_forest = RandomForestRegressor(random_state=42, n_jobs=-1)

random_forest.fit(X_train, y_train)

y_pred = random_forest.predict(X_test)

mae = mean_absolute_error(y_test, y_pred)
print("Mean Absolute Error (MAE):", mae)

Mean Absolute Error (MAE): 2.986984486605863


In [None]:
data_test["Ewltp (g/km)"] = random_forest.predict(data_test.drop(columns='ID'))
data_test[["ID","Ewltp (g/km)"]].to_csv("results/new_target_rf.csv", index=False)

## Random Forest 3

Use all columns except those dropped (MMS, r, Ernedc (g/km), De,Vf,StatusEnedc (g/km),z (Wh/km),Electric range (km),Date of registration). 

**impact** encoding. Strandard fill NaN. Random forest. No feature Engineering.

Computation time: 22 (Home)

In [12]:
random_forest = RandomForestRegressor(random_state=42, n_jobs=-1)

random_forest.fit(X_train, y_train)

y_pred = random_forest.predict(X_test)

mae = mean_absolute_error(y_test, y_pred)
print("Mean Absolute Error (MAE):", mae)

Mean Absolute Error (MAE): 26.539493069959914


In [None]:
data_test["Ewltp (g/km)"] = random_forest.predict(data_test.drop(columns='ID'))
data_test[["ID","Ewltp (g/km)"]].to_csv("results/new_impact_rf.csv", index=False)

## Random Forest 5

Use all columns except those dropped (MMS, r, Ernedc (g/km), De,Vf,StatusEnedc (g/km),z (Wh/km),Electric range (km),Date of registration). 

ordinal encoding. Strandard fill NaN. Random forest. No feature Engineering. **use mean of Y**

Computation time: 1.33min (Home)

In [13]:
random_forest = RandomForestRegressor(random_state=42, n_jobs=-1)

random_forest.fit(X_train, y_train)

y_pred = random_forest.predict(X_test)

mae = mean_absolute_error(y_test, y_pred)
print("Mean Absolute Error (MAE):", mae)

Mean Absolute Error (MAE): 3.6239323258632914


In [None]:
data_test["Ewltp (g/km)"] = random_forest.predict(data_test.drop(columns='ID'))
data_test[["ID","Ewltp (g/km)"]].to_csv("results/new_mean_rf.csv", index=False)

## Random Forest 4

Use all columns except those dropped (MMS, r, Ernedc (g/km), De,Vf,StatusEnedc (g/km),z (Wh/km),Electric range (km),Date of registration). 

ordinal encoding. Strandard fill NaN. Random forest. No feature Engineering. **feature engineering variables (conformes, surface)**

Computation time: 23min (Home)

In [14]:
random_forest = RandomForestRegressor(random_state=42, n_jobs=-1)

random_forest.fit(X_train, y_train)

y_pred = random_forest.predict(X_test)

mae = mean_absolute_error(y_test, y_pred)
print("Mean Absolute Error (MAE):", mae)

Mean Absolute Error (MAE): 2.8998117328283994


In [15]:
data_test["Ewltp (g/km)"] = random_forest.predict(data_test.drop(columns='ID'))
data_test[["ID","Ewltp (g/km)"]].to_csv("results/new_fe_rf.csv", index=False)

## Bagging 1
Use all columns except those dropped (MMS, r, Ernedc (g/km), De,Vf,StatusEnedc (g/km),z (Wh/km),Electric range (km),Date of registration). 

ordinal encoding. Strandard fill NaN. Random forest. No feature Engineering. estimator = DecistionTreeRegressor

Computation time: 2,14min (Home)

In [12]:
# TODO: remplacer par RandomForestRegressor(), SVR()
bag=BaggingRegressor(estimator=DecisionTreeRegressor(),n_jobs=-1,bootstrap=True,oob_score=True, random_state=42)

bag.fit(X_train, y_train)

y_pred = bag.predict(X_test)

mae = mean_absolute_error(y_test, y_pred)
print("Mean Absolute Error (MAE):", mae)

  warn(


Mean Absolute Error (MAE): 2.929723205821804


In [16]:
data_test["Ewltp (g/km)"] = bag.predict(data_test.drop(columns='ID'))
data_test[["ID","Ewltp (g/km)"]].to_csv("results/new_tree_bagging.csv", index=False)

In [17]:
bag.get_params()

{'base_estimator': 'deprecated',
 'bootstrap': True,
 'bootstrap_features': False,
 'estimator__ccp_alpha': 0.0,
 'estimator__criterion': 'squared_error',
 'estimator__max_depth': None,
 'estimator__max_features': None,
 'estimator__max_leaf_nodes': None,
 'estimator__min_impurity_decrease': 0.0,
 'estimator__min_samples_leaf': 1,
 'estimator__min_samples_split': 2,
 'estimator__min_weight_fraction_leaf': 0.0,
 'estimator__random_state': None,
 'estimator__splitter': 'best',
 'estimator': DecisionTreeRegressor(),
 'max_features': 1.0,
 'max_samples': 1.0,
 'n_estimators': 10,
 'n_jobs': -1,
 'oob_score': True,
 'random_state': 42,
 'verbose': 0,
 'warm_start': False}

## Bagging 2
Use all columns except those dropped (MMS, r, Ernedc (g/km), De,Vf,StatusEnedc (g/km),z (Wh/km),Electric range (km),Date of registration). 

ordinal encoding. Strandard fill NaN. Random forest. No feature Engineering. **n_estimator = 20**

Computation time: 5min (Home)

In [13]:
bag=BaggingRegressor(estimator=DecisionTreeRegressor(),n_jobs=-1,bootstrap=True,oob_score=True, random_state=42,verbose=3,n_estimators=20)

bag.fit(X_train, y_train)

y_pred = bag.predict(X_test)

mae = mean_absolute_error(y_test, y_pred)
print("Mean Absolute Error (MAE):", mae)

[Parallel(n_jobs=20)]: Using backend LokyBackend with 20 concurrent workers.
[Parallel(n_jobs=20)]: Done   2 out of  20 | elapsed:  4.4min remaining: 39.8min
[Parallel(n_jobs=20)]: Done   9 out of  20 | elapsed:  4.5min remaining:  5.5min
[Parallel(n_jobs=20)]: Done  16 out of  20 | elapsed:  4.5min remaining:  1.1min
[Parallel(n_jobs=20)]: Done  20 out of  20 | elapsed:  4.5min finished
  warn(
[Parallel(n_jobs=20)]: Using backend LokyBackend with 20 concurrent workers.
[Parallel(n_jobs=20)]: Done   2 out of  20 | elapsed:    5.4s remaining:   49.1s
[Parallel(n_jobs=20)]: Done   9 out of  20 | elapsed:    5.9s remaining:    7.2s
[Parallel(n_jobs=20)]: Done  16 out of  20 | elapsed:    6.3s remaining:    1.5s
[Parallel(n_jobs=20)]: Done  20 out of  20 | elapsed:    6.5s finished


Mean Absolute Error (MAE): 2.9137636972220546


In [14]:
data_test["Ewltp (g/km)"] = bag.predict(data_test.drop(columns='ID'))
data_test[["ID","Ewltp (g/km)"]].to_csv("results/new_20_estim_bagging.csv", index=False)

[Parallel(n_jobs=20)]: Using backend LokyBackend with 20 concurrent workers.
[Parallel(n_jobs=20)]: Done   2 out of  20 | elapsed:    1.8s remaining:   17.0s
[Parallel(n_jobs=20)]: Done   9 out of  20 | elapsed:    2.6s remaining:    3.3s
[Parallel(n_jobs=20)]: Done  16 out of  20 | elapsed:    3.1s remaining:    0.7s
[Parallel(n_jobs=20)]: Done  20 out of  20 | elapsed:    3.3s finished


## Bagging 3
Use all columns except those dropped (MMS, r, Ernedc (g/km), De,Vf,StatusEnedc (g/km),z (Wh/km),Electric range (km),Date of registration). 

ordinal encoding. Strandard fill NaN. Random forest. No feature Engineering. **n_estimator = 30**

Computation time: 7min (Home)

In [26]:
bag=BaggingRegressor(estimator=DecisionTreeRegressor(),n_jobs=-1,bootstrap=True,oob_score=True, random_state=42,verbose=3,n_estimators= 30)

bag.fit(X_train, y_train)

y_pred = bag.predict(X_test)

mae = mean_absolute_error(y_test, y_pred)
print("Mean Absolute Error (MAE):", mae)

[Parallel(n_jobs=24)]: Using backend LokyBackend with 24 concurrent workers.
[Parallel(n_jobs=24)]: Done   4 out of  24 | elapsed:  5.3min remaining: 26.7min
[Parallel(n_jobs=24)]: Done  13 out of  24 | elapsed:  5.5min remaining:  4.7min
[Parallel(n_jobs=24)]: Done  22 out of  24 | elapsed:  6.7min remaining:   36.2s
[Parallel(n_jobs=24)]: Done  24 out of  24 | elapsed:  6.7min finished
  warn(
[Parallel(n_jobs=24)]: Using backend LokyBackend with 24 concurrent workers.
[Parallel(n_jobs=24)]: Done   4 out of  24 | elapsed:    7.2s remaining:   36.2s
[Parallel(n_jobs=24)]: Done  13 out of  24 | elapsed:    8.2s remaining:    6.9s
[Parallel(n_jobs=24)]: Done  22 out of  24 | elapsed:    9.8s remaining:    0.8s
[Parallel(n_jobs=24)]: Done  24 out of  24 | elapsed:    9.9s finished


Mean Absolute Error (MAE): 2.9082315288155627


In [27]:
data_test["Ewltp (g/km)"] = bag.predict(data_test.drop(columns='ID'))
data_test[["ID","Ewltp (g/km)"]].to_csv("results/new_30_estim_bagging.csv", index=False)

[Parallel(n_jobs=24)]: Using backend LokyBackend with 24 concurrent workers.
[Parallel(n_jobs=24)]: Done   4 out of  24 | elapsed:    3.5s remaining:   18.1s
[Parallel(n_jobs=24)]: Done  13 out of  24 | elapsed:    4.3s remaining:    3.6s
[Parallel(n_jobs=24)]: Done  22 out of  24 | elapsed:    4.8s remaining:    0.3s
[Parallel(n_jobs=24)]: Done  24 out of  24 | elapsed:    4.9s finished


## Bagging 3
Use all columns except those dropped (MMS, r, Ernedc (g/km), De,Vf,StatusEnedc (g/km),z (Wh/km),Electric range (km),Date of registration). 

ordinal encoding. Strandard fill NaN. Random forest. No feature Engineering. **n_estimator = 50**

Computation time: 13min (Home)

In [12]:
bag=BaggingRegressor(estimator=DecisionTreeRegressor(),n_jobs=-1,bootstrap=True,oob_score=True, random_state=42,verbose=3,n_estimators= 50)

bag.fit(X_train, y_train)

y_pred = bag.predict(X_test)

mae = mean_absolute_error(y_test, y_pred)
print("Mean Absolute Error (MAE):", mae)

[Parallel(n_jobs=24)]: Using backend LokyBackend with 24 concurrent workers.
[Parallel(n_jobs=24)]: Done   4 out of  24 | elapsed: 10.8min remaining: 53.9min
[Parallel(n_jobs=24)]: Done  13 out of  24 | elapsed: 11.0min remaining:  9.3min
[Parallel(n_jobs=24)]: Done  22 out of  24 | elapsed: 11.2min remaining:  1.0min
[Parallel(n_jobs=24)]: Done  24 out of  24 | elapsed: 11.8min finished
[Parallel(n_jobs=24)]: Using backend LokyBackend with 24 concurrent workers.
[Parallel(n_jobs=24)]: Done   4 out of  24 | elapsed:   12.2s remaining:  1.0min
[Parallel(n_jobs=24)]: Done  13 out of  24 | elapsed:   14.0s remaining:   11.9s
[Parallel(n_jobs=24)]: Done  22 out of  24 | elapsed:   15.6s remaining:    1.3s
[Parallel(n_jobs=24)]: Done  24 out of  24 | elapsed:   15.7s finished


Mean Absolute Error (MAE): 2.9028932743761215


In [13]:
data_test["Ewltp (g/km)"] = bag.predict(data_test.drop(columns='ID'))
data_test[["ID","Ewltp (g/km)"]].to_csv("results/new_50_estim_bagging.csv", index=False)

[Parallel(n_jobs=24)]: Using backend LokyBackend with 24 concurrent workers.
[Parallel(n_jobs=24)]: Done   4 out of  24 | elapsed:    5.7s remaining:   28.8s
[Parallel(n_jobs=24)]: Done  13 out of  24 | elapsed:    7.5s remaining:    6.4s
[Parallel(n_jobs=24)]: Done  22 out of  24 | elapsed:    8.8s remaining:    0.7s
[Parallel(n_jobs=24)]: Done  24 out of  24 | elapsed:    9.0s finished


## Bagging 4
Use all columns except those dropped (MMS, r, Ernedc (g/km), De,Vf,StatusEnedc (g/km),z (Wh/km),Electric range (km),Date of registration). 

ordinal encoding. Strandard fill NaN. Random forest. No feature Engineering. **n_estimator = 10, bootstrap_features= True**

Computation time: 2,15min (Home)

In [14]:
bag=BaggingRegressor(estimator=DecisionTreeRegressor(),n_jobs=-1,bootstrap=True,oob_score=True, random_state=42,verbose=3,n_estimators= 10,bootstrap_features=True)

bag.fit(X_train, y_train)

y_pred = bag.predict(X_test)

mae = mean_absolute_error(y_test, y_pred)
print("Mean Absolute Error (MAE):", mae)

[Parallel(n_jobs=10)]: Using backend LokyBackend with 10 concurrent workers.
[Parallel(n_jobs=10)]: Done   3 out of  10 | elapsed:  1.7min remaining:  4.0min
[Parallel(n_jobs=10)]: Done   7 out of  10 | elapsed:  1.8min remaining:   46.2s
[Parallel(n_jobs=10)]: Done  10 out of  10 | elapsed:  2.0min finished
  warn(
[Parallel(n_jobs=10)]: Using backend LokyBackend with 10 concurrent workers.
[Parallel(n_jobs=10)]: Done   3 out of  10 | elapsed:    2.9s remaining:    6.9s
[Parallel(n_jobs=10)]: Done   7 out of  10 | elapsed:    3.2s remaining:    1.3s
[Parallel(n_jobs=10)]: Done  10 out of  10 | elapsed:    3.3s finished


Mean Absolute Error (MAE): 2.94623303681266


## Bagging 5
Use all columns except those dropped (MMS, r, Ernedc (g/km), De,Vf,StatusEnedc (g/km),z (Wh/km),Electric range (km),Date of registration). 

ordinal encoding. Strandard fill NaN. Random forest. No feature Engineering. **n_estimator = 20, oob_score=False**

Computation time: 4.35min (Home)

In [16]:
bag=BaggingRegressor(estimator=DecisionTreeRegressor(),n_jobs=-1,bootstrap=True,oob_score=False, random_state=42,verbose=3,n_estimators= 20)

bag.fit(X_train, y_train)

y_pred = bag.predict(X_test)

mae = mean_absolute_error(y_test, y_pred)
print("Mean Absolute Error (MAE):", mae)

[Parallel(n_jobs=20)]: Using backend LokyBackend with 20 concurrent workers.
[Parallel(n_jobs=20)]: Done   2 out of  20 | elapsed:  4.4min remaining: 39.3min
[Parallel(n_jobs=20)]: Done   9 out of  20 | elapsed:  4.4min remaining:  5.4min
[Parallel(n_jobs=20)]: Done  16 out of  20 | elapsed:  4.4min remaining:  1.1min
[Parallel(n_jobs=20)]: Done  20 out of  20 | elapsed:  4.5min finished
[Parallel(n_jobs=20)]: Using backend LokyBackend with 20 concurrent workers.
[Parallel(n_jobs=20)]: Done   2 out of  20 | elapsed:    5.5s remaining:   50.5s
[Parallel(n_jobs=20)]: Done   9 out of  20 | elapsed:    5.9s remaining:    7.2s
[Parallel(n_jobs=20)]: Done  16 out of  20 | elapsed:    6.4s remaining:    1.5s
[Parallel(n_jobs=20)]: Done  20 out of  20 | elapsed:    6.7s finished


Mean Absolute Error (MAE): 2.9137636972220546


## Adaboost

In [21]:
boosting = AdaBoostRegressor(random_state=42,loss='exponential')

boosting.fit(X_train, y_train)

y_pred = boosting.predict(X_test)

mae = mean_absolute_error(y_test, y_pred)
print("Mean Absolute Error (MAE):", mae)

Mean Absolute Error (MAE): 56.538170887529546


In [None]:
data_test["Ewltp (g/km)"] = bag.predict(data_test.drop(columns='ID'))
data_test[["ID","Ewltp (g/km)"]].to_csv("results/new_simple_boosting.csv", index=False)

## Decision Tree Grid Search

In [12]:
decision_tree_model = DecisionTreeRegressor()

param_grid = {
    'criterion': ['absolute_error', 'poisson', 'squared_error'],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]}

grid_search = GridSearchCV(decision_tree_model, param_grid, cv=5, scoring='neg_mean_absolute_error',verbose=3)

grid_search.fit(X_train, y_train)
print("Meilleurs hyperparamètres:", grid_search.best_params_)

y_pred = grid_search.predict(X_test)

mae = mean_absolute_error(y_test, y_pred)
print("Mean Absolute Error sur les données de test:", mae)


## Baggin Grid Search

In [None]:
base_model_params = {
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': ['auto', 'sqrt', 'log2'],
    
}

# Définir les hyperparamètres spécifiques au Bagging
bagging_params = {
    'n_estimators': [20,50,100],
    'max_samples': [1.0, 0.8, 0.9],  # La fraction d'échantillons à utiliser pour chaque sac
    'bootstrap': [True, False]
}

# Créer un modèle DecisionTreeRegressor pour être utilisé comme modèle de base
base_model = DecisionTreeRegressor(random_state=42)

# Créer un modèle BaggingRegressor
bagging_model = BaggingRegressor(base_model, random_state=42)

# Combiner les deux ensembles de paramètres
param_grid = {**base_model_params, **bagging_params}

# Utiliser GridSearchCV
grid_search = GridSearchCV(
    estimator=bagging_model,
    param_grid=param_grid,
    scoring='neg_mean_absolute_error',  # Utilisez 'neg_mean_absolute_error' pour la régression
    cv=3,
    verbose=3)

grid_search.fit(X_train, y_train)
print("Meilleurs paramètres : ", grid_search.best_params_)

best_model = grid_search.best_estimator_

y_pred = best_model.predict(X_test)

mae = mean_absolute_error(y_test, y_pred)
print("Mean Absolute Error (MAE) sur les données de test : ", mae)

## Xgboost 1

## Catboost 1

computing time 49min

In [15]:

catb = CatBoostRegressor(iterations=10000,  # Nombre d'itérations (peut être ajusté)
                          depth=6,  # Profondeur de l'arbre (peut être ajusté)
                          learning_rate=0.1,  # Taux d'apprentissage (peut être ajusté)
                          loss_function='RMSE',  # Fonction de perte pour la régression
                          random_seed=42,
                          task_type='CPU',
                          thread_count=-1)

catb.fit(X_train, y_train, eval_set=(X_test, y_test), early_stopping_rounds=50, verbose=100)

predictions = catb.predict(X_test)

mae = mean_absolute_error(y_test, predictions)
print("Mean Absolute Error (MAE):", mae)

0:	learn: 166.0466744	test: 166.0927154	best: 166.0927154 (0)	total: 329ms	remaining: 54m 52s
100:	learn: 19.9021623	test: 19.8023282	best: 19.8023282 (100)	total: 30.2s	remaining: 49m 16s
200:	learn: 16.3895228	test: 16.3450710	best: 16.3450710 (200)	total: 60s	remaining: 48m 44s
300:	learn: 14.6005017	test: 14.5744841	best: 14.5744841 (300)	total: 1m 29s	remaining: 48m 6s
400:	learn: 13.3583934	test: 13.3527044	best: 13.3527044 (400)	total: 1m 58s	remaining: 47m 28s
500:	learn: 12.4735322	test: 12.4949177	best: 12.4949177 (500)	total: 2m 28s	remaining: 46m 51s
600:	learn: 11.8248216	test: 11.8637427	best: 11.8637427 (600)	total: 2m 57s	remaining: 46m 19s
700:	learn: 11.3208447	test: 11.3706236	best: 11.3706236 (700)	total: 3m 27s	remaining: 45m 52s
800:	learn: 10.8895149	test: 10.9501880	best: 10.9501880 (800)	total: 3m 57s	remaining: 45m 23s
900:	learn: 10.5598394	test: 10.6378579	best: 10.6378579 (900)	total: 4m 26s	remaining: 44m 53s
1000:	learn: 10.2236002	test: 10.3213196	best: 

In [17]:
data_test["Ewltp (g/km)"] = catb.predict(data_test.drop(columns='ID'))
data_test[["ID","Ewltp (g/km)"]].to_csv("results/new_simple_catboosting.csv", index=False)

## Catboost 2

https://towardsdatascience.com/catboost-regression-in-6-minutes-3487f3e5b329

### Split

In [None]:
X_train, X_test, y_train, y_test = train_test_split(data_train.drop(columns="Ewltp (g/km)"), data_train["Ewltp (g/km)"], test_size = 0.2, random_state=42)

train_dataset = Pool(X_train, y_train) 
test_dataset = Pool(X_test, y_test)

In [None]:

catb = CatBoostRegressor(iterations=10000,  # Nombre d'itérations (peut être ajusté)
                          depth=6,  # Profondeur de l'arbre (peut être ajusté)
                          learning_rate=0.1,  # Taux d'apprentissage (peut être ajusté)
                          loss_function='RMSE',  # Fonction de perte pour la régression
                          random_seed=42,
                          task_type='CPU',
                          thread_count=-1)

grid = {'iterations': [100, 150, 200],
        'learning_rate': [0.03, 0.1],
        'depth': [2, 4, 6, 8],
        'l2_leaf_reg': [0.2, 0.5, 1, 3]}

catb.grid_search(grid, train_dataset)

predictions = catb.predict(X_test)

mae = mean_absolute_error(y_test, predictions)
print("Mean Absolute Error (MAE):", mae)