# Import Data + Packages

In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import KFold, train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.ensemble import RandomForestRegressor, HistGradientBoostingRegressor,BaggingRegressor,AdaBoostRegressor
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, OrdinalEncoder, RobustScaler, PolynomialFeatures, TargetEncoder
from sklearn.metrics import mean_absolute_error
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from category_encoders import LeaveOneOutEncoder
from category_encoders.count import CountEncoder
from category_encoders.cat_boost import CatBoostEncoder
import joblib

from catboost import CatBoostRegressor, Pool
import lightgbm as lgb
import xgboost as xgb
from xgboost.callback import EarlyStopping, LearningRateScheduler

from pytorch_tabnet.tab_model import TabNetRegressor

pd.set_option('display.max_columns', None)
pd.set_option('display.float_format', '{:.2f}'.format)
sns.set_theme(style="ticks", palette="pastel")

In [2]:
data_train= pd.read_csv("data/train.csv",sep=",",low_memory=False)
data_test = pd.read_csv("data/test.csv",sep=",",low_memory=False)

data_train.name="data_train"
data_test.name="data_test"

# Preprocessing



## Récupération d'observations

Lorsque la voiture est électrique: on peut se permettre de set `ec(cm3)`, `Fuel consumption `, `z (Wh/km)` à 0

lorsque la voiture n'est pas hybride / électrique : on peut mettre `Electric range (km)` à 0

A voir si on choisir de prendre le traitement.

In [3]:
def group_fuel_types(category: str):
    if category in ['PETROL/ELECTRIC', 'DIESEL/ELECTRIC']:
        return "HYBRID"
    elif category in ['NG-BIOMETHANE', 'HYDROGEN', 'NG','E85']:
        return "BIO-FUEL"
    elif category in ['PETROL','LPG'] :
        return 'PETROL'
    else:
        return category   

def recup_electric(df):
    #  ec (cm3)
    df.loc[(df["Ft"].apply(group_fuel_types)=="ELECTRIC") & (df["ec (cm3)"].isna()),"ec (cm3)"] = 0
    #  Fm
    df.loc[(df["Ft"].apply(group_fuel_types) =="ELECTRIC") & (df["Fm"].isna()),"Fm"] = "E"
    df.loc[(df["Ft"].apply(group_fuel_types) =="HYBRID") & (df["Fm"].isna()),"Fm"] = "P"

    # Electric range (km)
    df.loc[~(df["Ft"].apply(group_fuel_types).isin(["ELECTRIC", "HYBRID"])) & (df["Electric range (km)"].isna()),"Electric range (km)"] = 0

    #  Fuel consumption 
    df.loc[(df["Ft"].apply(group_fuel_types) =="ELECTRIC") & (df["Fuel consumption "].isna()),"Fuel consumption "] = 0

    #  z (Wh/km)
    df.loc[~(df["Ft"].apply(group_fuel_types).isin(["ELECTRIC", "HYBRID"])) & (df["z (Wh/km)"].isna()),"z (Wh/km)"] = 0
    pass

recup_electric(data_train)
recup_electric(data_test)

Run if you want to use **mean_group_imputer** then run **outliers** and then the **delete columns** finally the mean_group_imputer

In [3]:
def group_fuel_types(category: str):
    if category in ['PETROL/ELECTRIC', 'DIESEL/ELECTRIC']:
        return "HYBRID"
    elif category in ['NG-BIOMETHANE', 'HYDROGEN', 'NG','E85']:
        return "BIO-FUEL"
    elif category in ['PETROL','LPG'] :
        return 'PETROL'
    else:
        return category   
    
def recup_fuel_mode(df):
    df.loc[(df["Ft"].apply(group_fuel_types) =="ELECTRIC") & (df["Fm"].isna()),"Fm"] = "E"
    df.loc[(df["Ft"].apply(group_fuel_types) =="HYBRID") & (df["Fm"].isna()),"Fm"] = "P"
    pass

recup_fuel_mode(data_train)
recup_fuel_mode(data_test)

## Delete columns

Supprimer les colonnes avec 1 seul valeur unique (aucune info) ou 0 valeur unique (que des NaN)

In [4]:
valeurs_uniques = {}
nombre_val_unique={}
for col in data_train.columns:
    valeurs_uniques[col]=data_train[col].unique().tolist()
    nombre_val_unique[col]=data_train[col].nunique()

for element in nombre_val_unique:
    if nombre_val_unique[element]<=1:
        print(f"colonne supprimée: {element}")
        data_train.drop(columns=element, inplace=True)
        data_test.drop(columns=element, inplace=True)

colonne supprimée: MMS
colonne supprimée: r
colonne supprimée: Ernedc (g/km)
colonne supprimée: De
colonne supprimée: Vf
colonne supprimée: Status


Supprimer les colonnes avec **+ de 50%** de NaN

In [5]:
for col in data_train.columns:
    if (data_train[col].isna().sum()/data_train.shape[0] > 0.5):
        print(f"colonne supprimée: {col}")
        data_train.drop(columns=col, inplace=True)
        data_test.drop(columns=col, inplace=True)


colonne supprimée: Enedc (g/km)


Supprimer les `Date` et `ID` (seulement pour train)

In [6]:
data_train.drop(columns=['Date of registration','ID'], inplace=True)
data_test.drop(columns='Date of registration', inplace=True)

print(f"colonne supprimée pour data_train: Date of registration, ID")
print(f"colonne supprimée pour data_test: Date of registration")

colonne supprimée pour data_train: Date of registration, ID
colonne supprimée pour data_test: Date of registration


In [7]:
col_categoricals = data_test.select_dtypes(include="object").columns.tolist()
col_numericals = [col for col in data_test.columns if col not in col_categoricals]
col_numericals.remove("ID")

## Transformer les doublons en moyenne de Y

En considérant qu'on a drop: `Date of registration`,`ID` on va prendre la moyenne des Y pour lesquelles les caractéristiques sont des doublons.



In [7]:
colus=list(data_train.columns)
colus.remove('Ewltp (g/km)')
duplicates = data_train.drop(columns="Ewltp (g/km)").duplicated(keep=False)

mean_target = data_train[duplicates].groupby(colus)['Ewltp (g/km)'].mean()

# Supprimer les lignes en double et réindexer le DataFrame
data_train = pd.concat([data_train[~duplicates], mean_target.reset_index()], ignore_index=True)
data_train.shape


(1163819, 26)

## Outliers 

Utilisation de l'écart interquartile pour identifier les valeurs aberrantes.

Imputation des outliers:

Fixer les valeurs aberrantes à un certain pourcentage (par exemple, 5e et 95e percentiles).

**on pourrait aussi tenter d'imputer par la médiane si cela n'aboutit pas** 

### windorization of outliers

In [8]:
quantiles={}

def winsorize_outliers(data, column_name, lower_percentile=5, upper_percentile=95,train=True):
    """
    Detects and imputes outliers using winsorizing for a specific column in a DataFrame.

    Parameters:
    - data: Pandas DataFrame, input data
    - column_name: str, name of the column to be winsorized
    - lower_percentile: int, lower percentile for winsorizing (default: 5)
    - upper_percentile: int, upper percentile for winsorizing (default: 95)

    Returns:
    - winsorized_data: Pandas DataFrame, data with outliers winsorized for the specified column
    """

    column_data = data[column_name]
    if train:
        quantiles["q1"] = np.percentile(column_data, lower_percentile)
        quantiles["q3"] = np.percentile(column_data, upper_percentile)
        iqr = quantiles["q3"] - quantiles["q1"]
        quantiles["lower_bound"] = quantiles["q1"] - 1.5 * iqr
        quantiles["upper_bound"] = quantiles["q3"] + 1.5 * iqr

    data[column_name] = np.clip(column_data, quantiles["lower_bound"], quantiles["upper_bound"])

    return data

for col in col_numericals:
    data_train=winsorize_outliers(data_train,col)
    data_test =winsorize_outliers(data_test,col,train=False)

### Median imputing

In [8]:
quantiles={}

def replace_outliers_with_median(data, column_name,train=True):
    """
    Detects and replaces outliers with the median for a specific column in a DataFrame.

    Parameters:
    - data: Pandas DataFrame, input data
    - column_name: str, name of the column to be processed
    - train: bool

    Returns:
    - data_with_median: Pandas DataFrame, data with outliers replaced by the median for the specified column
    """

    column_data = data[column_name]

    if train:
        q1 = np.percentile(column_data,25)
        q3 = np.percentile(column_data, 75)
        iqr = q3 - q1
        quantiles["lower_bound"] = q1 - 1.5 * iqr
        quantiles["upper_bound"] = q3 + 1.5 * iqr
        quantiles[column_name]=column_data.median()


    outliers_mask = (column_data < quantiles["lower_bound"]) | (column_data > quantiles["upper_bound"])

    data.loc[outliers_mask, column_name] = quantiles[column_name]
    return data


for col in col_numericals:
    data_train = replace_outliers_with_median(data_train,col)
    data_test =replace_outliers_with_median(data_test,col,train=False)

## Impute NaN by median/mode



In [9]:
imputers={}
_coefficient_variation= lambda series : series.std()/series.mean()

def fill_missing_values(colname : str,data:pd.DataFrame) -> None:
    
    if data[colname].dtype in ["float64"]:
        if _coefficient_variation(data[colname]) > 0.15 :
            imputers[colname]=SimpleImputer(missing_values=np.nan,strategy="median")
        else:
            imputers[colname]=SimpleImputer(missing_values=np.nan,strategy="mean")
    else:
        imputers[colname]=SimpleImputer(missing_values=np.nan, strategy='most_frequent')
    imputers[colname].fit(data[colname].to_numpy().reshape(-1,1))
    pass

for col in data_test.columns[1:]:
    fill_missing_values(col,data_train)
    data_train[col]=pd.Series(imputers[col].transform(data_train[col].to_numpy().reshape(-1,1)).flatten())
    data_test[col]=pd.Series(imputers[col].transform(data_test[col].to_numpy().reshape(-1,1)).flatten())


group imputer

In [31]:
imputers={}

def mean_group_imputer(col,df,train=True):
    if (df[col].isna().sum()/df[col].shape[0] >0.5) or (df[col].isna().sum()/df[col].shape[0] ==1): #data_train[col].isna().sum()/data_train.shape[0] > 0.5
        pass
    else:
        if df[col].dtype in ["float64"]: # numérique
            if train:
                imputers[col]={}
                for energy in list(df["Ft"].unique()):
                    imputers[col][energy]=(df.loc[(~df[col].isna())& (df["Ft"]==energy),col].median())
                    df.loc[(df[col].isna())& (df["Ft"]==energy),col]= imputers[col][energy]
            else:
                for energy in list(df["Ft"].unique()):
                    df.loc[(df[col].isna())& (df["Ft"]==energy),col]= imputers[col][energy]
        else:
            if train:
                imputers[col]={}
                for energy in list(df["Ft"].unique()):
                    try:
                        imputers[col][energy]=(df.loc[(~df[col].isna())& (df["Ft"]==energy),col].mode()).iloc[0]
                        df.loc[(df[col].isna())& (df["Ft"]==energy),col]= imputers[col][energy]
                    except:
                        imputers[col][energy]="UNKNOWN"
                        df.loc[(df[col].isna())& (df["Ft"]==energy),col]= imputers[col][energy]
            else:
                for energy in list(df["Ft"].unique()):
                    df.loc[(df[col].isna())& (df["Ft"]==energy),col]= imputers[col][energy]
        print(f"Colonne {col} de {df.name} est imputée")
        pass

for col in data_test.columns[1:]:
    mean_group_imputer(col,data_train)
    mean_group_imputer(col,data_test,False)

Colonne Country de data_train est imputée
Colonne Country de data_test est imputée
Colonne VFN de data_train est imputée
Colonne VFN de data_test est imputée
Colonne Mp de data_train est imputée
Colonne Mp de data_test est imputée
Colonne Mh de data_train est imputée
Colonne Mh de data_test est imputée
Colonne Man de data_train est imputée
Colonne Man de data_test est imputée
Colonne Tan de data_train est imputée
Colonne Tan de data_test est imputée
Colonne T de data_train est imputée
Colonne T de data_test est imputée
Colonne Va de data_train est imputée
Colonne Va de data_test est imputée
Colonne Ve de data_train est imputée
Colonne Ve de data_test est imputée
Colonne Mk de data_train est imputée
Colonne Mk de data_test est imputée
Colonne Cn de data_train est imputée
Colonne Cn de data_test est imputée
Colonne Ct de data_train est imputée
Colonne Ct de data_test est imputée
Colonne Cr de data_train est imputée
Colonne Cr de data_test est imputée
Colonne m (kg) de data_train est impu

## Encode categorical columns

Many choices:
- Customized encoding
- Label/Ordinal encoding
- Target encoding
- Impact encoding

Customized encoding. 

- Count Encoder (nunique >=15)
- OHE Encoder (nunique < 15) rajoute 35 colonnes 
```
Mp 10  valeurs uniques
Ct 5  valeurs uniques
Cr 3  valeurs uniques
Ft 11  valeurs uniques
Fm 6  valeurs uniques
```
- Catboost Encoder (nunique>=15) **USES THE TARGET TO FIT**
```
Country 29  valeurs uniques
VFN 8456  valeurs uniques
Mh 95  valeurs uniques
Man 104  valeurs uniques
Tan 6318  valeurs uniques
T 1506  valeurs uniques
Va 5413  valeurs uniques
Ve 25570  valeurs uniques
Mk 694  valeurs uniques
Cn 8323  valeurs uniques
IT 487  valeurs uniques
```

In [10]:
encoders = {}

def cat_boost_encoder(col,df,train=True):
    if train:
        encoders[col]= CatBoostEncoder(random_state=42)
        df[col]=encoders[col].fit_transform(df[[col]],df[['Ewltp (g/km)']])
    else:
        df[col]=encoders[col].transform(df[[col]])
    pass

def ohe_encoder(col,df,train=True):
    if train:
        encoders[col] = OneHotEncoder(sparse_output=False, drop='first',handle_unknown='ignore') #sparse = false sinn jsais pas gérer
        ohe_features = encoders[col].fit_transform(df[[col]])
    else: 
        ohe_features = encoders[col].transform(df[[col]])

    ohe_features = pd.DataFrame(ohe_features, columns=encoders[col].get_feature_names_out([col]))

    df.drop(columns=col, inplace=True)
    df.reset_index(drop=True, inplace=True)
    df = pd.concat([df, ohe_features], axis=1)
    return df

def count_encoder(col, df, train=True):
    if train:
        encoders[col]=CountEncoder(handle_unknown='value')
        df[col]=encoders[col].fit_transform(df[[col]])
    else:
        df[col]=encoders[col].transform(df[[col]])
    pass

def ordinal_encoder(colname:str,data:pd.DataFrame,train=True):
    if train:
        encoders[colname]=OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)
        data[colname]=encoders[colname].fit_transform(data[[colname]])
    else:
        data[colname]=encoders[colname].transform(data[[colname]])
    pass

for col in col_categoricals:
    if nombre_val_unique[col]>=15: #eventually replace by catboost encoder but careful cuz of Target ! (do TTS first)
        count_encoder(col,data_train)
        count_encoder(col,data_test,False)
        print(f"encoding count : {col}")
    else:
        data_train=ohe_encoder(col,data_train) #reassign cuz you don't know how to do it...
        data_test=ohe_encoder(col,data_test, False)
        print(f"encoding OHE : {col}")


encoding count : Country
encoding count : VFN
encoding OHE : Mp
encoding count : Mh
encoding count : Man
encoding count : Tan
encoding count : T
encoding count : Va
encoding count : Ve
encoding count : Mk
encoding count : Cn
encoding OHE : Ct
encoding OHE : Cr
encoding OHE : Ft
encoding OHE : Fm
encoding count : IT


Full Count encoder

In [10]:
encoders={}
def count_encoder(col, df, train=True):
    if train:
        encoders[col]=CountEncoder(handle_unknown='value')
        df[col]=encoders[col].fit_transform(df[[col]])
    else:
        df[col]=encoders[col].transform(df[[col]])
    pass

for col in col_categoricals:
    count_encoder(col,data_train)
    count_encoder(col,data_test,train=False)
    print(f"encoding count : {col}")

encoding count : Country
encoding count : VFN
encoding count : Mp
encoding count : Mh
encoding count : Man
encoding count : Tan
encoding count : T
encoding count : Va
encoding count : Ve
encoding count : Mk
encoding count : Cn
encoding count : Ct
encoding count : Cr
encoding count : Ft
encoding count : Fm
encoding count : IT


Count + ordinal encoder

In [10]:
encoders={}

def count_encoder(col, df, train=True):
    if train:
        encoders[col]=CountEncoder(handle_unknown='value')
        df[col]=encoders[col].fit_transform(df[[col]])
    else:
        df[col]=encoders[col].transform(df[[col]])
    pass

def ordinal_encoder(colname:str,data:pd.DataFrame,train=True):
    if train:
        encoders[colname]=OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)
        data[colname]=encoders[colname].fit_transform(data[[colname]])
    else:
        data[colname]=encoders[colname].transform(data[[colname]])
    pass

for col in col_categoricals:
    if nombre_val_unique[col]>=15: #eventually replace by catboost encoder but careful cuz of Target ! (do TTS first)
        count_encoder(col,data_train)
        count_encoder(col,data_test,False)
        print(f"encoding count : {col}")
    else:
        ordinal_encoder(col,data_train) #reassign cuz you don't know how to do it...
        ordinal_encoder(col,data_test, False)
        print(f"encoding ordinal : {col}")


encoding count : Country
encoding count : VFN
encoding ordinal : Mp
encoding count : Mh
encoding count : Man
encoding count : Tan
encoding count : T
encoding count : Va
encoding count : Ve
encoding count : Mk
encoding count : Cn
encoding ordinal : Ct
encoding ordinal : Cr
encoding ordinal : Ft
encoding ordinal : Fm
encoding count : IT


Label/Ordinal encoding

In [10]:
encoders={}
def ordinal_encoder(colname:str,data:pd.DataFrame,train=True):
    if train:
        encoders[colname]=OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)
        data[colname]=encoders[colname].fit_transform(data[[colname]])
    else:
        data[colname]=encoders[colname].transform(data[[colname]])


for col in col_categoricals:
    ordinal_encoder(col,data_train)
    ordinal_encoder(col,data_test,train=False)

Target Encoding

In [10]:
encoders={}
def target_encoder(colname:str,data:pd.DataFrame,train=True):
    if train:
        encoders[colname]=TargetEncoder(target_type='continuous', smooth='auto',random_state=42)
        data[colname]=encoders[colname].fit_transform(data[[colname]],data['Ewltp (g/km)'])
        pass
    else:
        data[colname]=encoders[colname].transform(data[[colname]])
        pass

for col in col_categoricals:
    target_encoder(col,data_train)
    target_encoder(col,data_test,False)

Impact Encoding

Proposé par Sam B. J'ai utilisé Chat GPT pour l'implémenter honnêtement.

In [10]:

train, test = train_test_split(data_train, test_size=0.33, random_state=42)

train.reset_index(drop=True, inplace=True)
test.reset_index(drop=True, inplace=True)

X_train, y_train = train.drop(columns=["Ewltp (g/km)"]), train["Ewltp (g/km)"]
X_test, y_test = test.drop(columns=["Ewltp (g/km)"]), test["Ewltp (g/km)"]

encoders={}
def impact_encoder(colname:str,data:pd.DataFrame,target=None,train=True):
    if train:
        encoders[colname]=LeaveOneOutEncoder(handle_unknown="value")
        data[colname]=encoders[colname].fit_transform(data[[colname]],target)
        pass
    else:
        data[colname]=encoders[colname].transform(data[[colname]])
        pass

for col in col_categoricals:
    impact_encoder(col,X_train,target=y_train)
    impact_encoder(col,X_test,train=False)
    impact_encoder(col,data_test,train=False)

## Feature Engineering

Doit-on utiliser ces variables ?

In [9]:
def create_conforme(df):
    df['conforme'] = df['Tan'].isna()
    df['conforme'] = df['conforme'].apply(lambda x: 1 if x==False else 0)
    pass

create_conforme(data_train)
create_conforme(data_test)


In [12]:
def compute_surface(obs):
    max_largeur= max(obs['At1 (mm)'], obs['At2 (mm)'])
    return obs['W (mm)']*obs['At1 (mm)'] if max_largeur == obs['At1 (mm)'] else obs['W (mm)'] * obs['At2 (mm)']

def create_surface(df):
    df['surface']= df.apply(compute_surface, axis=1)
    pass

create_surface(data_train)
create_surface(data_test)


## Split 

In [11]:
train, test = train_test_split(data_train, test_size=0.33, random_state=42)

train.reset_index(drop=True, inplace=True)
test.reset_index(drop=True, inplace=True)

X_train, y_train = train.drop(columns=["Ewltp (g/km)"]), train["Ewltp (g/km)"]
X_test, y_test = test.drop(columns=["Ewltp (g/km)"]), test["Ewltp (g/km)"]

# Model Testing

## Random Forest 1

Use all columns except those dropped (MMS, r, Ernedc (g/km), De,Vf,StatusEnedc (g/km),z (Wh/km),Electric range (km),Date of registration). 

ordinal encoding. Strandard fill NaN. Random forest. No feature Engineering.

Computation time: 41min

In [12]:
random_forest = RandomForestRegressor(random_state=42, n_jobs=-1)

random_forest.fit(X_train, y_train)

y_pred = random_forest.predict(X_test)

mae = mean_absolute_error(y_test, y_pred)
print("Mean Absolute Error (MAE):", mae)

Mean Absolute Error (MAE): 2.8986731992878796


In [13]:
data_test["Ewltp (g/km)"] = random_forest.predict(data_test.drop(columns='ID'))
data_test[["ID","Ewltp (g/km)"]].to_csv("data/new_simple_rf.csv", index=False)

In [15]:
joblib.dump(random_forest, 'models/random_forest_simple_model.joblib')

['random_forest_simple_model.joblib']

## Random Forest 2

Use all columns except those dropped (MMS, r, Ernedc (g/km), De,Vf,StatusEnedc (g/km),z (Wh/km),Electric range (km),Date of registration). 

**target** encoding. Strandard fill NaN. Random forest. No feature Engineering.

Computation time: 23min (Home)

In [12]:
random_forest = RandomForestRegressor(random_state=42, n_jobs=-1)

random_forest.fit(X_train, y_train)

y_pred = random_forest.predict(X_test)

mae = mean_absolute_error(y_test, y_pred)
print("Mean Absolute Error (MAE):", mae)

Mean Absolute Error (MAE): 2.986984486605863


In [None]:
data_test["Ewltp (g/km)"] = random_forest.predict(data_test.drop(columns='ID'))
data_test[["ID","Ewltp (g/km)"]].to_csv("results/new_target_rf.csv", index=False)

## Random Forest 3

Use all columns except those dropped (MMS, r, Ernedc (g/km), De,Vf,StatusEnedc (g/km),z (Wh/km),Electric range (km),Date of registration). 

**impact** encoding. Strandard fill NaN. Random forest. No feature Engineering.

Computation time: 22 (Home)

In [12]:
random_forest = RandomForestRegressor(random_state=42, n_jobs=-1)

random_forest.fit(X_train, y_train)

y_pred = random_forest.predict(X_test)

mae = mean_absolute_error(y_test, y_pred)
print("Mean Absolute Error (MAE):", mae)

Mean Absolute Error (MAE): 26.539493069959914


In [None]:
data_test["Ewltp (g/km)"] = random_forest.predict(data_test.drop(columns='ID'))
data_test[["ID","Ewltp (g/km)"]].to_csv("results/new_impact_rf.csv", index=False)

## Random Forest 4

Use all columns except those dropped (MMS, r, Ernedc (g/km), De,Vf,StatusEnedc (g/km),z (Wh/km),Electric range (km),Date of registration). 

ordinal encoding. Strandard fill NaN. Random forest. No feature Engineering. **use mean of Y**

Computation time: 1.33min (Home)

In [13]:
random_forest = RandomForestRegressor(random_state=42, n_jobs=-1)

random_forest.fit(X_train, y_train)

y_pred = random_forest.predict(X_test)

mae = mean_absolute_error(y_test, y_pred)
print("Mean Absolute Error (MAE):", mae)

Mean Absolute Error (MAE): 3.6239323258632914


In [None]:
data_test["Ewltp (g/km)"] = random_forest.predict(data_test.drop(columns='ID'))
data_test[["ID","Ewltp (g/km)"]].to_csv("results/new_mean_rf.csv", index=False)

## Random Forest 5

Use all columns except those dropped (MMS, r, Ernedc (g/km), De,Vf,StatusEnedc (g/km),z (Wh/km),Electric range (km),Date of registration). 

ordinal encoding. Strandard fill NaN. Random forest. No feature Engineering. **feature engineering variables (conformes, surface)**

Computation time: 23min (Home)

In [14]:
random_forest = RandomForestRegressor(random_state=42, n_jobs=-1)

random_forest.fit(X_train, y_train)

y_pred = random_forest.predict(X_test)

mae = mean_absolute_error(y_test, y_pred)
print("Mean Absolute Error (MAE):", mae)

Mean Absolute Error (MAE): 2.8998117328283994


In [15]:
data_test["Ewltp (g/km)"] = random_forest.predict(data_test.drop(columns='ID'))
data_test[["ID","Ewltp (g/km)"]].to_csv("results/new_fe_rf.csv", index=False)

## Random Forest 6

Récupération des NaN avec les colonnes électriques etc. 

ordinal encoder, outliers winsorize, median/mode impute. no feature engineering.

computing time : 23(Home)

In [12]:
random_forest = RandomForestRegressor(random_state=42, n_jobs=-1)

random_forest.fit(X_train, y_train)

y_pred = random_forest.predict(X_test)

mae = mean_absolute_error(y_test, y_pred)
print("Mean Absolute Error (MAE):", mae)

Mean Absolute Error (MAE): 2.8953469493264583


In [13]:
data_test["Ewltp (g/km)"] = random_forest.predict(data_test.drop(columns='ID'))
data_test[["ID","Ewltp (g/km)"]].to_csv("results/new_all_col_rf.csv", index=False)

## Random Forest 7

Récupération des NaN avec les colonnes électriques etc. 

ordinal encoder, outliers **median**, median/mode impute. no feature engineering.

computing time : 24min(Home)

In [12]:
random_forest = RandomForestRegressor(random_state=42, n_jobs=-1)

random_forest.fit(X_train, y_train)

y_pred = random_forest.predict(X_test)

mae = mean_absolute_error(y_test, y_pred)
print("Mean Absolute Error (MAE):", mae)

Mean Absolute Error (MAE): 2.8953469493264583


In [None]:
data_test["Ewltp (g/km)"] = random_forest.predict(data_test.drop(columns='ID'))
data_test[["ID","Ewltp (g/km)"]].to_csv("results/new_all_col_outliers_med_rf.csv", index=False)

## Random Forest 8



In [12]:
random_forest = RandomForestRegressor(random_state=42, n_jobs=-1,criterion="absolute_error")

random_forest.fit(X_train, y_train)

y_pred = random_forest.predict(X_test)

mae = mean_absolute_error(y_test, y_pred)
print("Mean Absolute Error (MAE):", mae)

KeyboardInterrupt: 

In [None]:
data_test["Ewltp (g/km)"] = random_forest.predict(data_test.drop(columns='ID'))
data_test[["ID","Ewltp (g/km)"]].to_csv("results/new_abs_err_rf.csv", index=False)

## Bagging 1
Use all columns except those dropped (MMS, r, Ernedc (g/km), De,Vf,StatusEnedc (g/km),z (Wh/km),Electric range (km),Date of registration). 

ordinal encoding. Strandard fill NaN. Random forest. No feature Engineering. estimator = DecistionTreeRegressor

Computation time: 2,14min (Home)

In [12]:
bag=BaggingRegressor(estimator=DecisionTreeRegressor(),n_jobs=-1,bootstrap=True,oob_score=True, random_state=42)

bag.fit(X_train, y_train)

y_pred = bag.predict(X_test)

mae = mean_absolute_error(y_test, y_pred)
print("Mean Absolute Error (MAE):", mae)

  warn(


Mean Absolute Error (MAE): 2.929723205821804


In [16]:
data_test["Ewltp (g/km)"] = bag.predict(data_test.drop(columns='ID'))
data_test[["ID","Ewltp (g/km)"]].to_csv("results/new_tree_bagging.csv", index=False)

In [17]:
bag.get_params()

{'base_estimator': 'deprecated',
 'bootstrap': True,
 'bootstrap_features': False,
 'estimator__ccp_alpha': 0.0,
 'estimator__criterion': 'squared_error',
 'estimator__max_depth': None,
 'estimator__max_features': None,
 'estimator__max_leaf_nodes': None,
 'estimator__min_impurity_decrease': 0.0,
 'estimator__min_samples_leaf': 1,
 'estimator__min_samples_split': 2,
 'estimator__min_weight_fraction_leaf': 0.0,
 'estimator__random_state': None,
 'estimator__splitter': 'best',
 'estimator': DecisionTreeRegressor(),
 'max_features': 1.0,
 'max_samples': 1.0,
 'n_estimators': 10,
 'n_jobs': -1,
 'oob_score': True,
 'random_state': 42,
 'verbose': 0,
 'warm_start': False}

## Bagging 2
Use all columns except those dropped (MMS, r, Ernedc (g/km), De,Vf,StatusEnedc (g/km),z (Wh/km),Electric range (km),Date of registration). 

ordinal encoding. Strandard fill NaN. Random forest. No feature Engineering. **n_estimator = 20**

Computation time: 5min (Home)

In [13]:
bag=BaggingRegressor(estimator=DecisionTreeRegressor(),n_jobs=-1,bootstrap=True,oob_score=True, random_state=42,verbose=3,n_estimators=20)

bag.fit(X_train, y_train)

y_pred = bag.predict(X_test)

mae = mean_absolute_error(y_test, y_pred)
print("Mean Absolute Error (MAE):", mae)

[Parallel(n_jobs=20)]: Using backend LokyBackend with 20 concurrent workers.
[Parallel(n_jobs=20)]: Done   2 out of  20 | elapsed:  4.4min remaining: 39.8min
[Parallel(n_jobs=20)]: Done   9 out of  20 | elapsed:  4.5min remaining:  5.5min
[Parallel(n_jobs=20)]: Done  16 out of  20 | elapsed:  4.5min remaining:  1.1min
[Parallel(n_jobs=20)]: Done  20 out of  20 | elapsed:  4.5min finished
  warn(
[Parallel(n_jobs=20)]: Using backend LokyBackend with 20 concurrent workers.
[Parallel(n_jobs=20)]: Done   2 out of  20 | elapsed:    5.4s remaining:   49.1s
[Parallel(n_jobs=20)]: Done   9 out of  20 | elapsed:    5.9s remaining:    7.2s
[Parallel(n_jobs=20)]: Done  16 out of  20 | elapsed:    6.3s remaining:    1.5s
[Parallel(n_jobs=20)]: Done  20 out of  20 | elapsed:    6.5s finished


Mean Absolute Error (MAE): 2.9137636972220546


In [14]:
data_test["Ewltp (g/km)"] = bag.predict(data_test.drop(columns='ID'))
data_test[["ID","Ewltp (g/km)"]].to_csv("results/new_20_estim_bagging.csv", index=False)

[Parallel(n_jobs=20)]: Using backend LokyBackend with 20 concurrent workers.
[Parallel(n_jobs=20)]: Done   2 out of  20 | elapsed:    1.8s remaining:   17.0s
[Parallel(n_jobs=20)]: Done   9 out of  20 | elapsed:    2.6s remaining:    3.3s
[Parallel(n_jobs=20)]: Done  16 out of  20 | elapsed:    3.1s remaining:    0.7s
[Parallel(n_jobs=20)]: Done  20 out of  20 | elapsed:    3.3s finished


## Bagging 3
Use all columns except those dropped (MMS, r, Ernedc (g/km), De,Vf,StatusEnedc (g/km),z (Wh/km),Electric range (km),Date of registration). 

ordinal encoding. Strandard fill NaN. Random forest. No feature Engineering. **n_estimator = 30**

Computation time: 7min (Home)

In [26]:
bag=BaggingRegressor(estimator=DecisionTreeRegressor(),n_jobs=-1,bootstrap=True,oob_score=True, random_state=42,verbose=3,n_estimators= 30)

bag.fit(X_train, y_train)

y_pred = bag.predict(X_test)

mae = mean_absolute_error(y_test, y_pred)
print("Mean Absolute Error (MAE):", mae)

[Parallel(n_jobs=24)]: Using backend LokyBackend with 24 concurrent workers.
[Parallel(n_jobs=24)]: Done   4 out of  24 | elapsed:  5.3min remaining: 26.7min
[Parallel(n_jobs=24)]: Done  13 out of  24 | elapsed:  5.5min remaining:  4.7min
[Parallel(n_jobs=24)]: Done  22 out of  24 | elapsed:  6.7min remaining:   36.2s
[Parallel(n_jobs=24)]: Done  24 out of  24 | elapsed:  6.7min finished
  warn(
[Parallel(n_jobs=24)]: Using backend LokyBackend with 24 concurrent workers.
[Parallel(n_jobs=24)]: Done   4 out of  24 | elapsed:    7.2s remaining:   36.2s
[Parallel(n_jobs=24)]: Done  13 out of  24 | elapsed:    8.2s remaining:    6.9s
[Parallel(n_jobs=24)]: Done  22 out of  24 | elapsed:    9.8s remaining:    0.8s
[Parallel(n_jobs=24)]: Done  24 out of  24 | elapsed:    9.9s finished


Mean Absolute Error (MAE): 2.9082315288155627


In [27]:
data_test["Ewltp (g/km)"] = bag.predict(data_test.drop(columns='ID'))
data_test[["ID","Ewltp (g/km)"]].to_csv("results/new_30_estim_bagging.csv", index=False)

[Parallel(n_jobs=24)]: Using backend LokyBackend with 24 concurrent workers.
[Parallel(n_jobs=24)]: Done   4 out of  24 | elapsed:    3.5s remaining:   18.1s
[Parallel(n_jobs=24)]: Done  13 out of  24 | elapsed:    4.3s remaining:    3.6s
[Parallel(n_jobs=24)]: Done  22 out of  24 | elapsed:    4.8s remaining:    0.3s
[Parallel(n_jobs=24)]: Done  24 out of  24 | elapsed:    4.9s finished


## Bagging 4
Use all columns except those dropped (MMS, r, Ernedc (g/km), De,Vf,StatusEnedc (g/km),z (Wh/km),Electric range (km),Date of registration). 

ordinal encoding. Strandard fill NaN. Random forest. No feature Engineering. **n_estimator = 50**

Computation time: 13min (Home)

In [12]:
bag=BaggingRegressor(estimator=DecisionTreeRegressor(),n_jobs=-1,bootstrap=True,oob_score=True, random_state=42,verbose=3,n_estimators= 50)

bag.fit(X_train, y_train)

y_pred = bag.predict(X_test)

mae = mean_absolute_error(y_test, y_pred)
print("Mean Absolute Error (MAE):", mae)

[Parallel(n_jobs=24)]: Using backend LokyBackend with 24 concurrent workers.
[Parallel(n_jobs=24)]: Done   4 out of  24 | elapsed: 10.8min remaining: 53.9min
[Parallel(n_jobs=24)]: Done  13 out of  24 | elapsed: 11.0min remaining:  9.3min
[Parallel(n_jobs=24)]: Done  22 out of  24 | elapsed: 11.2min remaining:  1.0min
[Parallel(n_jobs=24)]: Done  24 out of  24 | elapsed: 11.8min finished
[Parallel(n_jobs=24)]: Using backend LokyBackend with 24 concurrent workers.
[Parallel(n_jobs=24)]: Done   4 out of  24 | elapsed:   12.2s remaining:  1.0min
[Parallel(n_jobs=24)]: Done  13 out of  24 | elapsed:   14.0s remaining:   11.9s
[Parallel(n_jobs=24)]: Done  22 out of  24 | elapsed:   15.6s remaining:    1.3s
[Parallel(n_jobs=24)]: Done  24 out of  24 | elapsed:   15.7s finished


Mean Absolute Error (MAE): 2.9028932743761215


In [13]:
data_test["Ewltp (g/km)"] = bag.predict(data_test.drop(columns='ID'))
data_test[["ID","Ewltp (g/km)"]].to_csv("results/new_50_estim_bagging.csv", index=False)

[Parallel(n_jobs=24)]: Using backend LokyBackend with 24 concurrent workers.
[Parallel(n_jobs=24)]: Done   4 out of  24 | elapsed:    5.7s remaining:   28.8s
[Parallel(n_jobs=24)]: Done  13 out of  24 | elapsed:    7.5s remaining:    6.4s
[Parallel(n_jobs=24)]: Done  22 out of  24 | elapsed:    8.8s remaining:    0.7s
[Parallel(n_jobs=24)]: Done  24 out of  24 | elapsed:    9.0s finished


## Bagging 5
Use all columns except those dropped (MMS, r, Ernedc (g/km), De,Vf,StatusEnedc (g/km),z (Wh/km),Electric range (km),Date of registration). 

ordinal encoding. Strandard fill NaN. Random forest. No feature Engineering. **n_estimator = 10, bootstrap_features= True**

Computation time: 2,15min (Home)

In [14]:
bag=BaggingRegressor(estimator=DecisionTreeRegressor(),n_jobs=-1,bootstrap=True,oob_score=True, random_state=42,verbose=3,n_estimators= 10,bootstrap_features=True)

bag.fit(X_train, y_train)

y_pred = bag.predict(X_test)

mae = mean_absolute_error(y_test, y_pred)
print("Mean Absolute Error (MAE):", mae)

[Parallel(n_jobs=10)]: Using backend LokyBackend with 10 concurrent workers.
[Parallel(n_jobs=10)]: Done   3 out of  10 | elapsed:  1.7min remaining:  4.0min
[Parallel(n_jobs=10)]: Done   7 out of  10 | elapsed:  1.8min remaining:   46.2s
[Parallel(n_jobs=10)]: Done  10 out of  10 | elapsed:  2.0min finished
  warn(
[Parallel(n_jobs=10)]: Using backend LokyBackend with 10 concurrent workers.
[Parallel(n_jobs=10)]: Done   3 out of  10 | elapsed:    2.9s remaining:    6.9s
[Parallel(n_jobs=10)]: Done   7 out of  10 | elapsed:    3.2s remaining:    1.3s
[Parallel(n_jobs=10)]: Done  10 out of  10 | elapsed:    3.3s finished


Mean Absolute Error (MAE): 2.94623303681266


## Bagging 6
Use all columns except those dropped (MMS, r, Ernedc (g/km), De,Vf,StatusEnedc (g/km),z (Wh/km),Electric range (km),Date of registration). 

ordinal encoding. Strandard fill NaN. Random forest. No feature Engineering. **n_estimator = 20, oob_score=False**

Computation time: 4.35min (Home)

In [16]:
bag=BaggingRegressor(estimator=DecisionTreeRegressor(),n_jobs=-1,bootstrap=True,oob_score=False, random_state=42,verbose=3,n_estimators= 20)

bag.fit(X_train, y_train)

y_pred = bag.predict(X_test)

mae = mean_absolute_error(y_test, y_pred)
print("Mean Absolute Error (MAE):", mae)

[Parallel(n_jobs=20)]: Using backend LokyBackend with 20 concurrent workers.
[Parallel(n_jobs=20)]: Done   2 out of  20 | elapsed:  4.4min remaining: 39.3min
[Parallel(n_jobs=20)]: Done   9 out of  20 | elapsed:  4.4min remaining:  5.4min
[Parallel(n_jobs=20)]: Done  16 out of  20 | elapsed:  4.4min remaining:  1.1min
[Parallel(n_jobs=20)]: Done  20 out of  20 | elapsed:  4.5min finished
[Parallel(n_jobs=20)]: Using backend LokyBackend with 20 concurrent workers.
[Parallel(n_jobs=20)]: Done   2 out of  20 | elapsed:    5.5s remaining:   50.5s
[Parallel(n_jobs=20)]: Done   9 out of  20 | elapsed:    5.9s remaining:    7.2s
[Parallel(n_jobs=20)]: Done  16 out of  20 | elapsed:    6.4s remaining:    1.5s
[Parallel(n_jobs=20)]: Done  20 out of  20 | elapsed:    6.7s finished


Mean Absolute Error (MAE): 2.9137636972220546


## Bagging 7

Use all columns except those dropped (MMS, r, Ernedc (g/km), De,Vf,Status,Enedc (g/km),Date of registration). 

ordinal encoding. Strandard fill NaN. Random forest. No feature Engineering. **n_estimator = 100, oob_score=True**

Computation time:  27min (Home)

In [12]:
bag=BaggingRegressor(estimator=DecisionTreeRegressor(),n_jobs=-1,bootstrap=True,oob_score=True, random_state=42,verbose=100,n_estimators= 100)

bag.fit(X_train, y_train)

y_pred = bag.predict(X_test)

mae = mean_absolute_error(y_test, y_pred)
print("Mean Absolute Error (MAE):", mae)

[Parallel(n_jobs=24)]: Using backend LokyBackend with 24 concurrent workers.
[Parallel(n_jobs=24)]: Done   1 tasks      | elapsed: 22.7min
[Parallel(n_jobs=24)]: Done   2 out of  24 | elapsed: 22.7min remaining: 249.9min
[Parallel(n_jobs=24)]: Done   3 out of  24 | elapsed: 22.8min remaining: 159.4min
[Parallel(n_jobs=24)]: Done   4 out of  24 | elapsed: 22.8min remaining: 114.0min
[Parallel(n_jobs=24)]: Done   5 out of  24 | elapsed: 22.8min remaining: 86.8min
[Parallel(n_jobs=24)]: Done   6 out of  24 | elapsed: 22.9min remaining: 68.7min
[Parallel(n_jobs=24)]: Done   7 out of  24 | elapsed: 23.0min remaining: 55.9min
[Parallel(n_jobs=24)]: Done   8 out of  24 | elapsed: 23.1min remaining: 46.3min
[Parallel(n_jobs=24)]: Done   9 out of  24 | elapsed: 23.2min remaining: 38.6min
[Parallel(n_jobs=24)]: Done  10 out of  24 | elapsed: 23.4min remaining: 32.8min
[Parallel(n_jobs=24)]: Done  11 out of  24 | elapsed: 23.5min remaining: 27.7min
[Parallel(n_jobs=24)]: Done  12 out of  24 | ela

In [13]:
data_test["Ewltp (g/km)"] = bag.predict(data_test.drop(columns='ID'))
data_test[["ID","Ewltp (g/km)"]].to_csv("results/new_100_estim_bagging.csv", index=False)

[Parallel(n_jobs=24)]: Using backend LokyBackend with 24 concurrent workers.
[Parallel(n_jobs=24)]: Done   1 tasks      | elapsed:    6.5s
[Parallel(n_jobs=24)]: Done   2 out of  24 | elapsed:    7.2s remaining:  1.3min
[Parallel(n_jobs=24)]: Done   3 out of  24 | elapsed:    8.6s remaining:  1.0min
[Parallel(n_jobs=24)]: Done   4 out of  24 | elapsed:    9.4s remaining:   47.2s
[Parallel(n_jobs=24)]: Done   5 out of  24 | elapsed:    9.8s remaining:   37.5s
[Parallel(n_jobs=24)]: Done   6 out of  24 | elapsed:   10.6s remaining:   32.1s
[Parallel(n_jobs=24)]: Done   7 out of  24 | elapsed:   11.5s remaining:   28.1s
[Parallel(n_jobs=24)]: Done   8 out of  24 | elapsed:   12.3s remaining:   24.8s
[Parallel(n_jobs=24)]: Done   9 out of  24 | elapsed:   13.3s remaining:   22.2s
[Parallel(n_jobs=24)]: Done  10 out of  24 | elapsed:   13.7s remaining:   19.2s
[Parallel(n_jobs=24)]: Done  11 out of  24 | elapsed:   14.7s remaining:   17.4s
[Parallel(n_jobs=24)]: Done  12 out of  24 | elapse

## Bagging 8
Tests
- 'max_samples': 0.8 meilleurs résultats que default (1)

- 'max_features': 0.7 (0.8 et 0.5 baisse)
17 min(home)

In [14]:
bag=BaggingRegressor(estimator=DecisionTreeRegressor(),n_jobs=-1,bootstrap=True,oob_score=True,
                      random_state=42,verbose=100,n_estimators= 100,
                      max_features=0.7, max_samples=0.8)

bag.fit(X_train, y_train)

y_pred = bag.predict(X_test)

mae = mean_absolute_error(y_test, y_pred)
print("Mean Absolute Error (MAE):", mae)

[Parallel(n_jobs=24)]: Using backend LokyBackend with 24 concurrent workers.
[Parallel(n_jobs=24)]: Done   1 tasks      | elapsed: 12.6min
[Parallel(n_jobs=24)]: Done   2 out of  24 | elapsed: 12.6min remaining: 138.6min
[Parallel(n_jobs=24)]: Done   3 out of  24 | elapsed: 13.2min remaining: 92.1min
[Parallel(n_jobs=24)]: Done   4 out of  24 | elapsed: 13.4min remaining: 66.8min
[Parallel(n_jobs=24)]: Done   5 out of  24 | elapsed: 13.4min remaining: 50.8min
[Parallel(n_jobs=24)]: Done   6 out of  24 | elapsed: 13.5min remaining: 40.5min
[Parallel(n_jobs=24)]: Done   7 out of  24 | elapsed: 13.6min remaining: 33.0min
[Parallel(n_jobs=24)]: Done   8 out of  24 | elapsed: 13.6min remaining: 27.2min
[Parallel(n_jobs=24)]: Done   9 out of  24 | elapsed: 13.8min remaining: 23.1min
[Parallel(n_jobs=24)]: Done  10 out of  24 | elapsed: 13.9min remaining: 19.5min
[Parallel(n_jobs=24)]: Done  11 out of  24 | elapsed: 13.9min remaining: 16.5min
[Parallel(n_jobs=24)]: Done  12 out of  24 | elaps

In [15]:
data_test["Ewltp (g/km)"] = bag.predict(data_test.drop(columns='ID'))
data_test[["ID","Ewltp (g/km)"]].to_csv("results/new_100_07_08_bagging.csv", index=False)

[Parallel(n_jobs=24)]: Using backend LokyBackend with 24 concurrent workers.
[Parallel(n_jobs=24)]: Done   1 tasks      | elapsed:    5.6s
[Parallel(n_jobs=24)]: Done   2 out of  24 | elapsed:    5.6s remaining:  1.1min
[Parallel(n_jobs=24)]: Done   3 out of  24 | elapsed:    6.4s remaining:   45.7s
[Parallel(n_jobs=24)]: Done   4 out of  24 | elapsed:    6.9s remaining:   34.8s
[Parallel(n_jobs=24)]: Done   5 out of  24 | elapsed:    7.5s remaining:   28.9s
[Parallel(n_jobs=24)]: Done   6 out of  24 | elapsed:    7.7s remaining:   23.2s
[Parallel(n_jobs=24)]: Done   7 out of  24 | elapsed:    8.0s remaining:   19.5s
[Parallel(n_jobs=24)]: Done   8 out of  24 | elapsed:    8.4s remaining:   16.9s
[Parallel(n_jobs=24)]: Done   9 out of  24 | elapsed:    8.5s remaining:   14.2s
[Parallel(n_jobs=24)]: Done  10 out of  24 | elapsed:    8.7s remaining:   12.3s
[Parallel(n_jobs=24)]: Done  11 out of  24 | elapsed:    8.8s remaining:   10.4s
[Parallel(n_jobs=24)]: Done  12 out of  24 | elapse

## Bagging 9
**n_estimators = 50**
- 'max_samples': 0.8 meilleurs résultats que default (1)

- 'max_features': 0.7 (0.8 et 0.5 baisse)

In [16]:
bag=BaggingRegressor(estimator=DecisionTreeRegressor(),n_jobs=-1,bootstrap=True,oob_score=True,
                      random_state=42,verbose=100,n_estimators= 50,
                      max_features=0.7, max_samples=0.8)

bag.fit(X_train, y_train)

y_pred = bag.predict(X_test)

mae = mean_absolute_error(y_test, y_pred)
print("Mean Absolute Error (MAE):", mae)

[Parallel(n_jobs=24)]: Using backend LokyBackend with 24 concurrent workers.
[Parallel(n_jobs=24)]: Done   1 tasks      | elapsed:  6.5min
[Parallel(n_jobs=24)]: Done   2 out of  24 | elapsed:  6.5min remaining: 71.6min
[Parallel(n_jobs=24)]: Done   3 out of  24 | elapsed:  6.5min remaining: 45.8min
[Parallel(n_jobs=24)]: Done   4 out of  24 | elapsed:  6.6min remaining: 32.8min
[Parallel(n_jobs=24)]: Done   5 out of  24 | elapsed:  6.6min remaining: 25.1min
[Parallel(n_jobs=24)]: Done   6 out of  24 | elapsed:  6.6min remaining: 19.8min
[Parallel(n_jobs=24)]: Done   7 out of  24 | elapsed:  6.6min remaining: 16.1min
[Parallel(n_jobs=24)]: Done   8 out of  24 | elapsed:  6.7min remaining: 13.3min
[Parallel(n_jobs=24)]: Done   9 out of  24 | elapsed:  6.7min remaining: 11.1min
[Parallel(n_jobs=24)]: Done  10 out of  24 | elapsed:  6.7min remaining:  9.4min
[Parallel(n_jobs=24)]: Done  11 out of  24 | elapsed:  6.7min remaining:  8.0min
[Parallel(n_jobs=24)]: Done  12 out of  24 | elapse

## Bagging 10

- n_estimators = 100
- 'max_samples': 0.7 meilleurs résultats que default (1)

- 'max_features': 0.8 (0.8 et 0.5 baisse)

computing time : 18min

In [12]:
bag=BaggingRegressor(estimator=DecisionTreeRegressor(),n_jobs=20,bootstrap=True,oob_score=True,
                      random_state=42,verbose=100, max_features=0.8, max_samples=0.7,n_estimators=100)

bag.fit(X_train, y_train)

y_pred = bag.predict(X_test)

mae = mean_absolute_error(y_test, y_pred)
print("Mean Absolute Error (MAE):", mae)

[Parallel(n_jobs=20)]: Using backend LokyBackend with 20 concurrent workers.
[Parallel(n_jobs=20)]: Done   1 tasks      | elapsed: 14.5min
[Parallel(n_jobs=20)]: Done   2 out of  20 | elapsed: 14.8min remaining: 133.4min
[Parallel(n_jobs=20)]: Done   3 out of  20 | elapsed: 14.9min remaining: 84.2min
[Parallel(n_jobs=20)]: Done   4 out of  20 | elapsed: 14.9min remaining: 59.6min
[Parallel(n_jobs=20)]: Done   5 out of  20 | elapsed: 14.9min remaining: 44.8min
[Parallel(n_jobs=20)]: Done   6 out of  20 | elapsed: 15.0min remaining: 34.9min
[Parallel(n_jobs=20)]: Done   7 out of  20 | elapsed: 15.0min remaining: 27.9min
[Parallel(n_jobs=20)]: Done   8 out of  20 | elapsed: 15.0min remaining: 22.6min
[Parallel(n_jobs=20)]: Done   9 out of  20 | elapsed: 15.1min remaining: 18.4min
[Parallel(n_jobs=20)]: Done  10 out of  20 | elapsed: 15.1min remaining: 15.1min
[Parallel(n_jobs=20)]: Done  11 out of  20 | elapsed: 15.1min remaining: 12.4min
[Parallel(n_jobs=20)]: Done  12 out of  20 | elaps

In [13]:
data_test["Ewltp (g/km)"] = bag.predict(data_test.drop(columns='ID'))
data_test[["ID","Ewltp (g/km)"]].to_csv("results/new_100_08_07_bagging.csv", index=False)

[Parallel(n_jobs=20)]: Using backend LokyBackend with 20 concurrent workers.
[Parallel(n_jobs=20)]: Done   1 tasks      | elapsed:    7.2s
[Parallel(n_jobs=20)]: Done   2 out of  20 | elapsed:    8.2s remaining:  1.2min
[Parallel(n_jobs=20)]: Done   3 out of  20 | elapsed:    8.4s remaining:   47.9s
[Parallel(n_jobs=20)]: Done   4 out of  20 | elapsed:    9.3s remaining:   37.6s
[Parallel(n_jobs=20)]: Done   5 out of  20 | elapsed:   10.1s remaining:   30.5s
[Parallel(n_jobs=20)]: Done   6 out of  20 | elapsed:   10.1s remaining:   23.8s
[Parallel(n_jobs=20)]: Done   7 out of  20 | elapsed:   10.5s remaining:   19.6s
[Parallel(n_jobs=20)]: Done   8 out of  20 | elapsed:   10.8s remaining:   16.3s
[Parallel(n_jobs=20)]: Done   9 out of  20 | elapsed:   11.5s remaining:   14.0s
[Parallel(n_jobs=20)]: Done  10 out of  20 | elapsed:   11.7s remaining:   11.7s
[Parallel(n_jobs=20)]: Done  11 out of  20 | elapsed:   12.3s remaining:   10.1s
[Parallel(n_jobs=20)]: Done  12 out of  20 | elapse

## Bagging 11

- n_estimators = 100
- max_features=**0.9**
- max_samples=**0.9**

computing time: 

In [12]:
bag=BaggingRegressor(estimator=DecisionTreeRegressor(),n_jobs=15,bootstrap=True,oob_score=True,
                      random_state=42,verbose=100, max_features=0.9, max_samples=0.9, n_estimators=100)

bag.fit(X_train, y_train)

y_pred = bag.predict(X_test)

mae = mean_absolute_error(y_test, y_pred)
print("Mean Absolute Error (MAE):", mae)


[Parallel(n_jobs=15)]: Using backend LokyBackend with 15 concurrent workers.
[Parallel(n_jobs=15)]: Done   1 tasks      | elapsed: 16.5min
[Parallel(n_jobs=15)]: Done   2 out of  15 | elapsed: 16.6min remaining: 107.8min
[Parallel(n_jobs=15)]: Done   3 out of  15 | elapsed: 16.6min remaining: 66.5min
[Parallel(n_jobs=15)]: Done   4 out of  15 | elapsed: 16.8min remaining: 46.1min
[Parallel(n_jobs=15)]: Done   5 out of  15 | elapsed: 16.9min remaining: 33.7min
[Parallel(n_jobs=15)]: Done   6 out of  15 | elapsed: 18.2min remaining: 27.3min
[Parallel(n_jobs=15)]: Done   7 out of  15 | elapsed: 18.2min remaining: 20.8min
[Parallel(n_jobs=15)]: Done   8 out of  15 | elapsed: 18.2min remaining: 16.0min
[Parallel(n_jobs=15)]: Done   9 out of  15 | elapsed: 18.3min remaining: 12.2min
[Parallel(n_jobs=15)]: Done  10 out of  15 | elapsed: 18.3min remaining:  9.1min
[Parallel(n_jobs=15)]: Done  11 out of  15 | elapsed: 18.3min remaining:  6.7min
[Parallel(n_jobs=15)]: Done  12 out of  15 | elaps

In [13]:
data_test["Ewltp (g/km)"] = bag.predict(data_test.drop(columns='ID'))
data_test[["ID","Ewltp (g/km)"]].to_csv("results/new_100_09_09_bagging.csv", index=False)

[Parallel(n_jobs=15)]: Using backend LokyBackend with 15 concurrent workers.
[Parallel(n_jobs=15)]: Done   1 tasks      | elapsed:    9.7s
[Parallel(n_jobs=15)]: Done   2 out of  15 | elapsed:   11.5s remaining:  1.3min
[Parallel(n_jobs=15)]: Done   3 out of  15 | elapsed:   11.9s remaining:   48.0s
[Parallel(n_jobs=15)]: Done   4 out of  15 | elapsed:   13.1s remaining:   36.3s
[Parallel(n_jobs=15)]: Done   5 out of  15 | elapsed:   13.3s remaining:   26.8s
[Parallel(n_jobs=15)]: Done   6 out of  15 | elapsed:   13.9s remaining:   20.9s
[Parallel(n_jobs=15)]: Done   7 out of  15 | elapsed:   14.0s remaining:   16.0s
[Parallel(n_jobs=15)]: Done   8 out of  15 | elapsed:   15.2s remaining:   13.3s
[Parallel(n_jobs=15)]: Done   9 out of  15 | elapsed:   15.6s remaining:   10.4s
[Parallel(n_jobs=15)]: Done  10 out of  15 | elapsed:   15.8s remaining:    7.8s
[Parallel(n_jobs=15)]: Done  11 out of  15 | elapsed:   15.9s remaining:    5.7s
[Parallel(n_jobs=15)]: Done  12 out of  15 | elapse

## Bagging Blind

9min (pc) 

MAE Kaggle: 2.85

In [11]:
bag=BaggingRegressor(estimator=DecisionTreeRegressor(),n_jobs=4,bootstrap=True,oob_score=True,
                      random_state=42,verbose=100, max_features=0.9, max_samples=0.9)

bag.fit(data_train.drop(columns='Ewltp (g/km)'), data_train['Ewltp (g/km)'])

[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done   1 tasks      | elapsed:  7.1min
[Parallel(n_jobs=4)]: Done   2 out of   4 | elapsed:  7.2min remaining:  7.2min
[Parallel(n_jobs=4)]: Done   4 out of   4 | elapsed:  8.7min finished


  warn(


In [12]:
data_test["Ewltp (g/km)"] = bag.predict(data_test.drop(columns='ID'))
data_test[["ID","Ewltp (g/km)"]].to_csv("data/new_100_09_09_blind_bagging.csv", index=False)

[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done   1 tasks      | elapsed:   10.0s
[Parallel(n_jobs=4)]: Done   2 out of   4 | elapsed:   11.0s remaining:   11.0s
[Parallel(n_jobs=4)]: Done   4 out of   4 | elapsed:   12.8s finished


## Bagging Blind full

With 100 estimators. **MAE Kaggle 2.8320**

26min(home)

In [12]:
bag=BaggingRegressor(estimator=DecisionTreeRegressor(),n_jobs=8,bootstrap=True,oob_score=True,
                      random_state=42,verbose=100, max_features=0.9, max_samples=0.9, n_estimators=100)

bag.fit(data_train.drop(columns='Ewltp (g/km)'), data_train['Ewltp (g/km)'])

[Parallel(n_jobs=8)]: Using backend LokyBackend with 8 concurrent workers.


In [None]:
data_test["Ewltp (g/km)"] = bag.predict(data_test.drop(columns='ID'))
data_test[["ID","Ewltp (g/km)"]].to_csv("results/new_100_09_09_blind_cstm_encoder_bagging_full.csv", index=False)

[Parallel(n_jobs=12)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=12)]: Done   1 tasks      | elapsed:   11.3s
[Parallel(n_jobs=12)]: Done   2 out of  12 | elapsed:   15.8s remaining:  1.3min
[Parallel(n_jobs=12)]: Done   3 out of  12 | elapsed:   16.9s remaining:   50.8s
[Parallel(n_jobs=12)]: Done   4 out of  12 | elapsed:   17.2s remaining:   34.4s
[Parallel(n_jobs=12)]: Done   5 out of  12 | elapsed:   17.3s remaining:   24.2s
[Parallel(n_jobs=12)]: Done   6 out of  12 | elapsed:   17.6s remaining:   17.6s
[Parallel(n_jobs=12)]: Done   7 out of  12 | elapsed:   19.0s remaining:   13.5s
[Parallel(n_jobs=12)]: Done   8 out of  12 | elapsed:   19.1s remaining:    9.5s
[Parallel(n_jobs=12)]: Done   9 out of  12 | elapsed:   20.0s remaining:    6.6s
[Parallel(n_jobs=12)]: Done  10 out of  12 | elapsed:   21.0s remaining:    4.1s
[Parallel(n_jobs=12)]: Done  12 out of  12 | elapsed:   22.5s finished


## Bagging Test

normal preprocessing VS Mean_group_imputer preprocessing. 

MAE 2.88 vs 38.52 

that's shit


In [12]:
bag=BaggingRegressor(estimator=DecisionTreeRegressor(),n_jobs=15,bootstrap=True,oob_score=True,
                      random_state=42,verbose=100, max_features=0.9, max_samples=0.9)

bag.fit(X_train, y_train)

y_pred = bag.predict(X_test)

mae = mean_absolute_error(y_test, y_pred)
print("Mean Absolute Error (MAE):", mae)


[Parallel(n_jobs=10)]: Using backend LokyBackend with 10 concurrent workers.
[Parallel(n_jobs=10)]: Done   1 tasks      | elapsed:  1.6min
[Parallel(n_jobs=10)]: Done   2 out of  10 | elapsed:  1.7min remaining:  6.7min
[Parallel(n_jobs=10)]: Done   3 out of  10 | elapsed:  1.7min remaining:  3.9min
[Parallel(n_jobs=10)]: Done   4 out of  10 | elapsed:  1.7min remaining:  2.5min
[Parallel(n_jobs=10)]: Done   5 out of  10 | elapsed:  1.7min remaining:  1.7min
[Parallel(n_jobs=10)]: Done   6 out of  10 | elapsed:  1.7min remaining:  1.1min
[Parallel(n_jobs=10)]: Done   7 out of  10 | elapsed:  1.7min remaining:   43.8s
[Parallel(n_jobs=10)]: Done   8 out of  10 | elapsed:  1.7min remaining:   25.6s
[Parallel(n_jobs=10)]: Done  10 out of  10 | elapsed:  1.8min finished


  warn(


[Parallel(n_jobs=10)]: Using backend LokyBackend with 10 concurrent workers.
[Parallel(n_jobs=10)]: Done   1 tasks      | elapsed:    2.5s
[Parallel(n_jobs=10)]: Done   2 out of  10 | elapsed:    2.6s remaining:   10.6s
[Parallel(n_jobs=10)]: Done   3 out of  10 | elapsed:    2.9s remaining:    6.9s
[Parallel(n_jobs=10)]: Done   4 out of  10 | elapsed:    3.0s remaining:    4.6s
[Parallel(n_jobs=10)]: Done   5 out of  10 | elapsed:    3.2s remaining:    3.2s
[Parallel(n_jobs=10)]: Done   6 out of  10 | elapsed:    3.2s remaining:    2.1s
[Parallel(n_jobs=10)]: Done   7 out of  10 | elapsed:    3.3s remaining:    1.3s
[Parallel(n_jobs=10)]: Done   8 out of  10 | elapsed:    3.3s remaining:    0.8s
[Parallel(n_jobs=10)]: Done  10 out of  10 | elapsed:    3.5s finished
Mean Absolute Error (MAE): 2.8898037952154683


In [37]:
bag=BaggingRegressor(estimator=DecisionTreeRegressor(),n_jobs=15,bootstrap=True,oob_score=True,
                      random_state=42,verbose=100, max_features=0.9, max_samples=0.9)

bag.fit(X_train, y_train)

y_pred = bag.predict(X_test)

mae = mean_absolute_error(y_test, y_pred)
print("Mean Absolute Error (MAE):", mae)


[Parallel(n_jobs=10)]: Using backend LokyBackend with 10 concurrent workers.
[Parallel(n_jobs=10)]: Done   1 tasks      | elapsed:   19.1s
[Parallel(n_jobs=10)]: Done   2 out of  10 | elapsed:   27.9s remaining:  1.9min
[Parallel(n_jobs=10)]: Done   3 out of  10 | elapsed:   29.0s remaining:  1.1min
[Parallel(n_jobs=10)]: Done   4 out of  10 | elapsed:   29.2s remaining:   43.9s
[Parallel(n_jobs=10)]: Done   5 out of  10 | elapsed:   33.1s remaining:   33.1s
[Parallel(n_jobs=10)]: Done   6 out of  10 | elapsed:   37.7s remaining:   25.1s
[Parallel(n_jobs=10)]: Done   7 out of  10 | elapsed:   39.7s remaining:   16.9s
[Parallel(n_jobs=10)]: Done   8 out of  10 | elapsed:   53.0s remaining:   13.2s
[Parallel(n_jobs=10)]: Done  10 out of  10 | elapsed:  1.0min finished


  warn(


[Parallel(n_jobs=10)]: Using backend LokyBackend with 10 concurrent workers.
[Parallel(n_jobs=10)]: Done   1 tasks      | elapsed:    1.0s
[Parallel(n_jobs=10)]: Done   2 out of  10 | elapsed:    1.1s remaining:    4.5s
[Parallel(n_jobs=10)]: Done   3 out of  10 | elapsed:    1.1s remaining:    2.7s
[Parallel(n_jobs=10)]: Done   4 out of  10 | elapsed:    1.1s remaining:    1.7s
[Parallel(n_jobs=10)]: Done   5 out of  10 | elapsed:    1.2s remaining:    1.2s
[Parallel(n_jobs=10)]: Done   6 out of  10 | elapsed:    1.2s remaining:    0.8s
[Parallel(n_jobs=10)]: Done   7 out of  10 | elapsed:    1.2s remaining:    0.5s
[Parallel(n_jobs=10)]: Done   8 out of  10 | elapsed:    1.2s remaining:    0.2s
[Parallel(n_jobs=10)]: Done  10 out of  10 | elapsed:    1.2s finished
Mean Absolute Error (MAE): 38.52301057321326


## Bagging 12

Test with newly encoded columns: count encoder + OHE

- n_estimators = 100
- max_features=0.9
- max_samples=0.9
- no n_jobs specified

computing time: 69min(home)

In [12]:
bag=BaggingRegressor(estimator=DecisionTreeRegressor(),bootstrap=True,oob_score=True,
                      random_state=42,verbose=100, max_features=0.9, max_samples=0.9, n_estimators=100)

bag.fit(X_train, y_train)

y_pred = bag.predict(X_test)

mae = mean_absolute_error(y_test, y_pred)
print("Mean Absolute Error (MAE):", mae)

Building estimator 1 of 100 for this parallel run (total 100)...
Building estimator 2 of 100 for this parallel run (total 100)...
Building estimator 3 of 100 for this parallel run (total 100)...
Building estimator 4 of 100 for this parallel run (total 100)...
Building estimator 5 of 100 for this parallel run (total 100)...
Building estimator 6 of 100 for this parallel run (total 100)...
Building estimator 7 of 100 for this parallel run (total 100)...
Building estimator 8 of 100 for this parallel run (total 100)...
Building estimator 9 of 100 for this parallel run (total 100)...
Building estimator 10 of 100 for this parallel run (total 100)...
Building estimator 11 of 100 for this parallel run (total 100)...
Building estimator 12 of 100 for this parallel run (total 100)...
Building estimator 13 of 100 for this parallel run (total 100)...
Building estimator 14 of 100 for this parallel run (total 100)...
Building estimator 15 of 100 for this parallel run (total 100)...
Building estimator 

In [13]:
data_test["Ewltp (g/km)"] = bag.predict(data_test.drop(columns='ID'))
data_test[["ID","Ewltp (g/km)"]].to_csv("results/new_100_09_09_cstm_encoding_bagging.csv", index=False)

[Parallel(n_jobs=1)]: Done   1 tasks      | elapsed:  1.0min
[Parallel(n_jobs=1)]: Done   1 tasks      | elapsed:  1.0min


## Bagging 13

**Full count encoder**, same param as bagging 11

n_jobs= 10

computing time: 19min(home)

In [12]:
bag=BaggingRegressor(estimator=DecisionTreeRegressor(),n_jobs=10,bootstrap=True,oob_score=True,
                      random_state=42,verbose=100, max_features=0.9, max_samples=0.9, n_estimators=100)

bag.fit(X_train, y_train)

y_pred = bag.predict(X_test)

mae = mean_absolute_error(y_test, y_pred)
print("Mean Absolute Error (MAE):", mae)


[Parallel(n_jobs=10)]: Using backend LokyBackend with 10 concurrent workers.
[Parallel(n_jobs=10)]: Done   1 tasks      | elapsed: 15.7min
[Parallel(n_jobs=10)]: Done   2 out of  10 | elapsed: 15.7min remaining: 62.9min
[Parallel(n_jobs=10)]: Done   3 out of  10 | elapsed: 15.7min remaining: 36.7min
[Parallel(n_jobs=10)]: Done   4 out of  10 | elapsed: 15.8min remaining: 23.7min
[Parallel(n_jobs=10)]: Done   5 out of  10 | elapsed: 15.9min remaining: 15.9min
[Parallel(n_jobs=10)]: Done   6 out of  10 | elapsed: 15.9min remaining: 10.6min
[Parallel(n_jobs=10)]: Done   7 out of  10 | elapsed: 16.0min remaining:  6.9min
[Parallel(n_jobs=10)]: Done   8 out of  10 | elapsed: 16.0min remaining:  4.0min
[Parallel(n_jobs=10)]: Done  10 out of  10 | elapsed: 16.1min finished
[Parallel(n_jobs=10)]: Using backend LokyBackend with 10 concurrent workers.
[Parallel(n_jobs=10)]: Done   1 tasks      | elapsed:   28.2s
[Parallel(n_jobs=10)]: Done   2 out of  10 | elapsed:   30.4s remaining:  2.0min
[Pa

In [13]:
data_test["Ewltp (g/km)"] = bag.predict(data_test.drop(columns='ID'))
data_test[["ID","Ewltp (g/km)"]].to_csv("results/new_100_09_09_count_encod_bagging.csv", index=False)

[Parallel(n_jobs=10)]: Using backend LokyBackend with 10 concurrent workers.
[Parallel(n_jobs=10)]: Done   1 tasks      | elapsed:   14.1s
[Parallel(n_jobs=10)]: Done   2 out of  10 | elapsed:   15.3s remaining:  1.0min
[Parallel(n_jobs=10)]: Done   3 out of  10 | elapsed:   15.7s remaining:   36.9s
[Parallel(n_jobs=10)]: Done   4 out of  10 | elapsed:   16.0s remaining:   24.1s
[Parallel(n_jobs=10)]: Done   5 out of  10 | elapsed:   16.5s remaining:   16.5s
[Parallel(n_jobs=10)]: Done   6 out of  10 | elapsed:   17.6s remaining:   11.7s
[Parallel(n_jobs=10)]: Done   7 out of  10 | elapsed:   17.9s remaining:    7.6s
[Parallel(n_jobs=10)]: Done   8 out of  10 | elapsed:   18.5s remaining:    4.6s
[Parallel(n_jobs=10)]: Done  10 out of  10 | elapsed:   19.5s finished


## Bagging 14

**count + ordinal encoder**, same param as bagging 11

n_jobs= 10

computing time: 19min(home)

In [12]:
bag=BaggingRegressor(estimator=DecisionTreeRegressor(),n_jobs=10,bootstrap=True,oob_score=True,
                      random_state=42,verbose=100, max_features=0.9, max_samples=0.9, n_estimators=100)

bag.fit(X_train, y_train)

y_pred = bag.predict(X_test)

mae = mean_absolute_error(y_test, y_pred)
print("Mean Absolute Error (MAE):", mae)

[Parallel(n_jobs=10)]: Using backend LokyBackend with 10 concurrent workers.
[Parallel(n_jobs=10)]: Done   1 tasks      | elapsed: 15.4min
[Parallel(n_jobs=10)]: Done   2 out of  10 | elapsed: 15.5min remaining: 61.9min
[Parallel(n_jobs=10)]: Done   3 out of  10 | elapsed: 15.5min remaining: 36.3min
[Parallel(n_jobs=10)]: Done   4 out of  10 | elapsed: 15.7min remaining: 23.5min
[Parallel(n_jobs=10)]: Done   5 out of  10 | elapsed: 15.7min remaining: 15.7min
[Parallel(n_jobs=10)]: Done   6 out of  10 | elapsed: 15.8min remaining: 10.5min
[Parallel(n_jobs=10)]: Done   7 out of  10 | elapsed: 15.8min remaining:  6.8min
[Parallel(n_jobs=10)]: Done   8 out of  10 | elapsed: 15.8min remaining:  4.0min
[Parallel(n_jobs=10)]: Done  10 out of  10 | elapsed: 15.8min finished
[Parallel(n_jobs=10)]: Using backend LokyBackend with 10 concurrent workers.
[Parallel(n_jobs=10)]: Done   1 tasks      | elapsed:   28.4s
[Parallel(n_jobs=10)]: Done   2 out of  10 | elapsed:   30.4s remaining:  2.0min
[Pa

In [13]:
data_test["Ewltp (g/km)"] = bag.predict(data_test.drop(columns='ID'))
data_test[["ID","Ewltp (g/km)"]].to_csv("results/new_100_09_09_count_ordinal_encod_bagging.csv", index=False)

[Parallel(n_jobs=10)]: Using backend LokyBackend with 10 concurrent workers.
[Parallel(n_jobs=10)]: Done   1 tasks      | elapsed:   14.0s
[Parallel(n_jobs=10)]: Done   2 out of  10 | elapsed:   15.1s remaining:  1.0min
[Parallel(n_jobs=10)]: Done   3 out of  10 | elapsed:   15.7s remaining:   36.7s
[Parallel(n_jobs=10)]: Done   4 out of  10 | elapsed:   15.9s remaining:   23.9s
[Parallel(n_jobs=10)]: Done   5 out of  10 | elapsed:   16.8s remaining:   16.8s
[Parallel(n_jobs=10)]: Done   6 out of  10 | elapsed:   17.7s remaining:   11.8s
[Parallel(n_jobs=10)]: Done   7 out of  10 | elapsed:   17.7s remaining:    7.5s
[Parallel(n_jobs=10)]: Done   8 out of  10 | elapsed:   18.3s remaining:    4.5s
[Parallel(n_jobs=10)]: Done  10 out of  10 | elapsed:   19.4s finished


## Adaboost

In [21]:
boosting = AdaBoostRegressor(random_state=42,loss='exponential')

boosting.fit(X_train, y_train)

y_pred = boosting.predict(X_test)

mae = mean_absolute_error(y_test, y_pred)
print("Mean Absolute Error (MAE):", mae)

Mean Absolute Error (MAE): 56.538170887529546


In [None]:
data_test["Ewltp (g/km)"] = bag.predict(data_test.drop(columns='ID'))
data_test[["ID","Ewltp (g/km)"]].to_csv("results/new_simple_boosting.csv", index=False)

## Decision Tree Grid Search

In [12]:
decision_tree_model = DecisionTreeRegressor()

param_grid = {
    'criterion': ['absolute_error', 'poisson', 'squared_error'],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]}

grid_search = GridSearchCV(decision_tree_model, param_grid, cv=5, scoring='neg_mean_absolute_error',verbose=3)

grid_search.fit(X_train, y_train)
print("Meilleurs hyperparamètres:", grid_search.best_params_)

y_pred = grid_search.predict(X_test)

mae = mean_absolute_error(y_test, y_pred)
print("Mean Absolute Error sur les données de test:", mae)


Fitting 5 folds for each of 108 candidates, totalling 540 fits


## Baggin Grid Search

In [None]:
base_model_params = {
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': ['auto', 'sqrt', 'log2'],
    
}

# Définir les hyperparamètres spécifiques au Bagging
bagging_params = {
    'n_estimators': [20,50,100],
    'max_samples': [1.0, 0.8, 0.9],  # La fraction d'échantillons à utiliser pour chaque sac
    'bootstrap': [True, False]
}

# Créer un modèle DecisionTreeRegressor pour être utilisé comme modèle de base
base_model = DecisionTreeRegressor(random_state=42)

# Créer un modèle BaggingRegressor
bagging_model = BaggingRegressor(base_model, random_state=42)

# Combiner les deux ensembles de paramètres
param_grid = {**base_model_params, **bagging_params}

# Utiliser GridSearchCV
grid_search = GridSearchCV(
    estimator=bagging_model,
    param_grid=param_grid,
    scoring='neg_mean_absolute_error',  # Utilisez 'neg_mean_absolute_error' pour la régression
    cv=3,
    verbose=3)

grid_search.fit(X_train, y_train)
print("Meilleurs paramètres : ", grid_search.best_params_)

best_model = grid_search.best_estimator_

y_pred = best_model.predict(X_test)

mae = mean_absolute_error(y_test, y_pred)
print("Mean Absolute Error (MAE) sur les données de test : ", mae)

## Xgboost 1

- depth:30
- 'gamma':10,
- 'learning_rate': 0.05,
- 'eval_metric': 'rmse',
- 'early_stopping_rounds':50,
- 'n_jobs':-1

5min

In [16]:
dtrain = xgb.DMatrix(X_train, label=y_train)
dtest = xgb.DMatrix(X_test, label=y_test)

params = {
    'objective': 'reg:squarederror', 
    'max_depth': 30,
    'gamma':10,
    'learning_rate': 0.05,
    'eval_metric': 'rmse',
    'n_jobs':-1
}

num_rounds = 4000
model = xgb.train(params, dtrain, num_rounds)

y_pred = model.predict(dtest)

mae = mean_absolute_error(y_test, y_pred)
print("Mean Absolute Error (MAE):", mae)


Mean Absolute Error (MAE): 2.882604650941162


In [14]:
testing= xgb.DMatrix(data_test.drop(columns='ID'))
data_test["Ewltp (g/km)"] = model.predict(testing)
data_test[["ID","Ewltp (g/km)"]].to_csv("results/new_xgb.csv", index=False)

## Xgboost 2

- 'n_estimators':4000,
- 'max_depth': 35,
- 'learning_rate': 0.005,
- 'colsample_bytree':0.80,
- 'gamma':10,
- 'reg_alpha':0.8,
- 'reg_lambda':0.1,
- 'objective': 'reg:squarederror',
- 'tree_method':'hist',
- 'n_jobs':-1

computing time: 25min (home)

In [23]:
params = {'n_estimators':4000,
          'max_depth': 35,
          'learning_rate': 0.005,
          'colsample_bytree':0.80,
          'gamma':10,
          'reg_alpha':0.8,
          'reg_lambda':0.1,
          'objective': 'reg:squarederror',
          'tree_method': 'hist',
          'n_jobs':-1
          
}

model = xgb.XGBRegressor(**params)

model.fit(X_train, y_train,verbose=True)

y_pred = model.predict(X_test)

mae = mean_absolute_error(y_test, y_pred)
print("Mean Absolute Error (MAE):", mae)


Mean Absolute Error (MAE): 2.8647867328354377


In [17]:
data_test["Ewltp (g/km)"] = model.predict(data_test.drop(columns="ID"))
data_test[["ID","Ewltp (g/km)"]].to_csv("results/new_yao_xgb.csv", index=False)

## Xgboost blind full

- 'n_estimators':4000,
- 'max_depth': 35,
- 'learning_rate': 0.005,
- 'colsample_bytree':0.80,
- 'gamma':10,
- 'reg_alpha':0.8,
- 'reg_lambda':0.1,
- 'objective': 'reg:squarederror',
- 'tree_method':'hist',
- 'n_jobs':-1

with full dataset

computing time : 32min(home)

MAE Kaggle: 2.8459. Revoir le preprocessing

In [12]:
params = {'n_estimators':4000,
          'max_depth': 35,
          'learning_rate': 0.005,
          'colsample_bytree':0.80,
          'gamma':10,
          'reg_alpha':0.8,
          'reg_lambda':0.1,
          'objective': 'reg:squarederror',
          'tree_method': 'hist',
          'n_jobs':-1
}

model = xgb.XGBRegressor(**params)

model.fit(data_train.drop(columns='Ewltp (g/km)'), data_train['Ewltp (g/km)'])


In [14]:
data_test["Ewltp (g/km)"] = model.predict(data_test.drop(columns='ID'))
data_test[["ID","Ewltp (g/km)"]].to_csv("results/new_blind_yao_xgb_full.csv", index=False)

## Xgboost 3
With new encoding
- 'n_estimators':4000,
- 'max_depth': 35,
- 'learning_rate': 0.005,
- 'colsample_bytree':0.80,
- 'gamma':10,
- 'reg_alpha':0.8,
- 'reg_lambda':0.1,
- 'objective': 'reg:squarederror',
- 'tree_method':'hist',
- 'n_jobs':-1

computing time: 31min (home)

In [12]:
params = {'n_estimators':4000,
          'max_depth': 35,
          'learning_rate': 0.005,
          'colsample_bytree':0.80,
          'gamma':10,
          'reg_alpha':0.8,
          'reg_lambda':0.1,
          'objective': 'reg:squarederror',
          'tree_method': 'hist',
          'n_jobs':-1
}

model = xgb.XGBRegressor(**params)

model.fit(X_train, y_train,verbose=True)

y_pred = model.predict(X_test)

mae = mean_absolute_error(y_test, y_pred)
print("Mean Absolute Error (MAE):", mae)

Mean Absolute Error (MAE): 2.8182787570889554


In [13]:
data_test["Ewltp (g/km)"] = model.predict(data_test.drop(columns="ID"))
data_test[["ID","Ewltp (g/km)"]].to_csv("results/new_yao_xgb_cstm_encoded.csv", index=False)

## Xgboost 4

With new encoding (encoding counter + OHE)
- 'n_estimators':4000,
- 'max_depth': 35,
- 'learning_rate': 0.005,
- 'colsample_bytree':1,
- 'subsample': 1,
- 'gamma':30,
- 'reg_alpha':0.71111,
- 'reg_lambda':0.1,
- 'objective': 'reg:squarederror',
- 'tree_method': 'hist',
- 'n_jobs':-1

Computing time:  24min(home)

In [12]:
params = {'n_estimators':4000,
          'max_depth': 35,
          'learning_rate': 0.005,
          'colsample_bytree':1,
          'subsample': 1,
          'gamma':30,
          'reg_alpha':0.71111,
          'reg_lambda':0.1,
          'objective': 'reg:squarederror',
          'tree_method': 'hist',
          'n_jobs':-1
}

model = xgb.XGBRegressor(**params)

model.fit(X_train, y_train,verbose=True)

y_pred = model.predict(X_test)

mae = mean_absolute_error(y_test, y_pred)
print("Mean Absolute Error (MAE):", mae)

Mean Absolute Error (MAE): 2.846653662793189


In [None]:
data_test["Ewltp (g/km)"] = model.predict(data_test.drop(columns="ID"))
data_test[["ID","Ewltp (g/km)"]].to_csv("results/new_fine_tuned_xgb_cstm_encoded.csv", index=False)

## Xgboost 5 
With new encoding (encoding counter + OHE)
- 'n_estimators':4000,
- 'max_depth': 35,
- 'learning_rate':**1**,
- 'colsample_bytree':1,
- 'subsample': 1,
- 'gamma':30,
- 'reg_alpha':0.71111,
- 'reg_lambda':0.1,
- 'objective': 'reg:squarederror',
- 'tree_method': 'hist',
- 'n_jobs':-1

Computing time:  3min(home)

In [12]:
params = {'n_estimators':4000,
          'max_depth': 35,
          'learning_rate': 1,
          'colsample_bytree':1,
          'subsample': 1,
          'gamma':30,
          'reg_alpha':0.71111,
          'reg_lambda':0.1,
          'objective': 'reg:squarederror',
          'tree_method': 'hist',
          'n_jobs':-1
}

model = xgb.XGBRegressor(**params)

model.fit(X_train, y_train,verbose=True)

y_pred = model.predict(X_test)

mae = mean_absolute_error(y_test, y_pred)
print("Mean Absolute Error (MAE):", mae)

Mean Absolute Error (MAE): 2.9375179938868228


In [None]:
data_test["Ewltp (g/km)"] = model.predict(data_test.drop(columns="ID"))
data_test[["ID","Ewltp (g/km)"]].to_csv("results/new_lr_1_xgb_cstm_encoded.csv", index=False)

## Xgboost 6
With new encoding (encoding counter + OHE)
- 'n_estimators':**5000**,
- 'max_depth': 35,
- 'learning_rate':**0.001**,
- 'colsample_bytree':1,
- 'subsample': 1,
- 'gamma':30,
- 'reg_alpha':0.71111,
- 'reg_lambda':0.1,
- 'objective': 'reg:squarederror',
- 'tree_method': 'hist',
- 'n_jobs':-1

Computing time:  93min(home)

In [13]:
params = {'n_estimators':5000,
          'max_depth': 35,
          'learning_rate': 0.001,
          'colsample_bytree':1,
          'subsample': 1,
          'gamma':30,
          'reg_alpha':0.71111,
          'reg_lambda':0.1,
          'objective': 'reg:squarederror',
          'tree_method': 'hist',
          'n_jobs':-1
}

model = xgb.XGBRegressor(**params)

model.fit(X_train, y_train,verbose=True)

y_pred = model.predict(X_test)

mae = mean_absolute_error(y_test, y_pred)
print("Mean Absolute Error (MAE):", mae)

Mean Absolute Error (MAE): 3.023181980018556


In [None]:
data_test["Ewltp (g/km)"] = model.predict(data_test.drop(columns="ID"))
data_test[["ID","Ewltp (g/km)"]].to_csv("results/new_lr_0001_nEst_5000_xgb_cstm_encoded.csv", index=False)

## Catboost 1

computing time 49min

In [15]:

catb = CatBoostRegressor(iterations=10000,  # Nombre d'itérations (peut être ajusté)
                          depth=6,  # Profondeur de l'arbre (peut être ajusté)
                          learning_rate=0.1,  # Taux d'apprentissage (peut être ajusté)
                          loss_function='RMSE',  # Fonction de perte pour la régression
                          random_seed=42,
                          task_type='CPU',
                          thread_count=-1)

catb.fit(X_train, y_train, eval_set=(X_test, y_test), early_stopping_rounds=50, verbose=100)

predictions = catb.predict(X_test)

mae = mean_absolute_error(y_test, predictions)
print("Mean Absolute Error (MAE):", mae)

0:	learn: 166.0466744	test: 166.0927154	best: 166.0927154 (0)	total: 329ms	remaining: 54m 52s
100:	learn: 19.9021623	test: 19.8023282	best: 19.8023282 (100)	total: 30.2s	remaining: 49m 16s
200:	learn: 16.3895228	test: 16.3450710	best: 16.3450710 (200)	total: 60s	remaining: 48m 44s
300:	learn: 14.6005017	test: 14.5744841	best: 14.5744841 (300)	total: 1m 29s	remaining: 48m 6s
400:	learn: 13.3583934	test: 13.3527044	best: 13.3527044 (400)	total: 1m 58s	remaining: 47m 28s
500:	learn: 12.4735322	test: 12.4949177	best: 12.4949177 (500)	total: 2m 28s	remaining: 46m 51s
600:	learn: 11.8248216	test: 11.8637427	best: 11.8637427 (600)	total: 2m 57s	remaining: 46m 19s
700:	learn: 11.3208447	test: 11.3706236	best: 11.3706236 (700)	total: 3m 27s	remaining: 45m 52s
800:	learn: 10.8895149	test: 10.9501880	best: 10.9501880 (800)	total: 3m 57s	remaining: 45m 23s
900:	learn: 10.5598394	test: 10.6378579	best: 10.6378579 (900)	total: 4m 26s	remaining: 44m 53s
1000:	learn: 10.2236002	test: 10.3213196	best: 

In [17]:
data_test["Ewltp (g/km)"] = catb.predict(data_test.drop(columns='ID'))
data_test[["ID","Ewltp (g/km)"]].to_csv("results/new_simple_catboosting.csv", index=False)

## Catboost 2

different test:
- iterations=10000 (+ = mieux)
- depth=10,  (+= mieux) max = 16
- learning_rate=1  
- loss_function='MAE'

computing time: 72min(home)

In [12]:

catb = CatBoostRegressor(iterations=10000,  # Nombre d'itérations (peut être ajusté)
                          depth=10,  # Profondeur de l'arbre (peut être ajusté)
                          learning_rate=1,  # Taux d'apprentissage (peut être ajusté)
                          loss_function='MAE',  # Fonction de perte pour la régression
                          random_seed=42,
                          task_type='CPU',
                          thread_count=-1)

catb.fit(X_train, y_train, eval_set=(X_test, y_test), early_stopping_rounds=50, verbose=100)

predictions = catb.predict(X_test)

mae = mean_absolute_error(y_test, predictions)
print("Mean Absolute Error (MAE):", mae)

0:	learn: 28.6058121	test: 28.5322570	best: 28.5322570 (0)	total: 873ms	remaining: 2h 25m 25s
100:	learn: 5.8784600	test: 5.9230869	best: 5.9230869 (100)	total: 1m 13s	remaining: 1h 59m 39s
200:	learn: 4.9300804	test: 5.0235295	best: 5.0235295 (200)	total: 2m 25s	remaining: 1h 58m 25s
300:	learn: 4.4654082	test: 4.6011861	best: 4.6011861 (300)	total: 3m 38s	remaining: 1h 57m 15s
400:	learn: 4.1525710	test: 4.3238239	best: 4.3238239 (400)	total: 4m 50s	remaining: 1h 56m 3s
500:	learn: 3.9729140	test: 4.1695874	best: 4.1695874 (500)	total: 6m 4s	remaining: 1h 55m 3s
600:	learn: 3.8476238	test: 4.0645906	best: 4.0645906 (600)	total: 7m 16s	remaining: 1h 53m 54s
700:	learn: 3.7449758	test: 3.9826462	best: 3.9826462 (700)	total: 8m 29s	remaining: 1h 52m 36s
800:	learn: 3.6587264	test: 3.9120052	best: 3.9120052 (800)	total: 9m 42s	remaining: 1h 51m 25s
900:	learn: 3.5905199	test: 3.8547781	best: 3.8547781 (900)	total: 10m 54s	remaining: 1h 50m 14s
1000:	learn: 3.5346848	test: 3.8087109	best:

In [13]:
data_test["Ewltp (g/km)"] = catb.predict(data_test.drop(columns='ID'))
data_test[["ID","Ewltp (g/km)"]].to_csv("results/new_10k_8_1_MAE_catboosting.csv", index=False)

## Catboost 3

- iterations= 10000 
- depth= 16
- learning_rate=1  
- loss_function='RMSE' (mieux que MAE)

computing time: 109min(home)

catboost encoder

In [12]:

catb = CatBoostRegressor(iterations=10000,  # Nombre d'itérations (peut être ajusté)
                          depth=16,  # Profondeur de l'arbre (peut être ajusté)
                          learning_rate=1,  # Taux d'apprentissage (peut être ajusté)
                          loss_function='RMSE',  # Fonction de perte pour la régression
                          random_seed=42)

catb.fit(X_train, y_train, eval_set=(X_test, y_test), early_stopping_rounds=50, verbose=100)

predictions = catb.predict(X_test)

mae = mean_absolute_error(y_test, predictions)
print("Mean Absolute Error (MAE):", mae)

0:	learn: 26.5469423	test: 26.5609633	best: 26.5609633 (0)	total: 4.74s	remaining: 13h 10m 32s
100:	learn: 5.3983648	test: 6.9934249	best: 6.9934249 (100)	total: 8m	remaining: 13h 4m 54s
200:	learn: 4.7077175	test: 6.6169532	best: 6.6169532 (200)	total: 15m 46s	remaining: 12h 49m 24s
300:	learn: 4.4201075	test: 6.4774620	best: 6.4774620 (300)	total: 23m 34s	remaining: 12h 39m 52s
400:	learn: 4.2737071	test: 6.4174758	best: 6.4174758 (400)	total: 31m 21s	remaining: 12h 30m 36s
500:	learn: 4.1882630	test: 6.3826007	best: 6.3825403 (499)	total: 39m 9s	remaining: 12h 22m 33s
600:	learn: 4.1312714	test: 6.3621676	best: 6.3621676 (600)	total: 46m 57s	remaining: 12h 14m 19s
700:	learn: 4.0869902	test: 6.3518645	best: 6.3513306 (696)	total: 54m 46s	remaining: 12h 6m 30s
800:	learn: 4.0517728	test: 6.3425141	best: 6.3425141 (800)	total: 1h 2m 34s	remaining: 11h 58m 40s
900:	learn: 4.0276829	test: 6.3377529	best: 6.3377529 (900)	total: 1h 10m 21s	remaining: 11h 50m 29s
1000:	learn: 4.0089038	tes

In [14]:
data_test["Ewltp (g/km)"] = catb.predict(data_test.drop(columns='ID'))
data_test[["ID","Ewltp (g/km)"]].to_csv("results/new_10k_16_1_RMSE_catboosting.csv", index=False)