# Import + Preprocessing

In [1]:
from fonctions import Dataset
from src.preprocessing import display_missing_values

In [2]:
train=Dataset("data/train.csv")
data_train=train.load_data()
test=Dataset("data/test.csv")
data_test=test.load_data()

In [None]:
from fonctions import Preprocessor,TrainPreprocessor,TestPreprocessor
train_preprocessor=TrainPreprocessor(data_train)
test_preprocessor=TestPreprocessor(data_test)

In [18]:
variables_continues=["Fuel consumption ","Electric range (km)","ec (cm3)","z (Wh/km)","W (mm)"]

for col in variables_continues:
    train_preprocessor.outlier_detection(col)
    test_preprocessor.outlier_detection(col)

In [19]:
train_preprocessor.fill_fuel_consumption()
train_preprocessor.fill_electric_range()
train_preprocessor.fill_engine_capacity()
train_preprocessor.fill_electric_consumption()
train_preprocessor.fill_category_type()
train_preprocessor.fill_wheel_base()
train_preprocessor.fill_At_1()
train_preprocessor.fill_At_2()
train_preprocessor.fill_mass()
train_preprocessor.fill_engine_power()
train_preprocessor.encode_country()
train_preprocessor.encode_manufacture_pooling()
train_preprocessor.last_step()

test_preprocessor.fill_fuel_consumption()
test_preprocessor.fill_electric_range()
test_preprocessor.fill_engine_capacity()
test_preprocessor.fill_electric_consumption()
test_preprocessor.fill_category_type()
test_preprocessor.fill_wheel_base()
test_preprocessor.fill_At_1()
test_preprocessor.fill_At_2()
test_preprocessor.fill_mass()
test_preprocessor.fill_engine_power()
test_preprocessor.encode_country()
test_preprocessor.encode_manufacture_pooling()
test_preprocessor.last_step()

In [5]:
data_train = train_preprocessor.encode_that_var("Ct")
data_train = train_preprocessor.encode_that_var("Cr")
data_test = test_preprocessor.encode_that_var("Ct")
data_test = test_preprocessor.encode_that_var("Cr")

In [5]:
def create_conforme(df):
    df['conforme'] = df['Tan'].isna()
    df['conforme'] = df['conforme'].apply(lambda x: 1 if x==False else 0)
    df.drop(columns='Tan', inplace=True)
    pass
def compute_surface(obs):
    max_largeur= max(obs['At1 (mm)'], obs['At2 (mm)'])
    return obs['W (mm)']*obs['At1 (mm)'] if max_largeur == obs['At1 (mm)'] else obs['W (mm)'] * obs['At2 (mm)']

def create_surface(df):
    df['surface']= df.apply(compute_surface, axis=1)
    pass

def group_fuel_types(category: str):
    if category in ['PETROL/ELECTRIC', 'DIESEL/ELECTRIC']:
        return "HYBRID"
    elif category in ['NG-BIOMETHANE', 'HYDROGEN', 'NG','E85']:
        return "BIO-FUEL"
    elif category in ['PETROL','LPG'] :
        return 'PETROL'
    else:
        return category
def create_carburant(df):
    df['carburant']= df['Ft'].apply(group_fuel_types)
    df.drop(columns='Ft',inplace = True)

In [6]:
create_conforme(data_train)
create_surface(data_train)
create_carburant(data_train)

create_conforme(data_test)
create_surface(data_test)
create_carburant(data_test)

In [8]:
data_train = train_preprocessor.encode_that_var("carburant")
data_test = test_preprocessor.encode_that_var("carburant")

In [20]:
drop_this_cuz_no_use=['VFN', 'Mh', 'Man', 'T', 'Mk', 'Cn','Mt']
drop_this_cuz_personnal_choice = ['W (mm)', 'At1 (mm)', 'At2 (mm)','Fm','Erwltp (g/km)']

data_train.drop(columns=drop_this_cuz_no_use, inplace=True)
data_test.drop(columns=drop_this_cuz_no_use, inplace=True)

In [21]:
data_train.drop(columns='ID',inplace=True)

In [22]:
import numpy as np
import pandas as pd
import seaborn as sns
from src.preprocessing import display_missing_values
import matplotlib.pyplot as plt
pd.set_option('display.max_columns', None)
pd.set_option('display.float_format', '{:.2f}'.format)
sns.set_theme(style="ticks", palette="pastel")

from sklearn.preprocessing import PolynomialFeatures
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_absolute_error
from sklearn.ensemble import RandomForestRegressor

# Random Forest models


## Random Forest (numerical)
No drop dupplicates, only keep numerical cols

MAE: 3.2843

In [12]:

columns_to_keep = ['m (kg)', 'Ewltp (g/km)', 'W (mm)', 'At1 (mm)', 'At2 (mm)', 'ec (cm3)', 'ep (KW)', 'z (Wh/km)', 'Fuel consumption ', 'Electric range (km)']

train, test = train_test_split(data_train[data_train.columns.intersection(columns_to_keep)], test_size=0.33, random_state=42)

train.reset_index(drop=True, inplace=True)
test.reset_index(drop=True, inplace=True)

X_train, y_train = train.drop(columns=["Ewltp (g/km)"]), train["Ewltp (g/km)"]
X_test, y_test = test.drop(columns=["Ewltp (g/km)"]), test["Ewltp (g/km)"]

In [13]:
random_forest = RandomForestRegressor(random_state=42)

random_forest.fit(X_train, y_train)

y_pred = random_forest.predict(X_test)

mae = mean_absolute_error(y_test, y_pred)
print("Mean Absolute Error (MAE):", mae)

Mean Absolute Error (MAE): 3.2811834066371595


In [16]:
data_test["Ewltp (g/km)"] = random_forest.predict(data_test[data_test.columns.intersection(columns_to_keep)])

In [17]:
data_test[["ID","Ewltp (g/km)"]].to_csv("data/reg_rf_numerical_no_drop_dups.csv", index=False)

## RF numerical (new set of predictors)

In [13]:

columns_to_keep = ['m (kg)', 'Ewltp (g/km)', 'W (mm)', 'At1 (mm)', 'At2 (mm)', 'ec (cm3)', 'ep (KW)', 'z (Wh/km)', 'Fuel consumption ']

train, test = train_test_split(data_train[data_train.columns.intersection(columns_to_keep)], test_size=0.33, random_state=42)

train.reset_index(drop=True, inplace=True)
test.reset_index(drop=True, inplace=True)

X_train, y_train = train.drop(columns=["Ewltp (g/km)"]), train["Ewltp (g/km)"]
X_test, y_test = test.drop(columns=["Ewltp (g/km)"]), test["Ewltp (g/km)"]

In [14]:
random_forest = RandomForestRegressor(random_state=42, n_jobs=-1)

random_forest.fit(X_train, y_train)

y_pred = random_forest.predict(X_test)

mae = mean_absolute_error(y_test, y_pred)
print("Mean Absolute Error (MAE):", mae)

Mean Absolute Error (MAE): 3.314147394462553


In [17]:
data_test["Ewltp (g/km)"] = random_forest.predict(data_test[data_test.columns.intersection(columns_to_keep)])

In [18]:
data_test[["ID","Ewltp (g/km)"]].to_csv("data/reg_rf_num_dups_no_e_range.csv", index=False)

## RF outliers numerical

In [16]:

columns_to_keep = ['m (kg)', 'Ewltp (g/km)', 'W (mm)', 'At1 (mm)', 'At2 (mm)', 'ec (cm3)', 'ep (KW)', 'z (Wh/km)', 'Fuel consumption ', 'Electric range (km)']

train, test = train_test_split(data_train[data_train.columns.intersection(columns_to_keep)], test_size=0.33, random_state=42)

train.reset_index(drop=True, inplace=True)
test.reset_index(drop=True, inplace=True)

X_train, y_train = train.drop(columns=["Ewltp (g/km)"]), train["Ewltp (g/km)"]
X_test, y_test = test.drop(columns=["Ewltp (g/km)"]), test["Ewltp (g/km)"]

In [18]:
random_forest = RandomForestRegressor(random_state=42, n_jobs=-1)

random_forest.fit(X_train, y_train)

y_pred = random_forest.predict(X_test)

mae = mean_absolute_error(y_test, y_pred)
print("Mean Absolute Error (MAE):", mae)

Mean Absolute Error (MAE): 3.2811834066371603


In [19]:
data_test["Ewltp (g/km)"] = random_forest.predict(data_test[data_test.columns.intersection(columns_to_keep)])

In [20]:
data_test[["ID","Ewltp (g/km)"]].to_csv("data/reg_rf_outlier.csv", index=False)

## RF outliers numerical + country

In [9]:

columns_to_keep = ['m (kg)', 'Ewltp (g/km)', 'W (mm)', 'At1 (mm)',
                    'At2 (mm)', 'ec (cm3)', 'ep (KW)', 'z (Wh/km)',
                      'Fuel consumption ', 'Electric range (km)','Country']

train, test = train_test_split(data_train[data_train.columns.intersection(columns_to_keep)], test_size=0.33, random_state=42)

train.reset_index(drop=True, inplace=True)
test.reset_index(drop=True, inplace=True)

X_train, y_train = train.drop(columns=["Ewltp (g/km)"]), train["Ewltp (g/km)"]
X_test, y_test = test.drop(columns=["Ewltp (g/km)"]), test["Ewltp (g/km)"]

In [10]:
random_forest = RandomForestRegressor(random_state=42, n_jobs=-1)

random_forest.fit(X_train, y_train)

y_pred = random_forest.predict(X_test)

mae = mean_absolute_error(y_test, y_pred)
print("Mean Absolute Error (MAE):", mae)

Mean Absolute Error (MAE): 3.185450881133795


In [13]:
data_test["Ewltp (g/km)"] = random_forest.predict(data_test[data_test.columns.intersection(columns_to_keep)])
data_test[["ID","Ewltp (g/km)"]].to_csv("data/reg_rf_country.csv", index=False)

### little test

In [9]:

columns_to_keep = ['m (kg)', 'Ewltp (g/km)', 'W (mm)', 'At1 (mm)',
                    'At2 (mm)', 'ec (cm3)', 'ep (KW)', 'z (Wh/km)',
                      'Fuel consumption ', 'Electric range (km)','Country']

train, test = train_test_split(data_train[data_train.columns.intersection(columns_to_keep)], test_size=0.33, random_state=42)

train.reset_index(drop=True, inplace=True)
test.reset_index(drop=True, inplace=True)

X_train, y_train = train.drop(columns=["Ewltp (g/km)"]), train["Ewltp (g/km)"]
X_test, y_test = test.drop(columns=["Ewltp (g/km)"]), test["Ewltp (g/km)"]

In [10]:
random_forest = RandomForestRegressor(criterion="absolute_error", random_state=42, n_jobs=-1)

random_forest.fit(X_train, y_train)

y_pred = random_forest.predict(X_test)

mae = mean_absolute_error(y_test, y_pred)
print("Mean Absolute Error (MAE):", mae)

KeyboardInterrupt: 

In [None]:
data_test["Ewltp (g/km)"] = random_forest.predict(data_test[data_test.columns.intersection(columns_to_keep)])
data_test[["ID","Ewltp (g/km)"]].to_csv("data/reg_rf_outlier_country.csv", index=False)

## RF outliers numerical + Country + Mp

In [23]:

columns_to_keep = ['m (kg)','Mp', 'Ewltp (g/km)', 
                   'W (mm)', 'At1 (mm)', 'At2 (mm)',
                     'ec (cm3)', 'ep (KW)', 'z (Wh/km)', 
                     'Fuel consumption ', 'Electric range (km)','Country']

train, test = train_test_split(data_train[data_train.columns.intersection(columns_to_keep)], test_size=0.33, random_state=42)

train.reset_index(drop=True, inplace=True)
test.reset_index(drop=True, inplace=True)

X_train, y_train = train.drop(columns=["Ewltp (g/km)"]), train["Ewltp (g/km)"]
X_test, y_test = test.drop(columns=["Ewltp (g/km)"]), test["Ewltp (g/km)"]

In [24]:
random_forest = RandomForestRegressor(random_state=42, n_jobs=-1)

random_forest.fit(X_train, y_train)

y_pred = random_forest.predict(X_test)

mae = mean_absolute_error(y_test, y_pred)
print("Mean Absolute Error (MAE):", mae)

Mean Absolute Error (MAE): 3.1826152819810107


In [25]:
data_test["Ewltp (g/km)"] = random_forest.predict(data_test[data_test.columns.intersection(columns_to_keep)])
data_test[["ID","Ewltp (g/km)"]].to_csv("data/reg_rf_outlier_country_mp.csv", index=False)

## RF (numerical) + K fold

In [21]:
columns_to_keep_no_target = ['m (kg)', 'W (mm)', 'At1 (mm)',
                    'At2 (mm)', 'ec (cm3)', 'ep (KW)', 'z (Wh/km)',
                      'Fuel consumption ', 'Electric range (km)','Country']

X_train = data_train[data_train.columns.intersection(columns_to_keep_no_target)]
y_train= data_train["Ewltp (g/km)"]

In [22]:
from sklearn.model_selection import KFold
from sklearn.ensemble import RandomForestRegressor

model = RandomForestRegressor(random_state=42, n_jobs=-1)

k_folds = 3

kf = KFold(n_splits=k_folds, shuffle=True, random_state=42)

# Créez des listes pour stocker les modèles et les scores
models = []
scores = []

# Effectuez la validation croisée
for train_index, test_index in kf.split(X_train):
    X_train_fold, X_val_fold = X_train.iloc[train_index], X_train.iloc[test_index]
    y_train_fold, y_val_fold = y_train.iloc[train_index], y_train.iloc[test_index]

    # Entraînez le modèle sur X_train_fold et y_train_fold
    model.fit(X_train_fold, y_train_fold)

    # Évaluez le modèle sur X_val_fold (vous pouvez utiliser la métrique de votre choix, par exemple MAE)
    y_pred = model.predict(X_val_fold)
    mae = mean_absolute_error(y_val_fold, y_pred)
    
    # Stockez le modèle et le score
    models.append(model)
    scores.append(mae)

# Affichez les scores MAE pour chaque pli
for i, mae in enumerate(scores):
    print(f"Fold {i + 1} MAE: {mae}")

# Calculez la moyenne des scores MAE
average_mae = sum(scores) / k_folds
print(f"Average MAE: {average_mae}")

best_model_index = scores.index(min(scores))
best_model = models[best_model_index]

print(f"min MAE: {min(scores)}")

KeyboardInterrupt: 

In [None]:
data_test["Ewltp (g/km)"] = best_model.predict(data_test.drop(columns='ID'))

In [None]:
data_test[["ID","Ewltp (g/km)"]].to_csv("data/reg_rf_numerical_kf.csv", index=False)

## RF validation set 

In [26]:
columns_to_keep = ['m (kg)','Mp', 'Ewltp (g/km)', 
                   'W (mm)', 'At1 (mm)', 'At2 (mm)',
                     'ec (cm3)', 'ep (KW)', 'z (Wh/km)', 
                     'Fuel consumption ', 'Electric range (km)','Country']
train_data, rest_data = train_test_split(data_train[data_train.columns.intersection(columns_to_keep)], test_size=0.33, random_state=42)

validation_data, test_data = train_test_split(rest_data, test_size=0.5, random_state=42)

# Vérifiez les tailles des ensembles
print("Taille de l'ensemble d'entraînement :", len(train_data))
print("Taille de l'ensemble de validation :", len(validation_data))
print("Taille de l'ensemble de test :", len(test_data))


Taille de l'ensemble d'entraînement : 5073004
Taille de l'ensemble de validation : 1249322
Taille de l'ensemble de test : 1249323


In [27]:

# Divisez les ensembles en caractéristiques (X) et cible (y)
X_train, y_train = train_data.drop(columns=["Ewltp (g/km)"]), train_data["Ewltp (g/km)"]
X_val, y_val = validation_data.drop(columns=["Ewltp (g/km)"]), validation_data["Ewltp (g/km)"]
X_test, y_test = test_data.drop(columns=["Ewltp (g/km)"]), test_data["Ewltp (g/km)"]

param_grid = {
    'n_estimators': [50, 100, 150],
    'max_depth': [5, 7, 10],
}

# Créez votre modèle
base_model = RandomForestRegressor(random_state=42, n_jobs=-1)

# Utilisez GridSearchCV pour la recherche sur grille avec validation croisée
grid_search = GridSearchCV(estimator=base_model, param_grid=param_grid, scoring='neg_mean_absolute_error', cv=3)
grid_search.fit(X_val, y_val)

# Affichez les meilleurs hyperparamètres trouvés
best_params = grid_search.best_params_
print("Meilleurs hyperparamètres :", best_params)

# Utilisez le meilleur modèle trouvé sur l'ensemble de test
best_model = grid_search.best_estimator_
y_test_pred = best_model.predict(X_test)
mae_test = mean_absolute_error(y_test, y_test_pred)
print("MAE sur l'ensemble de test avec le meilleur modèle :", mae_test)



Meilleurs hyperparamètres : {'max_depth': 10, 'n_estimators': 150}
MAE sur l'ensemble de test avec le meilleur modèle : 10.80078252858729


## RF with test_fonctions script

### Import

In [1]:
from test_fonctions import Dataset
from src.preprocessing import display_missing_values

In [2]:
train=Dataset("data/train.csv")
data_train=train.load_data()
test=Dataset("data/test.csv")
data_test=test.load_data()

In [3]:
from test_fonctions import Preprocessor,TrainPreprocessor,TestPreprocessor
train_preprocessor=TrainPreprocessor(data_train)
test_preprocessor=TestPreprocessor(data_test)

In [4]:
variables_continues=["m (kg)","Mt","W (mm)","At1 (mm)", "At2 (mm)", 
                     "ec (cm3)", "ep (KW)", "z (Wh/km)","Fuel consumption ",
                     "Electric range (km)"]

for col in variables_continues:
    train_preprocessor.outlier_detection(col)
    test_preprocessor.outlier_detection(col)

In [5]:
train_preprocessor.fill_fuel_consumption()
train_preprocessor.fill_electric_range()
train_preprocessor.fill_engine_capacity()
train_preprocessor.fill_electric_consumption()
train_preprocessor.fill_category_type()
train_preprocessor.fill_wheel_base()
train_preprocessor.fill_At_1()
train_preprocessor.fill_At_2()
train_preprocessor.fill_mass()
train_preprocessor.fill_test_mass()
train_preprocessor.fill_fuel_mode()
train_preprocessor.fill_engine_power()
train_preprocessor.encode_country()
train_preprocessor.encode_manufacture_pooling()
train_preprocessor.encode_fuel_mode()
train_preprocessor.encode_fuel_type()
train_preprocessor.last_step()

test_preprocessor.fill_fuel_consumption()
test_preprocessor.fill_electric_range()
test_preprocessor.fill_engine_capacity()
test_preprocessor.fill_electric_consumption()
test_preprocessor.fill_category_type()
test_preprocessor.fill_wheel_base()
test_preprocessor.fill_At_1()
test_preprocessor.fill_At_2()
test_preprocessor.fill_mass()
test_preprocessor.fill_test_mass()
test_preprocessor.fill_fuel_mode()
test_preprocessor.fill_engine_power()
test_preprocessor.encode_country()
test_preprocessor.encode_manufacture_pooling()
test_preprocessor.encode_fuel_mode()
test_preprocessor.encode_fuel_type()
test_preprocessor.last_step()

In [6]:
def create_conforme(df):
    df['conforme'] = df['Tan'].isna()
    df['conforme'] = df['conforme'].apply(lambda x: 1 if x==False else 0)
    df.drop(columns='Tan', inplace=True)
    pass
def compute_surface(obs):
    max_largeur= max(obs['At1 (mm)'], obs['At2 (mm)'])
    return obs['W (mm)']*obs['At1 (mm)'] if max_largeur == obs['At1 (mm)'] else obs['W (mm)'] * obs['At2 (mm)']

def create_surface(df):
    df['surface']= df.apply(compute_surface, axis=1)
    pass

create_conforme(data_train)
create_surface(data_train)

create_conforme(data_test)
create_surface(data_test)

In [7]:
drop_this_cuz_no_use=['VFN', 'Mh', 'Man', 'T', 'Mk', 'Cn']
# drop_this_cuz_personnal_choice = ['W (mm)', 'At1 (mm)', 'At2 (mm)','Fm','Erwltp (g/km)']

data_train.drop(columns=drop_this_cuz_no_use, inplace=True)
data_test.drop(columns=drop_this_cuz_no_use, inplace=True)

data_train.drop(columns='ID',inplace=True)

In [8]:
import numpy as np
import pandas as pd
import seaborn as sns
from src.preprocessing import display_missing_values
import matplotlib.pyplot as plt
pd.set_option('display.max_columns', None)
pd.set_option('display.float_format', '{:.2f}'.format)
sns.set_theme(style="ticks", palette="pastel")

from sklearn.preprocessing import PolynomialFeatures
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_absolute_error
from sklearn.ensemble import RandomForestRegressor

### Model testing

In [9]:
data_train.drop(columns=['Ct','Cr','Erwltp (g/km)'], inplace=True)
data_test.drop(columns=['Ct','Cr','Erwltp (g/km)'], inplace=True)

In [10]:
data_train.columns

Index(['Country', 'Mp', 'm (kg)', 'Mt', 'Ewltp (g/km)', 'W (mm)', 'At1 (mm)',
       'At2 (mm)', 'Ft', 'Fm', 'ec (cm3)', 'ep (KW)', 'z (Wh/km)',
       'Fuel consumption ', 'Electric range (km)', 'flag_Fuel consumption ',
       'flag_Electric range (km)', 'flag_ec (cm3)', 'flag_z (Wh/km)',
       'flag_W (mm)', 'flag_Mt', 'conforme', 'surface'],
      dtype='object')

In [11]:
train, test = train_test_split(data_train, test_size=0.33, random_state=42)

train.reset_index(drop=True, inplace=True)
test.reset_index(drop=True, inplace=True)

X_train, y_train = train.drop(columns=["Ewltp (g/km)"]), train["Ewltp (g/km)"]
X_test, y_test = test.drop(columns=["Ewltp (g/km)"]), test["Ewltp (g/km)"]

In [12]:
random_forest = RandomForestRegressor(random_state=42, n_jobs=-1)

random_forest.fit(X_train, y_train)

y_pred = random_forest.predict(X_test)

mae = mean_absolute_error(y_test, y_pred)
print("Mean Absolute Error (MAE):", mae)

Mean Absolute Error (MAE): 2.976801154055315


In [13]:
data_test["Ewltp (g/km)"] = random_forest.predict(data_test.drop(columns='ID'))
data_test[["ID","Ewltp (g/km)"]].to_csv("data/rf_num_and_cat.csv", index=False)

In [24]:
random_forest.get_params()

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'criterion': 'squared_error',
 'max_depth': None,
 'max_features': 1.0,
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 100,
 'n_jobs': -1,
 'oob_score': False,
 'random_state': 42,
 'verbose': 0,
 'warm_start': False}

### Polynomial


In [10]:
variables_continues=["m (kg)","Mt","W (mm)","At1 (mm)", "At2 (mm)", 
                     "ec (cm3)", "ep (KW)", "z (Wh/km)","Fuel consumption ",
                     "Electric range (km)","surface"]

others= [col for col in data_train.columns if col not in variables_continues]

In [11]:
poly = PolynomialFeatures(degree=2,)

# Application de la transformation aux variables continues
variables_continues_transformees = poly.fit_transform(data_train[variables_continues])

data_train_transforme = pd.DataFrame(variables_continues_transformees, columns=poly.get_feature_names_out(variables_continues))
data_train= pd.concat([data_train[others],data_train_transforme], axis=1)

In [12]:
others= [col for col in data_test.columns if col not in variables_continues]

variables_continues_transformees = poly.fit_transform(data_test[variables_continues])

data_test_transforme = pd.DataFrame(variables_continues_transformees, columns=poly.get_feature_names_out(variables_continues))
data_test= pd.concat([data_test[list(filter(lambda x: x !='Ewltp (g/km)',others))],data_train_transforme], axis=1)

In [13]:
train, test = train_test_split(data_train, test_size=0.33, random_state=42)

train.reset_index(drop=True, inplace=True)
test.reset_index(drop=True, inplace=True)

X_train, y_train = train.drop(columns=["Ewltp (g/km)"]), train["Ewltp (g/km)"]
X_test, y_test = test.drop(columns=["Ewltp (g/km)"]), test["Ewltp (g/km)"]

In [14]:
random_forest = RandomForestRegressor(random_state=42, n_jobs=-1)

random_forest.fit(X_train, y_train)

y_pred = random_forest.predict(X_test)

mae = mean_absolute_error(y_test, y_pred)
print("Mean Absolute Error (MAE):", mae)

Mean Absolute Error (MAE): 2.9867784350291724


In [16]:
data_test["Ewltp (g/km)"] = random_forest.predict(data_test.drop(columns='ID'))
data_test[["ID","Ewltp (g/km)"]].to_csv("data/rf_poly2_num_and_cat.csv", index=False)

ValueError: Input X contains NaN.
RandomForestRegressor does not accept missing values encoded as NaN natively. For supervised learning, you might want to consider sklearn.ensemble.HistGradientBoostingClassifier and Regressor which accept missing values encoded as NaNs natively. Alternatively, it is possible to preprocess the data, for instance by using an imputer transformer in a pipeline or drop samples with missing values. See https://scikit-learn.org/stable/modules/impute.html You can find a list of all estimators that handle NaN values at the following page: https://scikit-learn.org/stable/modules/impute.html#estimators-that-handle-nan-values

### Another RF test

Tourne en 21 min. Ccl: les variables flags sont pas forcément géniales à avoir.

In [10]:

columns_to_keep = ['Country', 'Mp', 'm (kg)', 'Mt', 'Ewltp (g/km)', 'W (mm)', 'At1 (mm)',
       'At2 (mm)', 'Ft', 'Fm', 'ec (cm3)', 'ep (KW)', 'z (Wh/km)',
       'Fuel consumption ', 'Electric range (km)', 'conforme', 'surface']

train, test = train_test_split(data_train[data_train.columns.intersection(columns_to_keep)], test_size=0.33, random_state=42)

train.reset_index(drop=True, inplace=True)
test.reset_index(drop=True, inplace=True)

X_train, y_train = train.drop(columns=["Ewltp (g/km)"]), train["Ewltp (g/km)"]
X_test, y_test = test.drop(columns=["Ewltp (g/km)"]), test["Ewltp (g/km)"]

In [11]:
random_forest = RandomForestRegressor(random_state=42, n_jobs=-1)

random_forest.fit(X_train, y_train)

y_pred = random_forest.predict(X_test)

mae = mean_absolute_error(y_test, y_pred)
print("Mean Absolute Error (MAE):", mae)

Mean Absolute Error (MAE): 2.9767661924047735


In [13]:
data_test["Ewltp (g/km)"] = random_forest.predict(data_test[data_test.columns.intersection(columns_to_keep)])
data_test[["ID","Ewltp (g/km)"]].to_csv("data/rf_no_flag_num_cat.csv", index=False)

## RF

No drop duplicates, outliers treatment, using numericals. 

Ordinal Encoding  pour: "Ct", "Cr","Fm"

Label Encoding pour : "Country","Mp"

Utilisation colonnes surface, conforme. Pas d'utilisation des colonnes flag (outliers = 1 else 0)

### Import + Preprocessing

In [1]:
from fonctions import Dataset
from src.preprocessing import display_missing_values

train=Dataset("data/train.csv")
data_train=train.load_data()
test=Dataset("data/test.csv")
data_test=test.load_data()

from fonctions import TrainPreprocessor,TestPreprocessor
train_preprocessor=TrainPreprocessor(data_train)
test_preprocessor=TestPreprocessor(data_test)

In [2]:
variables_continues=["m (kg)","Mt","W (mm)","At1 (mm)", "At2 (mm)", 
                     "ec (cm3)", "ep (KW)", "z (Wh/km)","Fuel consumption ",
                     "Electric range (km)"]

for col in variables_continues:
    train_preprocessor.outlier_detection(col)
    test_preprocessor.outlier_detection(col)

In [3]:
train_preprocessor.fill_fuel_consumption()
train_preprocessor.fill_electric_range()
train_preprocessor.fill_engine_capacity()
train_preprocessor.fill_electric_consumption()
train_preprocessor.fill_category_type()
train_preprocessor.fill_wheel_base()
train_preprocessor.fill_At_1()
train_preprocessor.fill_At_2()
train_preprocessor.fill_mass()
train_preprocessor.fill_test_mass()
train_preprocessor.fill_fuel_mode()
train_preprocessor.fill_engine_power()
train_preprocessor.encode_country()
train_preprocessor.encode_manufacture_pooling()
train_preprocessor.encode_category_registered() #new
train_preprocessor.encode_category_type() #new
train_preprocessor.encode_fuel_mode()
train_preprocessor.encode_fuel_type()
train_preprocessor.last_step()

test_preprocessor.fill_fuel_consumption()
test_preprocessor.fill_electric_range()
test_preprocessor.fill_engine_capacity()
test_preprocessor.fill_electric_consumption()
test_preprocessor.fill_category_type()
test_preprocessor.fill_wheel_base()
test_preprocessor.fill_At_1()
test_preprocessor.fill_At_2()
test_preprocessor.fill_mass()
test_preprocessor.fill_test_mass()
test_preprocessor.fill_fuel_mode()
test_preprocessor.fill_engine_power()
test_preprocessor.encode_country()
test_preprocessor.encode_manufacture_pooling()
test_preprocessor.encode_category_registered() #new
test_preprocessor.encode_category_type() #new
test_preprocessor.encode_fuel_mode()
test_preprocessor.encode_fuel_type()
test_preprocessor.last_step()

In [4]:
def create_conforme(df):
    df['conforme'] = df['Tan'].isna()
    df['conforme'] = df['conforme'].apply(lambda x: 1 if x==False else 0)
    df.drop(columns='Tan', inplace=True)
    pass
def compute_surface(obs):
    max_largeur= max(obs['At1 (mm)'], obs['At2 (mm)'])
    return obs['W (mm)']*obs['At1 (mm)'] if max_largeur == obs['At1 (mm)'] else obs['W (mm)'] * obs['At2 (mm)']

def create_surface(df):
    df['surface']= df.apply(compute_surface, axis=1)
    pass

# Feature Engineering
create_conforme(data_train)
create_surface(data_train)

create_conforme(data_test)
create_surface(data_test)

In [5]:
drop_this_cuz_no_use=['VFN', 'Mh', 'Man', 'T', 'Mk', 'Cn','Erwltp (g/km)']

data_train.drop(columns=drop_this_cuz_no_use, inplace=True)
data_test.drop(columns=drop_this_cuz_no_use, inplace=True)

data_train.drop(columns='ID',inplace=True)

### Model Testing

In [6]:
import pandas as pd
pd.set_option('display.max_columns', None)
pd.set_option('display.float_format', '{:.2f}'.format)

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
from sklearn.ensemble import RandomForestRegressor

In [7]:
data_train.columns

Index(['Country', 'Mp', 'Ct', 'Cr', 'm (kg)', 'Mt', 'Ewltp (g/km)', 'W (mm)',
       'At1 (mm)', 'At2 (mm)', 'Ft', 'Fm', 'ec (cm3)', 'ep (KW)', 'z (Wh/km)',
       'Fuel consumption ', 'Electric range (km)', 'flag_m (kg)', 'flag_Mt',
       'flag_W (mm)', 'flag_At1 (mm)', 'flag_At2 (mm)', 'flag_ec (cm3)',
       'flag_ep (KW)', 'flag_z (Wh/km)', 'flag_Fuel consumption ',
       'flag_Electric range (km)', 'conforme', 'surface'],
      dtype='object')

In [8]:

columns_to_keep = ['Country', 'Mp', 'Ct', 'Cr', 'm (kg)', 'Mt', 'Ewltp (g/km)', 'W (mm)',
       'At1 (mm)', 'At2 (mm)', 'Ft', 'Fm', 'ec (cm3)', 'ep (KW)', 'z (Wh/km)',
       'Fuel consumption ', 'Electric range (km)', 'conforme', 'surface']

train, test = train_test_split(data_train[data_train.columns.intersection(columns_to_keep)], test_size=0.33, random_state=42)

train.reset_index(drop=True, inplace=True)
test.reset_index(drop=True, inplace=True)

X_train, y_train = train.drop(columns=["Ewltp (g/km)"]), train["Ewltp (g/km)"]
X_test, y_test = test.drop(columns=["Ewltp (g/km)"]), test["Ewltp (g/km)"]

In [9]:
random_forest = RandomForestRegressor(random_state=42, n_jobs=-1)

random_forest.fit(X_train, y_train)

y_pred = random_forest.predict(X_test)

mae = mean_absolute_error(y_test, y_pred)
print("Mean Absolute Error (MAE):", mae)

Mean Absolute Error (MAE): 2.974584586928926


In [None]:
data_test["Ewltp (g/km)"] = random_forest.predict(data_test[data_test.columns.intersection(columns_to_keep)])
data_test[["ID","Ewltp (g/km)"]].to_csv("data/rf_more_encoded.csv", index=False)

# XGboost

In [None]:
train, test = train_test_split(data_train, test_size=0.33, random_state=42)

train.reset_index(drop=True, inplace=True)
test.reset_index(drop=True, inplace=True)

X_train, y_train = train.drop(columns=["Ewltp (g/km)"]), train["Ewltp (g/km)"]
X_test, y_test = test.drop(columns=["Ewltp (g/km)"]), test["Ewltp (g/km)"]

In [10]:
train, test = train_test_split(data_train,test_size=0.33,random_state=42)

train.reset_index(drop=True, inplace=True ) #car ça fout la merde dans l'index
test.reset_index(drop = True, inplace = True)

X_train, y_train =train.drop(columns=["Ewltp (g/km)"]), train["Ewltp (g/km)"]
X_test, y_test =test.drop(columns=["Ewltp (g/km)"]), test["Ewltp (g/km)"]

In [11]:
from xgboost import XGBRegressor
from sklearn.model_selection import RandomizedSearchCV

# Créez un modèle XGBoost
xgb_model = XGBRegressor(objective="reg:squarederror", random_state=42)

# Liste des hyperparamètres à régler et leurs plages de valeurs
param_dist = {
    'learning_rate': [0.01, 0.1, 0.2],
    'n_estimators': [100, 200, 300],
    'max_depth': [3, 4, 5],
    'min_child_weight': [1, 2, 3],
    'n_jobs':[-1,-1,-1]
}

# Recherche aléatoire des hyperparamètres
random_search = RandomizedSearchCV(xgb_model, param_distributions=param_dist, n_iter=10, cv=5, n_jobs=-1, random_state=42)

# Exécutez la recherche aléatoire sur les données d'entraînement
random_search.fit(X_train, y_train)

# Obtenez les meilleurs hyperparamètres
best_params = random_search.best_params_
print("Meilleurs hyperparamètres :", best_params)

# Utilisez les meilleurs hyperparamètres pour entraîner le modèle final
best_xgb_model = XGBRegressor(objective="reg:squarederror", random_state=42, **best_params)
best_xgb_model.fit(X_train, y_train)

y_pred =best_xgb_model.predict(X_test)
mae = mean_absolute_error(y_test, y_pred)
print("Mean Absolute Error (MAE):", mae)

Meilleurs hyperparamètres : {'n_jobs': -1, 'n_estimators': 300, 'min_child_weight': 2, 'max_depth': 5, 'learning_rate': 0.2}
Mean Absolute Error (MAE): 6.035682412861069


# Gradient boosting

In [14]:
from sklearn.ensemble import HistGradientBoostingRegressor

In [20]:
hist_gradient_boosting = HistGradientBoostingRegressor(interaction_cst="pairwise",warm_start=True,
                                                        learning_rate=0.3, random_state=42,
                                                        categorical_features=['Country', 'Mp','Ft'])

# Entraîner le modèle sur l'ensemble d'entraînement
hist_gradient_boosting.fit(X_train, y_train)

# Faire des prédictions sur l'ensemble de test
y_pred = hist_gradient_boosting.predict(X_test)

# Calculer la MAE
mae = mean_absolute_error(y_test, y_pred)
print("Mean Absolute Error (MAE):", mae)


Mean Absolute Error (MAE): 7.848606144154054


In [None]:
from sklearn.model_selection import train_test_split

X_train, X_temp, y_train, y_temp = train_test_split(data_train.drop(columns='Ewltp (g/km)'), data_train['Ewltp (g/km)']
                                                    , test_size=0.33, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)


In [21]:
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import HistGradientBoostingRegressor

# Définir la grille des hyperparamètres
param_grid = {
    'learning_rate': [0.01, 0.1, 0.2,0.3,0.8,0.9],
    'max_depth': [3, 5, 7,10,12,20],
    'max_iter': [100, 200, 300,400,500,600],
    'min_samples_leaf': [1, 2, 4,6,7,9],
    'l2_regularization': [0.0, 0.1, 0.2,0.4,0.6,0.8]
}

# Initialiser le modèle
hgb_regressor = HistGradientBoostingRegressor(random_state=42)

# Utiliser la recherche aléatoire
random_search = RandomizedSearchCV(hgb_regressor, param_distributions=param_grid, n_iter=10, cv=3, scoring='neg_mean_absolute_error', random_state=42)

# Exécuter la recherche
random_search.fit(X_train, y_train)




In [22]:
y_pred = random_search.predict(X_test)

# Calculer la MAE
mae = mean_absolute_error(y_test, y_pred)
print("Mean Absolute Error (MAE):", mae)


Mean Absolute Error (MAE): 4.0602364248951


In [23]:
random_search.best_params_

{'min_samples_leaf': 4,
 'max_iter': 600,
 'max_depth': 20,
 'learning_rate': 0.3,
 'l2_regularization': 0.0}

# Réseau de neurones pour le fun

In [25]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import mean_absolute_error
from tensorflow import keras
from tensorflow.keras import layers

# Charger vos données
# Assurez-vous que 'Country', 'Mp', 'Ft' sont déjà label encodés

# Séparation des features et de la target
X = data_train.drop(columns=["Ewltp (g/km)"])
y = data_train["Ewltp (g/km)"]

# Séparation des données en ensembles d'entraînement et de test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Normalisation des données
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Définition du modèle
model = keras.Sequential([
    layers.Dense(64, activation='relu', input_shape=(X_train_scaled.shape[1],)),
    layers.Dense(32, activation='relu'),
    layers.Dense(1)  # Couche de sortie pour la régression
])

# Compilation du modèle
model.compile(optimizer='adam', loss='mean_squared_error')

# Entraînement du modèle
model.fit(X_train_scaled, y_train, epochs=50, batch_size=32, validation_split=0.2)

# Évaluation sur l'ensemble de test
y_pred = model.predict(X_test_scaled).flatten()

# Calcul de la MAE
mae = mean_absolute_error(y_test, y_pred)
print("Mean Absolute Error (MAE):", mae)


Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50
Mean Absolute Error (MAE): 7.006961559326027
