# Preprocessing
# Import libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns


## Import dataset

In [2]:
df = pd.read_csv("regularite-mensuelle-tgv-aqst.csv",encoding="latin-1")

## Overview

In [3]:
df.columns

Index(['date', 'service', 'gare_depart', 'gare_arrivee', 'duree_moyenne',
       'nb_train_prevu', 'nb_annulation', 'commentaire_annulation',
       'nb_train_depart_retard', 'retard_moyen_depart',
       'retard_moyen_tous_trains_depart', 'commentaire_retards_depart',
       'nb_train_retard_arrivee', 'retard_moyen_arrivee',
       'retard_moyen_tous_trains_arrivee', 'commentaires_retard_arrivee',
       'nb_train_retard_sup_15', 'retard_moyen_trains_retard_sup15',
       'nb_train_retard_sup_30', 'nb_train_retard_sup_60',
       'prct_cause_externe', 'prct_cause_infra', 'prct_cause_gestion_trafic',
       'prct_cause_materiel_roulant', 'prct_cause_gestion_gare',
       'prct_cause_prise_en_charge_voyageurs'],
      dtype='object')

In [4]:
df.head()

Unnamed: 0,date,service,gare_depart,gare_arrivee,duree_moyenne,nb_train_prevu,nb_annulation,commentaire_annulation,nb_train_depart_retard,retard_moyen_depart,...,nb_train_retard_sup_15,retard_moyen_trains_retard_sup15,nb_train_retard_sup_30,nb_train_retard_sup_60,prct_cause_externe,prct_cause_infra,prct_cause_gestion_trafic,prct_cause_materiel_roulant,prct_cause_gestion_gare,prct_cause_prise_en_charge_voyageurs
0,1/1/2018,National,BORDEAUX ST JEAN,PARIS MONTPARNASSE,141,870,5,,289,11.247809,...,110,6.511118,44,8,36.134454,31.092437,10.92437,15.966387,5.042017,0.840336
1,1/1/2018,National,LA ROCHELLE VILLE,PARIS MONTPARNASSE,165,222,0,,8,2.875,...,22,5.696096,5,0,15.384615,30.769231,38.461538,11.538462,3.846154,0.0
2,1/1/2018,National,PARIS MONTPARNASSE,QUIMPER,220,248,1,,37,9.501351,...,26,7.548387,17,7,26.923077,38.461538,15.384615,19.230769,0.0,0.0
3,1/1/2018,National,PARIS MONTPARNASSE,ST MALO,156,102,0,,12,19.9125,...,8,6.724757,6,4,23.076923,46.153846,7.692308,15.384615,7.692308,0.0
4,1/1/2018,National,PARIS MONTPARNASSE,ST PIERRE DES CORPS,61,391,2,,61,7.796995,...,17,3.346487,6,0,21.212121,42.424242,9.090909,21.212121,6.060606,0.0


In [5]:
(df.isna().sum()/df.shape[0]).sort_values(ascending=True)  # Nan only in the comment columns

date                                    0.000000
prct_cause_materiel_roulant             0.000000
prct_cause_gestion_trafic               0.000000
prct_cause_infra                        0.000000
prct_cause_externe                      0.000000
nb_train_retard_sup_60                  0.000000
nb_train_retard_sup_30                  0.000000
retard_moyen_trains_retard_sup15        0.000000
nb_train_retard_sup_15                  0.000000
retard_moyen_tous_trains_arrivee        0.000000
retard_moyen_arrivee                    0.000000
prct_cause_gestion_gare                 0.000000
nb_train_retard_arrivee                 0.000000
retard_moyen_tous_trains_depart         0.000000
retard_moyen_depart                     0.000000
nb_train_depart_retard                  0.000000
nb_annulation                           0.000000
nb_train_prevu                          0.000000
duree_moyenne                           0.000000
gare_arrivee                            0.000000
gare_depart         

In [6]:
df[df.duplicated()]  # No duplicated rows

Unnamed: 0,date,service,gare_depart,gare_arrivee,duree_moyenne,nb_train_prevu,nb_annulation,commentaire_annulation,nb_train_depart_retard,retard_moyen_depart,...,nb_train_retard_sup_15,retard_moyen_trains_retard_sup15,nb_train_retard_sup_30,nb_train_retard_sup_60,prct_cause_externe,prct_cause_infra,prct_cause_gestion_trafic,prct_cause_materiel_roulant,prct_cause_gestion_gare,prct_cause_prise_en_charge_voyageurs


## Remove non-predictible features/check for errors

In [7]:
columns_cause = ['prct_cause_externe', 'prct_cause_infra', 'prct_cause_gestion_trafic',
       'prct_cause_materiel_roulant', 'prct_cause_gestion_gare',
       'prct_cause_prise_en_charge_voyageurs']

columns_retard = ['retard_moyen_depart',
       'retard_moyen_tous_trains_depart', 'commentaire_retards_depart',
       'nb_train_retard_arrivee','retard_moyen_tous_trains_arrivee', 'commentaires_retard_arrivee',
       'nb_train_retard_sup_15', 'retard_moyen_trains_retard_sup15',
       'nb_train_retard_sup_30', 'nb_train_retard_sup_60',"nb_train_depart_retard"]

other_columns = ['nb_annulation', 'commentaire_annulation','duree_moyenne']

# "duree_moyenne" could be a useful feature with a few feature engineering 
# (estimating the mean for every line and add the value in a new feature)
# This could also be done with the "retard" features, but there are highly correlated to the target
# Maybe this could be done for the "cause" features?

def clean_dataset(df,other_columns,columns_retard):
    columns_to_remove = other_columns + columns_retard
    df = df.drop(columns_to_remove, axis=1)
    df = df.drop([2886,2889],axis = 0)  # Remove outliers isolated in the next cell
    df = df.reset_index(drop = True)
    for i, d in enumerate(df["date"].tolist()):  # Remove first lockdown
        month, day, year = d.split('/')
        if (int(year) == 2020 and int(month) in [3,4,5]):
              df = df.drop(i,axis = 0)
    return df

def check_errors(df,columns_cause):
    print("number of non-plausible values:")
    print(len(df[df["duree_moyenne"]<0]))
    print(len(df[df["nb_train_prevu"]<0]))
    print(len(df[df["retard_moyen_arrivee"]<0]))
    for col in columns_cause:
       print(len(df[(df[col]<0) | (df[col]>100)]))

check_errors(df,columns_cause)       
clean_df = clean_dataset(df,other_columns,columns_retard,)  # Test
# Dataset cleaning should be done on train and test set separately -> ensure reproducibility


number of non-plausible values:
0
0
2
0
0
0
0
0
0


In [8]:
df[df["retard_moyen_arrivee"]<0] #  Something happened this month?

# I dont' see any valuable reason that would explain these outlier, we can remove or impute them
# TODO check covid period and try impute the previous outlier

Unnamed: 0,date,service,gare_depart,gare_arrivee,duree_moyenne,nb_train_prevu,nb_annulation,commentaire_annulation,nb_train_depart_retard,retard_moyen_depart,...,nb_train_retard_sup_15,retard_moyen_trains_retard_sup15,nb_train_retard_sup_30,nb_train_retard_sup_60,prct_cause_externe,prct_cause_infra,prct_cause_gestion_trafic,prct_cause_materiel_roulant,prct_cause_gestion_gare,prct_cause_prise_en_charge_voyageurs
2886,11/1/2019,National,MONTPELLIER,PARIS LYON,380,227,11,,189,4.910406,...,44,34.677381,18,3,52.272727,13.636364,15.909091,15.909091,2.272727,0.0
2889,11/1/2019,National,NIMES,PARIS LYON,224,226,11,,190,8.765614,...,44,34.677381,18,3,46.774194,17.741935,14.516129,12.903226,3.225806,4.83871


## Train-test split

In [9]:
def get_train_test_set(df):
    train_idx = []
    test_idx = []
    for i, d in enumerate(df["date"].tolist()):
        month, day, year = d.split('/')
        if int(year)<2023:
            train_idx.append(i)
        else:
            test_idx.append(i)
    
    train_set = df.iloc[train_idx].copy(deep=True)
    test_set = df.iloc[test_idx].copy(deep=True)
    
    return train_set, test_set

trainset, testset = get_train_test_set(clean_df)

## Preprocessing (encoding/scaling)

In [10]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder,LabelBinarizer,OrdinalEncoder,MinMaxScaler,Normalizer,RobustScaler
from sklearn.feature_extraction import FeatureHasher

def manage_date_column(df):
    df['date'] = pd.to_datetime(df['date'])
    df['year'] = df['date'].dt.year
    df['month'] = df['date'].dt.month.apply(str).apply(lambda x:[x])
    df = df.drop("date",axis=1)
    return df

def preprocessing(df1,target,estimated_retard_moyen = False):
    df = df1.copy()
    df = manage_date_column(df)
    df["gare_arrivee"] = df["gare_arrivee"].apply(lambda x:[x])
    df["gare_depart"] = df["gare_depart"].apply(lambda x:[x])
    
    df["ligne"] = df.apply(lambda x:x["gare_arrivee"]+x["gare_depart"],axis = 1)
    df = df.drop(["gare_arrivee","gare_depart"],axis = 1)
    
    scaling_cols = ["nb_train_prevu"]
    hash_cols1 = "month"
    hash_cols2 = ["ligne"]  # "gare_depart","gare_arrivee"
    binarizer_cols = []
    onehot_cols = ["service"]
    ordinal_encode_cols = []
        
    if estimated_retard_moyen:
        scaling_cols += ["estimated_retard_moyen"]
        
    y = df[target] 
    X = df.drop(target,axis = 1)
           
    binarizer_transformer = Pipeline(steps=[
        ('binarizer',LabelBinarizer())])
    hash_transformer1 = Pipeline(steps=[
        ('hashing', FeatureHasher(n_features=4,input_type = "string"))]) # For month
    hash_transformer2 = Pipeline(steps=[
        ('hashing2', FeatureHasher(n_features=16,input_type = "string"))]) # For stations
    numeric_transformer = Pipeline(steps=[
        ('scaler', RobustScaler())])
    onehot_transformer = Pipeline(steps=[
        ('onehot', OneHotEncoder())])
    ordinal_encode_transformer = Pipeline(steps=[
        ('ordinal_encode', OrdinalEncoder())])

    transformers=[
            # ('cat', binarizer_transformer,binarizer_cols),
            ('hash', hash_transformer1, hash_cols1)]
    
    for i in range(len(hash_cols2)):
        transformers.append(('hash'+str(i), hash_transformer2, hash_cols2[i]))
     
    transformers += [('num', numeric_transformer, scaling_cols),
            ('one', onehot_transformer, onehot_cols),
            # ('ord', ordinal_encode_transformer, ordinal_encode_cols)
            ]

    # print(transformers) 
    preprocessor = ColumnTransformer(
        transformers=transformers
        #remainder = 'passthrough', # Will cause undesirerable columns to stay in X_transformed
        )
    
    X_transformed = preprocessor.fit_transform(X).todense()
    
    return X_transformed,y

target = "retard_moyen_arrivee"
# ['prct_cause_externe', 'prct_cause_infra', 'prct_cause_gestion_trafic','prct_cause_materiel_roulant', 'prct_cause_gestion_gare','prct_cause_prise_en_charge_voyageurs']

X_train,y_train = preprocessing(trainset,target)
X_test,y_test = preprocessing(testset,target)


In [11]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error,mean_absolute_error

model = LinearRegression()
model.fit(np.asarray(X_train),np.asarray(y_train))
y_pred = model.predict(np.asarray(X_test))

print(mean_absolute_error(y_pred,y_test))
print(mean_squared_error(y_test,y_pred))

10.600867455016962
345.20165594816774


# Models

## Import libraries 

In [12]:
from sklearn import neighbors

from sklearn.ensemble import AdaBoostRegressor, BaggingRegressor, ExtraTreesRegressor, GradientBoostingRegressor, RandomForestRegressor, StackingRegressor, VotingRegressor
from sklearn.linear_model import LinearRegression

from sklearn.svm import SVR

from sklearn.model_selection import KFold, GridSearchCV, cross_val_score, cross_validate

from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

## kNN
### Define variables

In [13]:
n_neighbors = 1000

X = np.asarray(X_train)
y = np.asarray(y_train)
T = np.asarray(X_test)

### Train and test model

In [14]:
model_list = []
scores_list = []

for i in range(n_neighbors):
    knn = neighbors.KNeighborsRegressor(i+1, weights='distance')
    scores = cross_val_score(knn, X, y, cv=5, scoring='neg_mean_squared_error')

    model_list.append(knn)
    scores_list.append(scores)

### Find best score

In [15]:
best_model = model_list[0]
best_score_mean = scores_list[0].mean()

for i in range(1, len(model_list)):
    score_mean = scores_list[i].mean()

    if score_mean > best_score_mean:
        best_score_mean = score_mean
        best_model = model_list[i]

print(best_model, best_score_mean)

KNeighborsRegressor(n_neighbors=290, weights='distance') -226.04317388223672


### Test with 5 neighbors

In [16]:
best_model.fit(X,y)
best_model.predict(T)
pred_knn = best_model.predict(T)
print("MSE: ",mean_squared_error(y_test, pred_knn))
print("MAE: ",mean_absolute_error(y_test, pred_knn))
print("R2: ",r2_score(y_test, pred_knn))

MSE:  340.13917802800484
MAE:  10.409088028485911
R2:  0.04695801864961835


## SVR
### Define model

In [17]:
svr = SVR()
kf = KFold(n_splits=5, shuffle=True, random_state=42)
grid_search = GridSearchCV(svr, param_grid={'kernel': ['linear', 'rbf', 'poly'], 'C': [0.1, 1, 10]}, cv=kf, scoring='neg_mean_squared_error')

X = np.asarray(X_train)
y = np.asarray(y_train)
T = np.asarray(X_test)

### Fit model

In [18]:
grid_search.fit(X, y)

GridSearchCV(cv=KFold(n_splits=5, random_state=42, shuffle=True),
             estimator=SVR(),
             param_grid={'C': [0.1, 1, 10],
                         'kernel': ['linear', 'rbf', 'poly']},
             scoring='neg_mean_squared_error')

### Calculate score
#### Define score model

In [19]:
best_params = grid_search.best_params_
best_svr = SVR(kernel=best_params['kernel'], C=best_params['C'])

#### Fit score model

In [20]:
best_svr.fit(X, y)

SVR(C=10)

#### Calculate prediction and deduce score

In [21]:
pred_svr = best_svr.predict(T)
print("MSE: ",mean_squared_error(y_test, pred_svr))
print("MAE: ",mean_absolute_error(y_test, pred_svr))
print("R2: ",r2_score(y_test, pred_svr))

MSE:  352.82268314889524
MAE:  10.40045922772035
R2:  0.01141988122904214


## Bagging, Random Forest, Extra Trees
### Utility functions

In [22]:
def calculate_metrics(y_true, y_pred):
    return r2_score(y_true, y_pred), mean_absolute_error(y_true, y_pred), mean_squared_error(y_true, y_pred)

def add_value_in_dict(dict_metrics, model_name, score_metrics):
    dict_metrics[model_name] = dict()
    dict_metrics[model_name]["r2_score"] = score_metrics[0]
    dict_metrics[model_name]["mean_absolute_error"] = score_metrics[1]
    dict_metrics[model_name]["mean_squared_error"] = score_metrics[2]

def values_from_cross_validate(model, X_train, y_train):
    scores = cross_validate(model, X_train, y_train, cv=5,
                                 scoring=('r2', 'neg_mean_absolute_error', 'neg_mean_squared_error'))
    
    del scores['fit_time']
    del scores['score_time']

    for key, value in scores.items():
        scores[key] = [scores[key].mean(), scores[key].std()]
    
    return scores

def grid_search(regressor, param_grid, X, y):
    gs = GridSearchCV(regressor, param_grid, cv=5, scoring=['r2', 'neg_mean_absolute_error', 'neg_mean_squared_error'], refit='neg_mean_squared_error', n_jobs=-1)
    gs.fit(X, y)

    best_params = gs.best_params_
    best_regressor = gs.best_estimator_
    
    print("Best Parameters: ", best_params)
    print("Best Score (neg_mean_squared_error): ", gs.best_score_)

    return best_regressor

### Params

In [23]:
param_grid_adaboost = {
    'n_estimators': [50, 100, 200],
    'learning_rate': [0.01, 0.1, 1.0],
    'loss': ['linear', 'square', 'exponential']
}
param_grid_bagging = {
    'n_estimators': [10, 50, 100],
    'max_samples': [0.5, 0.7, 1.0],
    'max_features': [0.5, 0.7, 1.0],
}
param_grid_extratrees = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': ['auto', 'sqrt', 'log2']
}
param_grid_gradientboosting = {
    'n_estimators': [50, 100, 200],
    'learning_rate': [0.01, 0.1, 0.2],
    'max_depth': [3, 4, 5],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}
param_grid_randomforest = {
    'n_estimators': [10, 50, 100, 200],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

dict_metrics = dict()

### GridSearchCV

In [24]:
model_adaboost = grid_search(AdaBoostRegressor(), param_grid_adaboost, X_train, y_train)
model_bagging = grid_search(BaggingRegressor(), param_grid_bagging, X_train, y_train)
model_extratrees = grid_search(ExtraTreesRegressor(), param_grid_extratrees, X_train, y_train)
model_gradientboosting = grid_search(GradientBoostingRegressor(), param_grid_gradientboosting, X_train, y_train)
model_randomforest = grid_search(RandomForestRegressor(), param_grid_randomforest, X_train, y_train)



Best Parameters:  {'learning_rate': 0.01, 'loss': 'exponential', 'n_estimators': 50}
Best Score (neg_mean_squared_error):  -198.13809528802622




Best Parameters:  {'max_features': 0.5, 'max_samples': 0.5, 'n_estimators': 100}
Best Score (neg_mean_squared_error):  -191.3773816199562




Best Parameters:  {'max_depth': 10, 'max_features': 'auto', 'min_samples_leaf': 4, 'min_samples_split': 10, 'n_estimators': 100}
Best Score (neg_mean_squared_error):  -185.40932423782485




Best Parameters:  {'learning_rate': 0.1, 'max_depth': 4, 'min_samples_leaf': 4, 'min_samples_split': 2, 'n_estimators': 50}
Best Score (neg_mean_squared_error):  -187.66725004138215




Best Parameters:  {'max_depth': 10, 'min_samples_leaf': 4, 'min_samples_split': 10, 'n_estimators': 50}
Best Score (neg_mean_squared_error):  -191.26520038067073


### Metrics

In [25]:
add_value_in_dict(dict_metrics, "adaboost", calculate_metrics(y_test, model_adaboost.predict(X_test)))
add_value_in_dict(dict_metrics, "bagging", calculate_metrics(y_test, model_bagging.predict(X_test)))
add_value_in_dict(dict_metrics, "extratrees", calculate_metrics(y_test, model_extratrees.predict(X_test)))
add_value_in_dict(dict_metrics, "gradientboosting", calculate_metrics(y_test, model_gradientboosting.predict(X_test)))
add_value_in_dict(dict_metrics, "randomforest", calculate_metrics(y_test, model_randomforest.predict(X_test)))



In [26]:
dict_metrics

{'adaboost': {'r2_score': 0.0328915073316971,
  'mean_absolute_error': 10.517961818278103,
  'mean_squared_error': 345.15949370247284},
 'bagging': {'r2_score': 0.05260202472105713,
  'mean_absolute_error': 10.314110416973085,
  'mean_squared_error': 338.12484117454943},
 'extratrees': {'r2_score': 0.0657835709648017,
  'mean_absolute_error': 10.061719654838843,
  'mean_squared_error': 333.42036813745136},
 'gradientboosting': {'r2_score': 0.06532150483035293,
  'mean_absolute_error': 10.041153296806657,
  'mean_squared_error': 333.5852788111063},
 'randomforest': {'r2_score': 0.06375881264350991,
  'mean_absolute_error': 10.153608757078258,
  'mean_squared_error': 334.14300118466895}}

### Analysis

On prend le modèle adaboost qui donne le meilleur MSE<br>
On rajoute les valeurs prédites au testset pour prédire les causes

In [27]:
testset["retard_moyen_arrivee"] = model_adaboost.predict(X_test)
trainset.rename(columns={"retard_moyen_arrivee": "estimated_retard_moyen"}, inplace=True)
testset.rename(columns={"retard_moyen_arrivee": "estimated_retard_moyen"}, inplace=True)



In [28]:
testset

Unnamed: 0,date,service,gare_depart,gare_arrivee,nb_train_prevu,estimated_retard_moyen,prct_cause_externe,prct_cause_infra,prct_cause_gestion_trafic,prct_cause_materiel_roulant,prct_cause_gestion_gare,prct_cause_prise_en_charge_voyageurs
7426,1/1/2023,International,PARIS EST,STUTTGART,150,31.448081,6.250000,3.125000,34.375000,21.875000,21.875000,12.500000
7427,1/1/2023,International,PARIS LYON,ZURICH,129,38.459434,15.000000,15.000000,35.000000,15.000000,5.000000,15.000000
7428,1/1/2023,National,BORDEAUX ST JEAN,PARIS MONTPARNASSE,1075,27.483933,26.811594,27.536232,16.666667,11.594203,5.072464,12.318841
7429,1/1/2023,National,CHAMBERY CHALLES LES EAUX,PARIS LYON,334,38.459434,23.943662,29.577465,36.619718,2.816901,0.000000,7.042254
7430,1/1/2023,National,MACON LOCHE,PARIS LYON,252,38.459434,17.391304,24.637681,42.028986,5.797101,0.000000,10.144928
...,...,...,...,...,...,...,...,...,...,...,...,...
8147,6/1/2023,National,STRASBOURG,PARIS EST,492,30.987977,7.619048,5.714286,55.238095,18.095238,7.619048,5.714286
8148,6/1/2023,National,TOULOUSE MATABIAU,PARIS MONTPARNASSE,215,34.357469,13.888889,33.333333,8.333333,19.444444,8.333333,16.666667
8149,6/1/2023,National,TOURS,PARIS MONTPARNASSE,192,27.739120,14.285714,21.428571,28.571429,21.428571,3.571429,10.714286
8150,6/1/2023,National,VALENCE ALIXAN TGV,PARIS LYON,440,38.459434,26.724138,19.827586,27.586207,7.758621,7.758621,10.344828


In [29]:
target = ['prct_cause_externe', 'prct_cause_infra', 'prct_cause_gestion_trafic','prct_cause_materiel_roulant', 'prct_cause_gestion_gare','prct_cause_prise_en_charge_voyageurs']

X_train,y_train = preprocessing(trainset,target,estimated_retard_moyen=True)
X_test,y_test = preprocessing(testset,target,estimated_retard_moyen=True)

On veut prédire 6 valeurs (les valeurs de probabilité des causes de retard), cependant nous ne pouvons pas utiliser les régresseurs AdaBoost et GradientBoosting.

In [30]:
dict_metrics_cause = dict()

In [31]:
model_cause_bagging = grid_search(BaggingRegressor(), param_grid_bagging, X_train, y_train)
model_cause_extratrees = grid_search(ExtraTreesRegressor(), param_grid_extratrees, X_train, y_train)
model_cause_randomforest = grid_search(RandomForestRegressor(), param_grid_randomforest, X_train, y_train)



Best Parameters:  {'max_features': 0.5, 'max_samples': 0.5, 'n_estimators': 100}
Best Score (neg_mean_squared_error):  -174.64071509443676




Best Parameters:  {'max_depth': 20, 'max_features': 'log2', 'min_samples_leaf': 4, 'min_samples_split': 2, 'n_estimators': 200}
Best Score (neg_mean_squared_error):  -173.81416545860094




Best Parameters:  {'max_depth': 10, 'min_samples_leaf': 4, 'min_samples_split': 10, 'n_estimators': 200}
Best Score (neg_mean_squared_error):  -174.85864480093005


In [32]:
add_value_in_dict(dict_metrics_cause, "bagging", calculate_metrics(y_test, model_cause_bagging.predict(X_test)))
add_value_in_dict(dict_metrics_cause, "extratrees", calculate_metrics(y_test, model_cause_extratrees.predict(X_test)))
add_value_in_dict(dict_metrics_cause, "randomforest", calculate_metrics(y_test, model_cause_randomforest.predict(X_test)))



In [33]:
dict_metrics_cause

{'bagging': {'r2_score': -0.013659792648185934,
  'mean_absolute_error': 8.937787399905917,
  'mean_squared_error': 148.2577293229293},
 'extratrees': {'r2_score': 0.04000825945446127,
  'mean_absolute_error': 8.656157436636063,
  'mean_squared_error': 139.7035674434797},
 'randomforest': {'r2_score': 0.008658729290887623,
  'mean_absolute_error': 8.859132489640537,
  'mean_squared_error': 144.7905826982563}}

Le modèle donnant les meilleurs résultats est celui de l'ExtraTrees