In [20]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split


In [21]:
X_TEST = pd.read_csv('Xtest.csv')
X = pd.read_csv('Xtrain_hgcGIrA.csv')
Y = pd.read_csv('Ytrain_yL5OjS4.csv')

In [22]:
X_train, X_test, y_train, y_test = train_test_split(X, Y, train_size=.8)

$\textbf{1)}$ Pré-traitement des données

In [23]:
test = pd.concat([X_test,y_test], axis = 1)
train = pd.concat([X_train,y_train],axis = 1)

In [24]:
cols_with_missings = [col for col in train.columns if train[col].isnull().any()]


In [25]:
train['hour'] = pd.to_datetime(train['hour'], format='%H:%M:%S')
train['hour'] = train['hour'].dt.hour

test['hour'] = pd.to_datetime(test['hour'], format='%H:%M:%S')
test['hour'] = test['hour'].dt.hour

X_TEST['hour'] = pd.to_datetime(X_TEST['hour'], format='%H:%M:%S')
X_TEST['hour'] = X_TEST['hour'].dt.hour

In [26]:
#Pour les stations on remplace les valeurs manquantes par 0
train[cols_with_missings[1:]] = train[cols_with_missings[1:]].fillna(0)
test[cols_with_missings[1:]] = test[cols_with_missings[1:]].fillna(0)
X_TEST[cols_with_missings[1:]] = X_TEST[cols_with_missings[1:]].fillna(0)

#Pour l'heure on prend la partie entière de la médiane
from sklearn.impute import SimpleImputer
imputer = SimpleImputer(strategy = 'median')

train['hour'] = imputer.fit_transform(train[['hour']])
train['hour'] = train['hour'].astype(int)

test['hour'] = imputer.transform(test[['hour']])
test['hour'] = test['hour'].astype(int)

X_TEST['hour'] = imputer.transform(X_TEST[['hour']])
X_TEST['hour'] = X_TEST['hour'].astype(int)


In [27]:
#On va maintenant encoder la varible station en utilisant un simple label encoder
from sklearn.preprocessing import LabelEncoder

encoder = LabelEncoder()

train['station'] = encoder.fit_transform(train['station']).astype(int)
test['station'] = encoder.transform(test['station']).astype(int)
X_TEST['station'] = encoder.transform(X_TEST['station']).astype(int)

$\textbf{2)}$ Sélection du modèle

In [28]:
#Séparation des variables explicatives et de la variable cible
explanatory_varibales = ['station', 'hour', 'p1q0', 'p2q0', 'p3q0', 'p0q1', 'p0q2', 'p0q3']
X_train = train.loc[:, explanatory_varibales]
X_test = test.loc[:, explanatory_varibales]
X_TEST = X_TEST.loc[:, explanatory_varibales]

y_train = train.loc[:,'p0q0']
y_test = test.loc[:,'p0q0']

In [29]:
#Import des différents modèles que nous allons tester

from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import GradientBoostingRegressor,RandomForestRegressor
from sklearn.model_selection import cross_val_score
from sklearn.metrics import make_scorer, mean_absolute_percentage_error, mean_absolute_error


In [30]:
models = {
    'SVR': SVR(),
    'Decision Tree': DecisionTreeRegressor(),
    'K Neighbors': KNeighborsRegressor(),
    'Linear Regression': LinearRegression(),
    'Gradient Boosting': GradientBoostingRegressor(),
    'Random Forest' : RandomForestRegressor()
}



In [31]:
#Procédure d'évaluation des mdèles
import numpy as np

MAPE = make_scorer(mean_absolute_percentage_error)
MAE = make_scorer(mean_absolute_error)

def Evaluation(models, X_train, y_train, metric=MAPE):
    results = {}
    for name, model in models.items():
        score = cross_val_score(model, X_train, y_train, cv=5, scoring=metric)
        results[name] = {
            'mean_score': score.mean(),
            'std_score': score.std()
        }
        print(f"{name}: {results[name]['mean_score']:.2f} +/- {results[name]['std_score']:.2f}")
        break
    
    return results

In [32]:
Evaluation(models,  X_train, y_train, metric = MAE)

SVR: 0.05 +/- 0.00


{'SVR': {'mean_score': 0.0470382056938425,
  'std_score': 0.00010939997219629958}}

$\textbf{3)}$ Optimisation du modèle

In [33]:
import optuna
from optuna import trial

def objective(trial):
    param = {
        'n_estimators': trial.suggest_int('n_estimators', 100, 500),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3),
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'min_samples_split': trial.suggest_int('min_samples_split', 2, 10)
    }

    # Instanciation et entraînement du modèle
    model = GradientBoostingRegressor(**param, random_state=0)
    model.fit(X_train.iloc[:,1:], y_train)
    y_pred = model.predict(X_test.iloc[:,1:])

    error = mean_absolute_error(y_test, y_pred)

    return error

study = optuna.create_study()  
study.optimize(objective, n_trials=20, n_jobs = -1) 

[I 2023-11-12 19:59:20,964] A new study created in memory with name: no-name-0dda9e92-a370-4b51-bf4e-340beae83e71
[I 2023-11-12 19:59:44,824] Trial 2 finished with value: 0.018288788763413028 and parameters: {'n_estimators': 159, 'learning_rate': 0.10629127546981113, 'max_depth': 5, 'min_samples_split': 10}. Best is trial 2 with value: 0.018288788763413028.
[I 2023-11-12 19:59:53,755] Trial 3 finished with value: 0.014155238988268475 and parameters: {'n_estimators': 108, 'learning_rate': 0.14721131878892732, 'max_depth': 9, 'min_samples_split': 2}. Best is trial 3 with value: 0.014155238988268475.
[I 2023-11-12 20:00:03,824] Trial 4 finished with value: 0.023915766745867845 and parameters: {'n_estimators': 317, 'learning_rate': 0.01995058723840638, 'max_depth': 4, 'min_samples_split': 9}. Best is trial 3 with value: 0.014155238988268475.
[I 2023-11-12 20:00:04,449] Trial 0 finished with value: 0.014304712331838545 and parameters: {'n_estimators': 156, 'learning_rate': 0.146257058186087

In [None]:
display(study.best_trial)

FrozenTrial(number=6, state=1, values=[0.08763875388976404], datetime_start=datetime.datetime(2023, 11, 12, 0, 27, 53, 833251), datetime_complete=datetime.datetime(2023, 11, 12, 0, 28, 12, 382688), params={'n_estimators': 190, 'learning_rate': 0.22888401338267433, 'max_depth': 3, 'min_samples_split': 8}, user_attrs={}, system_attrs={}, intermediate_values={}, distributions={'n_estimators': IntDistribution(high=500, log=False, low=100, step=1), 'learning_rate': FloatDistribution(high=0.3, log=False, low=0.01, step=None), 'max_depth': IntDistribution(high=10, log=False, low=3, step=1), 'min_samples_split': IntDistribution(high=10, log=False, low=2, step=1)}, trial_id=6, value=None)

In [None]:
params = study.best_params

{'n_estimators': 190,
 'learning_rate': 0.22888401338267433,
 'max_depth': 3,
 'min_samples_split': 8}

In [None]:
model = GradientBoostingRegressor(**params)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

print(f'Erreur finale sur le jeu de test issue de train_test_split: {mean_absolute_error(y_test, y_pred):.2f}')

Erreur finale sur le jeu de test: 0.09


$\textbf{4)}$ Entrainement sur le jeu tout entier

In [None]:
X_train_full = pd.concat([X_train,X_test], axis = 0)
y_train_full = pd.concat([y_train,y_test], axis = 0)

model = GradientBoostingRegressor(**params)
model.fit(X_train_full, y_train_full)
predictions = model.predict(X_TEST)

In [None]:
soumission = pd.DataFrame({"p0q0" : predictions })

soumission.index = np.arange(1,len(soumission)+1)
display(soumission)
soumission.to_csv("soumission.csv")