# Importation des données

In [48]:
from google.colab import drive
# Mount Google Drive at /content/drive; the CSV files read below live under that mount point.
drive.mount('/content/drive')

import pandas as pd
import sklearn
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [49]:
# Load the three CSV files, in this order: training set, validation test set,
# final test set.
data_train, data_test_val, data_test_final = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/content/drive/MyDrive/ENPC/2A/HACKATHON 2/waiting_times_train.csv',
        '/content/drive/MyDrive/ENPC/2A/HACKATHON 2/waiting_times_X_test_val.csv',
        '/content/drive/MyDrive/ENPC/2A/HACKATHON 2/waiting_times_X_test_final.csv',
    )
)

Pré-processing

In [50]:
def map_hour_to_moment(hour):
    """Return the label of the time-of-day bucket that contains `hour`.

    Buckets are defined by inclusive upper bounds, checked in increasing order:
    <=11 "morning", <=12 "morning2", <=13 "lunch", <=14 "lunch2",
    <=17 "afternoon", <=19 "afternoon2", <=21 "evening1".
    Any value that satisfies none of these comparisons (larger hours, but also
    NaN) is labelled "evening2".
    """
    upper_bounds = (
        (11, "morning"),
        (12, "morning2"),
        (13, "lunch"),
        (14, "lunch2"),
        (17, "afternoon"),
        (19, "afternoon2"),
        (21, "evening1"),
    )
    for limit, label in upper_bounds:
        if hour <= limit:
            return label
    return "evening2"

def preprocess_dataframe(df):
    """Build the model feature table from a raw waiting-times frame.

    The input frame is not modified. The returned frame contains:
      * every input column except DATETIME, TIME_TO_PARADE_1 and
        TIME_TO_PARADE_2, with ENTITY_DESCRIPTION_SHORT one-hot encoded;
      * 'samedi', 'dimanche', 'workday': 0/1 flags derived from the weekday
        of DATETIME (5, 6 and <= 4 respectively);
      * 'month': month number of DATETIME;
      * one 'moment_<label>' dummy column per time-of-day label produced by
        map_hour_to_moment.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain DATETIME (parseable by pd.to_datetime),
        ENTITY_DESCRIPTION_SHORT, TIME_TO_PARADE_1 and TIME_TO_PARADE_2.

    Returns
    -------
    pandas.DataFrame
        A new frame holding the engineered features.
    """
    # Labels map_hour_to_moment is known to return, in the alphabetical order
    # pd.get_dummies would give them. Declaring them up front makes every
    # frame get the same moment_* columns, even when a period of the day does
    # not occur in it (otherwise train and test frames could end up with
    # different feature columns).
    known_moments = (
        "afternoon", "afternoon2", "evening1", "evening2",
        "lunch", "lunch2", "morning", "morning2",
    )

    # Parse into a local Series instead of writing back into `df`, so the
    # caller's frame is left untouched.
    timestamps = pd.to_datetime(df['DATETIME'])
    weekday = timestamps.dt.weekday

    # get_dummies returns a new frame; all further columns are added to it.
    features = pd.get_dummies(df, columns=['ENTITY_DESCRIPTION_SHORT'])

    features['samedi'] = (weekday == 5).astype(int)
    features['dimanche'] = (weekday == 6).astype(int)
    features['workday'] = (weekday <= 4).astype(int)

    features['month'] = timestamps.dt.month

    moments = timestamps.dt.hour.map(map_hour_to_moment)
    # Union with the observed labels so nothing is lost should
    # map_hour_to_moment ever return a label missing from known_moments.
    categories = sorted(set(known_moments).union(moments.dropna().unique()))
    features['moment'] = pd.Categorical(moments, categories=categories)
    features = pd.get_dummies(features, columns=['moment'])

    return features.drop(['DATETIME', 'TIME_TO_PARADE_1', 'TIME_TO_PARADE_2'], axis=1)

# Run the same feature engineering on the training, validation and final frames.
data_train, data_test_val, data_test_final = (
    preprocess_dataframe(raw_frame)
    for raw_frame in (data_train, data_test_val, data_test_final)
)

In [51]:
# Standardise the continuous features
from sklearn.preprocessing import StandardScaler

# Columns to scale: the float64/int64 columns of the training frame, minus the
# 0/1 weekday flags and the target column.
continuous_columns = data_train.select_dtypes(include=['float64', 'int64']).columns
continuous_columns = continuous_columns.drop(
    ['workday', 'dimanche', 'samedi', 'WAIT_TIME_IN_2H']
)

# Fit the scaler on the training set only, then reuse the fitted scaler on both
# test sets. (The scaler was previously also passed as the `y` argument of
# fit(); that argument is not a target here and has been removed.)
scaler = StandardScaler()
data_train[continuous_columns] = scaler.fit_transform(data_train[continuous_columns])
data_test_val[continuous_columns] = scaler.transform(data_test_val[continuous_columns])
data_test_final[continuous_columns] = scaler.transform(data_test_final[continuous_columns])

## GridSearchCV avec XGBRegressor


In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV
import xgboost as xgb

In [None]:
# Hyper-parameter grid explored by the search (3 values for each of the
# 5 parameters, i.e. 243 combinations)
param_grid = {
    'learning_rate': [0.05, 0.2, 0.3],
    'max_depth': [3, 7, 10],
    'min_child_weight': [1, 2, 5],
    'subsample': [0.7, 0.8, 0.9],
    'colsample_bytree': [0.6, 0.8, 1.0]
}

model = xgb.XGBRegressor()

# Split features / target
X = data_train.drop(columns=['WAIT_TIME_IN_2H'])
y = data_train['WAIT_TIME_IN_2H']

# Keep 15% of the training data aside to evaluate the selected model
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15,
                                                    random_state=46)

# Search the best hyper-parameters with 5-fold cross-validation
grid_search = GridSearchCV(model, param_grid, cv=5,
                           scoring='neg_mean_squared_error', verbose=2,
                           n_jobs=-1)
grid_search.fit(X_train, y_train)
print("Meilleurs hyperparamètres:", grid_search.best_params_)

# GridSearchCV is used with its default refit=True, so best_estimator_ has
# already been refitted on the whole of X_train with the best parameters.
# Fitting it a second time would only repeat that work.
best_model = grid_search.best_estimator_

# Evaluate on the held-out split; negative predictions are clamped to zero
# before computing the RMSE.
y_pred = best_model.predict(X_test)
y_pred_relu = np.maximum(y_pred, 0)

rmse = np.sqrt(mean_squared_error(y_test, y_pred_relu))
print("Root Mean Squared Error:", rmse)

Fitting 5 folds for each of 243 candidates, totalling 1215 fits
Meilleurs hyperparamètres: {'colsample_bytree': 0.8, 'learning_rate': 0.05, 'max_depth': 10, 'min_child_weight': 1, 'subsample': 0.8}
Root Mean Squared Error: 8.198031909934146


In [52]:
# Predict on the validation test set and clamp negative predictions to zero.
# NOTE: despite its name, y_pred_rounded_val is not rounded; it refers to the
# very same array as y_pred_relu_test_val.
y_pred_test_val = best_model.predict(data_test_val)
y_pred_rounded_val = y_pred_relu_test_val = np.maximum(0, y_pred_test_val)
print(y_pred_rounded_val)

[36.2329   33.800632 32.133327 ... 23.395597 19.244501 16.682835]


In [53]:
# Predict on the final test set and clamp negative predictions to zero.
# NOTE: despite its name, y_pred_rounded_final is not rounded; it refers to the
# very same array as y_pred_relu_test_final.
y_pred_test_final = best_model.predict(data_test_final)
y_pred_rounded_final = y_pred_relu_test_final = np.maximum(0, y_pred_test_final)
print(y_pred_rounded_final)

[41.536366 41.59948  24.541935 ...  9.219238 14.484668 24.01054 ]


## Mettre la solution sous le bon format (en général) et la télécharger

In [56]:
# Reload the raw test CSVs: the solution files are built from their original
# DATETIME and ENTITY_DESCRIPTION_SHORT columns.
unprocessed_data_test_val, unprocessed_data_test_final = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/content/drive/MyDrive/ENPC/2A/HACKATHON 2/waiting_times_X_test_val.csv',
        '/content/drive/MyDrive/ENPC/2A/HACKATHON 2/waiting_times_X_test_final.csv',
    )
)


def to_solution_format(data, predicted_wait_times, final=False):
    """Assemble a submission table from raw test rows and their predictions.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame providing the "DATETIME" and "ENTITY_DESCRIPTION_SHORT" columns.
    predicted_wait_times : array-like
        Predicted values, stored in the "y_pred" column.
    final : bool, default False
        Selects the value written in the "KEY" column: the fixed key string
        below when truthy, "Validation" otherwise.

    Returns
    -------
    pandas.DataFrame
        Columns "DATETIME", "ENTITY_DESCRIPTION_SHORT", "y_pred" and "KEY".
    """
    # NOTE(review): the final-submission key is hard-coded here; if it is meant
    # to stay private, confirm before sharing this notebook.
    key_value = (
        "c57d53a31f68e864e929524b80c3dfe31190a5e431187fa12f"
        if final
        else "Validation"
    )
    columns = {
        "DATETIME": data["DATETIME"],
        "ENTITY_DESCRIPTION_SHORT": data["ENTITY_DESCRIPTION_SHORT"],
        "y_pred": predicted_wait_times,
        "KEY": key_value,
    }
    return pd.DataFrame(columns)

# Build both submission tables; change the first argument of
# to_solution_format to create a different solution.
sol_val = to_solution_format(unprocessed_data_test_val, y_pred_rounded_val)
sol_final = to_solution_format(unprocessed_data_test_final, y_pred_rounded_final, final=True)

# Write each table to its CSV file in the current working directory.
for solution, filename in (
    (sol_val, "groupeE_xg_boost_val.csv"),
    (sol_final, "groupeE_xg_boost_final.csv"),
):
    solution.to_csv(filename, sep=',', index=False, encoding='utf-8')

In [57]:
#import pickle

#with open('best_model.pickle', 'wb') as handle:
#    pickle.dump(best_model, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
#with open('best_model.pickle', 'rb') as handle:
#    loaded_model = pickle.load(handle)