### Importation des librairies et des fichiers :

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
X_test_multi = pd.read_csv("X_test_bi.csv")
X_train_multi = pd.read_csv("X_train_bi.csv")

In [3]:
X_test_multi.head()
X_test_multi.drop(columns = "Unnamed: 0", inplace = True)

In [4]:
X_train_multi.head()
X_train_multi.drop(columns = "Unnamed: 0", inplace = True)

In [5]:
X_test_multi.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 308327 entries, 0 to 308326
Data columns (total 23 columns):
 #   Column                                              Non-Null Count   Dtype  
---  ------                                              --------------   -----  
 0   Easting_rounded                                     308327 non-null  float64
 1   Northing_rounded                                    308327 non-null  float64
 2   DeployedFromStation_Name                            308327 non-null  float64
 3   Meteo_encoded                                       308327 non-null  float64
 4   Visibility_encoded                                  308327 non-null  float64
 5   StopCodeDescription_Chimney Fire                    308327 non-null  float64
 6   StopCodeDescription_False alarm - Good intent       308327 non-null  float64
 7   StopCodeDescription_False alarm - Malicious         308327 non-null  float64
 8   StopCodeDescription_Late Call                       308327 non-n

In [7]:
X_train_multi.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1233308 entries, 0 to 1233307
Data columns (total 23 columns):
 #   Column                                              Non-Null Count    Dtype  
---  ------                                              --------------    -----  
 0   Easting_rounded                                     1233308 non-null  float64
 1   Northing_rounded                                    1233308 non-null  float64
 2   DeployedFromStation_Name                            1233308 non-null  float64
 3   Meteo_encoded                                       1233308 non-null  float64
 4   Visibility_encoded                                  1233308 non-null  float64
 5   StopCodeDescription_Chimney Fire                    1233308 non-null  float64
 6   StopCodeDescription_False alarm - Good intent       1233308 non-null  float64
 7   StopCodeDescription_False alarm - Malicious         1233308 non-null  float64
 8   StopCodeDescription_Late Call                       

In [8]:
y_test_reg = pd.read_csv("y_test_reg.csv")
y_train_reg = pd.read_csv("y_train_reg.csv")

In [14]:
y_test_reg.head()
y_test_reg.drop(columns = "Unnamed: 0", inplace = True)

In [17]:
y_train_reg.head()
y_train_reg.drop(columns = "Unnamed: 0", inplace = True)

In [19]:
y_test_reg.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 308327 entries, 0 to 308326
Data columns (total 1 columns):
 #   Column                            Non-Null Count   Dtype
---  ------                            --------------   -----
 0   FirstPumpArriving_AttendanceTime  308327 non-null  int64
dtypes: int64(1)
memory usage: 2.4 MB


In [21]:
y_train_reg.head()

Unnamed: 0,FirstPumpArriving_AttendanceTime
0,354
1,155
2,445
3,266
4,524


### Création des intervalles de notre variable cible :

In [23]:
y_test_multi = pd.cut(y_test_reg.iloc[:,0], bins = [0,240,300,390, 780], labels = ["4min ou moins", "4 à 5min","5 à 6min30","6min30 à 13minutes"])

#### Vérification de notre équilibre de classe dans notre variable cible :

In [25]:
y_test_multi.value_counts()

FirstPumpArriving_AttendanceTime
4min ou moins         88358
5 à 6min30            81479
4 à 5min              71536
6min30 à 13minutes    66954
Name: count, dtype: int64

In [27]:
y_train_multi = pd.cut(y_train_reg.iloc[:,0], bins = [0,240,300,390, 780], labels = ["4min ou moins", "4 à 5min","5 à 6min30","6min30 à 13minutes"])

In [29]:
y_train_multi.value_counts()

FirstPumpArriving_AttendanceTime
4min ou moins         353023
5 à 6min30            324916
4 à 5min              286944
6min30 à 13minutes    268425
Name: count, dtype: int64

## Optimisation du Decision Tree Classifier à l'aide d'un GridSearch et d'une validation croisée :

In [31]:
from sklearn.tree import DecisionTreeClassifier 
from sklearn.model_selection import GridSearchCV
param_grid = {'max_depth': [5, 10, 20],
    'min_samples_split': [2, 5, 10]}

grid = GridSearchCV(DecisionTreeClassifier(random_state=42), param_grid=param_grid, cv=5, scoring='f1_macro')

grid.fit(X_train_multi, y_train_multi)
print(grid.best_params_)
print(grid.best_score_)


{'max_depth': 20, 'min_samples_split': 10}
0.4002391975041949


In [33]:
# Modèle relancé avec les paramètres donnés par le GridSearchCV :
DC = DecisionTreeClassifier(random_state = 42, max_depth = 20, min_samples_split = 10)
DC.fit(X_train_multi, y_train_multi)

In [37]:
from sklearn.metrics import classification_report
predictionDC = DC.predict(X_test_multi)
print(classification_report(y_test_multi, predictionDC))
display(pd.crosstab(y_test_multi, predictionDC, rownames = ['Réalité'], colnames = ['Prediciton']))

                    precision    recall  f1-score   support

          4 à 5min       0.33      0.25      0.29     71536
     4min ou moins       0.49      0.62      0.55     88358
        5 à 6min30       0.36      0.39      0.38     81479
6min30 à 13minutes       0.47      0.36      0.41     66954

          accuracy                           0.42    308327
         macro avg       0.41      0.41      0.40    308327
      weighted avg       0.41      0.42      0.41    308327



Prediciton,4 à 5min,4min ou moins,5 à 6min30,6min30 à 13minutes
Réalité,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
4min ou moins,13966,55139,13902,5351
4 à 5min,18068,26271,20478,6719
5 à 6min30,15096,19348,32110,14925
6min30 à 13minutes,8013,12206,22669,24066


### Création d'un nouveau jeu d'entrainement et de test en ne gardant que nos 6 variables les plus importantes définies précédemment :

In [31]:
X_train_multi_new = X_train_multi[['Easting_rounded', 'Northing_rounded', 'Meteo_encoded','DeployedFromStation_Name','Hour','Year']]
X_test_multi_new = X_test_multi[['Easting_rounded', 'Northing_rounded', 'Meteo_encoded','DeployedFromStation_Name','Hour','Year']]

## Modèle Décision Tree Classifier relancé uniquemen avec les 6 variables :

In [33]:
DC2 = DecisionTreeClassifier(random_state = 42)
DC2.fit(X_train_multi_new, y_train_multi)

In [37]:
from sklearn.metrics import classification_report
predictionDC2 = DC2.predict(X_test_multi_new)
print(classification_report(y_test_multi, predictionDC2))
display(pd.crosstab(y_test_multi, predictionDC2, rownames = ['Réalité'], colnames = ['Prediciton']))

                    precision    recall  f1-score   support

          4 à 5min       0.32      0.33      0.32     71536
     4min ou moins       0.54      0.54      0.54     88358
        5 à 6min30       0.36      0.36      0.36     81479
6min30 à 13minutes       0.43      0.42      0.42     66954

          accuracy                           0.42    308327
         macro avg       0.41      0.41      0.41    308327
      weighted avg       0.42      0.42      0.42    308327



Prediciton,4 à 5min,4min ou moins,5 à 6min30,6min30 à 13minutes
Réalité,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
4min ou moins,20246,47429,12860,7823
4 à 5min,23703,19471,18594,9768
5 à 6min30,20020,13073,29117,19269
6min30 à 13minutes,10695,8439,19897,27923


## Modèle Decision Tree relancé avec le GridSearch et une validation croisée sur seulement nos 6 variables les plus importantes :

In [39]:
from sklearn.tree import DecisionTreeClassifier 
from sklearn.model_selection import GridSearchCV

param_grid2 = {'max_depth': [5, 10, 20],
    'min_samples_split': [2, 5, 10]}

grid2 = GridSearchCV(
    DecisionTreeClassifier(random_state=42),
    param_grid=param_grid2,
    cv=5,
    scoring='f1_macro'
)

grid2.fit(X_train_multi_new, y_train_multi)

print("Meilleurs paramètres :", grid2.best_params_)
print("Meilleur F1-score macro :", grid2.best_score_)

Meilleurs paramètres : {'max_depth': 20, 'min_samples_split': 10}
Meilleur F1-score macro : 0.4177930498578671


In [43]:
from sklearn.metrics import classification_report
predictionDC3 = grid2.predict(X_test_multi_new)
print(classification_report(y_test_multi, predictionDC3))
display(pd.crosstab(y_test_multi, predictionDC3, rownames = ['Réalité'], colnames = ['Prediciton']))

                    precision    recall  f1-score   support

          4 à 5min       0.34      0.28      0.31     71536
     4min ou moins       0.51      0.65      0.57     88358
        5 à 6min30       0.38      0.41      0.39     81479
6min30 à 13minutes       0.51      0.37      0.43     66954

          accuracy                           0.44    308327
         macro avg       0.43      0.43      0.43    308327
      weighted avg       0.44      0.44      0.43    308327



Prediciton,4 à 5min,4min ou moins,5 à 6min30,6min30 à 13minutes
Réalité,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
4min ou moins,14671,57325,12117,4245
4 à 5min,20171,25294,20374,5697
5 à 6min30,16342,17806,33383,13948
6min30 à 13minutes,8189,11030,23059,24676


## Gradient Boosting Classifier :

In [33]:
from sklearn.ensemble import GradientBoostingClassifier


In [35]:
gb_model = GradientBoostingClassifier(
    n_estimators=10, learning_rate=0.1, max_depth=5, random_state=42)

gb_model.fit(X_train_multi_new, y_train_multi)


In [37]:
from sklearn.metrics import classification_report, confusion_matrix

pred_GB = gb_model.predict(X_test_multi_new)

print(classification_report(y_test_multi, pred_GB))
print(confusion_matrix(y_test_multi, pred_GB))


                    precision    recall  f1-score   support

          4 à 5min       0.35      0.02      0.03     71536
     4min ou moins       0.37      0.66      0.48     88358
        5 à 6min30       0.32      0.41      0.36     81479
6min30 à 13minutes       0.39      0.24      0.29     66954

          accuracy                           0.35    308327
         macro avg       0.36      0.33      0.29    308327
      weighted avg       0.36      0.35      0.30    308327

[[ 1163 39458 24046  6869]
 [  776 58430 21965  7187]
 [ 1011 35420 33775 11273]
 [  363 23330 27301 15960]]


### Impossibilité de lancer ces deux cellules par manque de puissance de nos ordinateurs (modèle impossible à exploiter) : 

In [None]:
from sklearn.model_selection import cross_val_score

scores = cross_val_score(gb_model, X_train_multi_new, y_train_multi, cv=5, scoring='f1_macro')
print("Score F1-macro moyen :", scores.mean())


In [None]:
from sklearn.model_selection import GridSearchCV

param_grid = {
    'n_estimators': [100, 200],
    'learning_rate': [0.05, 0.1],
    'max_depth': [3, 5]
}

grid = GridSearchCV(
    GradientBoostingClassifier(random_state=42),
    param_grid=param_grid,
    cv=3,
    scoring='f1_macro'
)

grid.fit(X_train_multi_new, y_train_multi)
print("Meilleurs paramètres :", grid.best_params_)
print("Meilleur score F1-macro :", grid.best_score_)
