# Fin du prétraitement : des inputs de même longueur et prêts pour le CNN

Pour pouvoir exécuter notre réseau de neurone (CNN), nous avons besoin de traiter encore nos données. 

En effet, notre modèle va apprendre les caractéristiques générales de plusieurs vols entre deux évenèments (WW ou SV), pour chaque avion. Cependant, pour que nous puissions effectuer un réseau de neurones, nous avons besoin d'avoir des intervalles "de même longueur", c'est à dire avec le même nombre de vols pour chaque intervalles. Pour l'instant, certains avions ont plus de 9000 vols entre deux évènements tandis que d'autres en ont très peu, et c'est pourquoi nous devons les traiter.

In [3]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler

In [2]:
#Importation des données pré-traitées
path_df = r'D:/Données/ENSAE/2A/S2/Séminaire de modélisation statistique/pretraitement.csv'
safran=pd.read_csv(path_df ,sep=',', encoding='latin-1')

In [3]:
safran

Unnamed: 0.1,date,Unnamed: 0,engine_serial_number,engine_family,engine_series,cycles,cycles_counter,egt_margin,var_mot_1,flight_leg_hours,...,Interpolate_flight_leg_hours,Interpolate_SV_rank,Interpolate_Config_B_rank,Interpolate_WW_rank,Interpolate_var_env_1,Interpolate_var_env_2,Interpolate_var_env_3,Interpolate_var_env_4,Interpolate_var_env_5,Interpolate_egt_slope
0,2019-04-29 06:29:58,1,ESN_1,Engine_family_1,Engine_series_1,14.699402,14,0.881646,-0.313549,0.857778,...,0.857778,0.0,0.0,0.0,-0.261068,0.193871,0.448627,0.0,0.601803,-0.029193
1,2019-04-29 08:10:00,2,ESN_1,Engine_family_1,Engine_series_1,15.284274,15,0.792029,0.006330,0.794167,...,0.794167,0.0,0.0,0.0,-0.064202,0.273855,1.500848,0.0,-1.056965,-0.029193
2,2019-04-29 09:55:00,3,ESN_1,Engine_family_1,Engine_series_1,15.898185,16,0.706729,-0.286324,0.736667,...,0.736667,0.0,0.0,0.0,-0.292673,0.193871,0.764293,0.0,0.149412,-0.029193
3,2019-04-29 11:36:53,4,ESN_1,Engine_family_1,Engine_series_1,16.493874,17,0.702078,0.430174,0.802500,...,0.802500,0.0,0.0,0.0,0.070056,0.273855,1.500848,0.0,-1.056965,-0.029193
4,2019-04-30 04:28:40,5,ESN_1,Engine_family_1,Engine_series_1,22.409543,18,0.645941,0.299420,0.817500,...,0.817500,0.0,0.0,0.0,-0.463185,0.193871,0.448627,0.0,0.601803,-0.029193
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2450270,2022-10-26 07:39:15,2911298,ESN_1369,Engine_family_1,Engine_series_6,34.993550,50,0.531868,-0.731730,2.654444,...,2.654444,0.0,0.0,0.0,0.728625,-0.356159,-0.708817,0.0,-1.207762,0.000968
2450271,2022-10-26 11:36:39,2911299,ESN_1369,Engine_family_1,Engine_series_6,35.190820,51,0.973045,0.364383,2.501667,...,2.501667,0.0,0.0,0.0,0.421328,-0.351647,-0.077484,0.0,0.199678,0.000968
2450272,2022-10-27 03:55:34,2911301,ESN_1369,Engine_family_1,Engine_series_6,36.001950,53,0.800778,0.949444,2.165000,...,2.165000,0.0,0.0,0.0,1.092857,-0.351647,0.343405,0.0,-0.403511,0.000968
2450273,2022-10-27 07:33:12,2911302,ESN_1369,Engine_family_1,Engine_series_6,36.182090,54,0.619281,-0.748008,2.536667,...,2.536667,0.0,0.0,0.0,1.383080,-0.356159,-0.708817,0.0,-0.705105,0.000968


In [4]:
safran.columns

Index(['date', 'Unnamed: 0', 'engine_serial_number', 'engine_family',
       'engine_series', 'cycles', 'cycles_counter', 'egt_margin', 'var_mot_1',
       'flight_leg_hours', 'event_rank', 'egt_slope', 'SV_indicator',
       'SV_rank', 'Config_B_indicator', 'Config_B_rank', 'WW_indicator',
       'WW_rank', 'config_A', 'config_B', 'var_env_1', 'var_env_2',
       'var_env_3', 'var_env_4', 'var_env_5', 'Interpolate_egt_margin',
       'Interpolate_var_mot_1', 'Interpolate_flight_leg_hours',
       'Interpolate_SV_rank', 'Interpolate_Config_B_rank',
       'Interpolate_WW_rank', 'Interpolate_var_env_1', 'Interpolate_var_env_2',
       'Interpolate_var_env_3', 'Interpolate_var_env_4',
       'Interpolate_var_env_5', 'Interpolate_egt_slope'],
      dtype='object')

In [5]:
#Pour plus de clarté, nous ne gardons les colonnes qui ont déja été travaillées et celles nécessaires au traitement des intervalles
safran_2 = safran[['date', "engine_serial_number",'engine_series', 'cycles', 'cycles_counter','Interpolate_egt_margin',
       'Interpolate_var_mot_1','event_rank', 'config_A', 'config_B','Interpolate_flight_leg_hours',
       'Interpolate_SV_rank', 'Interpolate_Config_B_rank',
       'Interpolate_WW_rank', 'Interpolate_var_env_1', 'Interpolate_var_env_2',
       'Interpolate_var_env_3', 'Interpolate_var_env_4',
       'Interpolate_var_env_5', 'Interpolate_egt_slope']]


In [6]:
safran_2

Unnamed: 0,date,engine_serial_number,engine_series,cycles,cycles_counter,Interpolate_egt_margin,Interpolate_var_mot_1,event_rank,config_A,config_B,Interpolate_flight_leg_hours,Interpolate_SV_rank,Interpolate_Config_B_rank,Interpolate_WW_rank,Interpolate_var_env_1,Interpolate_var_env_2,Interpolate_var_env_3,Interpolate_var_env_4,Interpolate_var_env_5,Interpolate_egt_slope
0,2019-04-29 06:29:58,ESN_1,Engine_series_1,14.699402,14,0.881646,-0.313549,0,Config_A_1,Config_B_1,0.857778,0.0,0.0,0.0,-0.261068,0.193871,0.448627,0.0,0.601803,-0.029193
1,2019-04-29 08:10:00,ESN_1,Engine_series_1,15.284274,15,0.792029,0.006330,0,Config_A_1,Config_B_1,0.794167,0.0,0.0,0.0,-0.064202,0.273855,1.500848,0.0,-1.056965,-0.029193
2,2019-04-29 09:55:00,ESN_1,Engine_series_1,15.898185,16,0.706729,-0.286324,0,Config_A_1,Config_B_1,0.736667,0.0,0.0,0.0,-0.292673,0.193871,0.764293,0.0,0.149412,-0.029193
3,2019-04-29 11:36:53,ESN_1,Engine_series_1,16.493874,17,0.702078,0.430174,0,Config_A_1,Config_B_1,0.802500,0.0,0.0,0.0,0.070056,0.273855,1.500848,0.0,-1.056965,-0.029193
4,2019-04-30 04:28:40,ESN_1,Engine_series_1,22.409543,18,0.645941,0.299420,0,Config_A_1,Config_B_1,0.817500,0.0,0.0,0.0,-0.463185,0.193871,0.448627,0.0,0.601803,-0.029193
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2450270,2022-10-26 07:39:15,ESN_1369,Engine_series_6,34.993550,50,0.531868,-0.731730,0,Config_A_3,Config_B_1,2.654444,0.0,0.0,0.0,0.728625,-0.356159,-0.708817,0.0,-1.207762,0.000968
2450271,2022-10-26 11:36:39,ESN_1369,Engine_series_6,35.190820,51,0.973045,0.364383,0,Config_A_3,Config_B_1,2.501667,0.0,0.0,0.0,0.421328,-0.351647,-0.077484,0.0,0.199678,0.000968
2450272,2022-10-27 03:55:34,ESN_1369,Engine_series_6,36.001950,53,0.800778,0.949444,0,Config_A_3,Config_B_1,2.165000,0.0,0.0,0.0,1.092857,-0.351647,0.343405,0.0,-0.403511,0.000968
2450273,2022-10-27 07:33:12,ESN_1369,Engine_series_6,36.182090,54,0.619281,-0.748008,0,Config_A_3,Config_B_1,2.536667,0.0,0.0,0.0,1.383080,-0.356159,-0.708817,0.0,-0.705105,0.000968


## 1) Choix de la taille de l'intervalle

On utilise la variable **event_rank** pour identifier tous les intervalles pendant lequel aucun évènement n'a été effectué.

In [7]:
safran_ER = safran_2[["engine_serial_number", "event_rank", "Interpolate_egt_slope"]]

In [8]:
# On a un .count(), donc on peut prend n'importe quelle colonne pour vérifier combien il y a de vols
safran_ER.groupby(by=["engine_serial_number", "event_rank"]).count()

Unnamed: 0_level_0,Unnamed: 1_level_0,Interpolate_egt_slope
engine_serial_number,event_rank,Unnamed: 2_level_1
ESN_1,0,4548
ESN_1,1,644
ESN_1,2,468
ESN_1,3,380
ESN_1,4,2200
...,...,...
ESN_998,1,140
ESN_998,2,32
ESN_999,0,466
ESN_999,1,444


- **Intervalles avec le moins de données**


Regardons maintenant les cas où l'on a le moins de données pour un même intervalle.

In [9]:
safran_ER.groupby(by=["engine_serial_number", "event_rank"]).count().sort_values(by="Interpolate_egt_slope").head(220)

Unnamed: 0_level_0,Unnamed: 1_level_0,Interpolate_egt_slope
engine_serial_number,event_rank,Unnamed: 2_level_1
ESN_454,5,1
ESN_255,14,1
ESN_35,15,1
ESN_32,17,1
ESN_653,6,1
...,...,...
ESN_1243,1,10
ESN_181,3,10
ESN_137,7,10
ESN_346,6,10


In [10]:
saf_ER_group = safran_ER.groupby(by=["engine_serial_number", "event_rank"]).count().sort_values(by="Interpolate_egt_slope")
saf_ER_group = saf_ER_group.rename(columns = {'Interpolate_egt_slope': 'Nb_vols_entre_event'})

#On a encore engine_serial_number et Interpolate_WW_rank en index, on les enlève
saf_ER_group = saf_ER_group.reset_index()

#On réindexe, pour avoir une colonne avec les identifiants de chaque intervalle, 
#qu'on appelle id_int pour identifiant intervalle
saf_ER_group['id_int'] = saf_ER_group.index
saf_ER_group.sort_values(by="Nb_vols_entre_event")
saf_ER_group

Unnamed: 0,engine_serial_number,event_rank,Nb_vols_entre_event,id_int
0,ESN_454,5,1,0
1,ESN_255,14,1,1
2,ESN_35,15,1,2
3,ESN_32,17,1,3
4,ESN_653,6,1,4
...,...,...,...,...
8434,ESN_1,4,2200,8434
8435,ESN_19,0,2241,8435
8436,ESN_250,8,2275,8436
8437,ESN_1,10,3568,8437


In [11]:
#De cette manière, on peut visualiser les intervalles où le nombre de vols est inférieur à un certain seuil
saf_ER_group[(saf_ER_group.Nb_vols_entre_event < 100)]

Unnamed: 0,engine_serial_number,event_rank,Nb_vols_entre_event,id_int
0,ESN_454,5,1,0
1,ESN_255,14,1,1
2,ESN_35,15,1,2
3,ESN_32,17,1,3
4,ESN_653,6,1,4
...,...,...,...,...
1818,ESN_165,12,99,1818
1819,ESN_146,4,99,1819
1820,ESN_59,2,99,1820
1821,ESN_44,2,99,1821


In [12]:
print("Le nombre d'intervalles de temps avec moins de 25 vols correspond à", round((saf_ER_group[(saf_ER_group.Nb_vols_entre_event < 25)].shape[0]/saf_ER_group.shape[0])*100,2), "% des intervalles")
print("Le nombre d'intervalles de temps avec moins de 50 vols correspond à", round((saf_ER_group[(saf_ER_group.Nb_vols_entre_event < 50)].shape[0]/saf_ER_group.shape[0])*100,2), "% des intervalles")
print("Le nombre d'intervalles de temps avec moins de 100 vols correspond à", round((saf_ER_group[(saf_ER_group.Nb_vols_entre_event < 100)].shape[0]/saf_ER_group.shape[0])*100,2), "% des intervalles")
print("Le nombre d'intervalles de temps avec moins de 150 vols correspond à", round((saf_ER_group[(saf_ER_group.Nb_vols_entre_event < 150)].shape[0]/saf_ER_group.shape[0])*100,2), "% des intervalles")

Le nombre d'intervalles de temps avec moins de 25 vols correspond à 5.18 % des intervalles
Le nombre d'intervalles de temps avec moins de 50 vols correspond à 9.93 % des intervalles
Le nombre d'intervalles de temps avec moins de 100 vols correspond à 21.6 % des intervalles
Le nombre d'intervalles de temps avec moins de 150 vols correspond à 30.93 % des intervalles


In [13]:
#On merge pour avoir l'identifiant dans la base de données
safran_complete = pd.merge(safran_2, saf_ER_group, left_on = ["engine_serial_number", "event_rank"], right_on = ["engine_serial_number", "event_rank"], how="inner")
safran_complete

Unnamed: 0,date,engine_serial_number,engine_series,cycles,cycles_counter,Interpolate_egt_margin,Interpolate_var_mot_1,event_rank,config_A,config_B,...,Interpolate_Config_B_rank,Interpolate_WW_rank,Interpolate_var_env_1,Interpolate_var_env_2,Interpolate_var_env_3,Interpolate_var_env_4,Interpolate_var_env_5,Interpolate_egt_slope,Nb_vols_entre_event,id_int
0,2019-04-29 06:29:58,ESN_1,Engine_series_1,14.699402,14,0.881646,-0.313549,0,Config_A_1,Config_B_1,...,0.0,0.0,-0.261068,0.193871,0.448627,0.0,0.601803,-0.029193,4548,8438
1,2019-04-29 08:10:00,ESN_1,Engine_series_1,15.284274,15,0.792029,0.006330,0,Config_A_1,Config_B_1,...,0.0,0.0,-0.064202,0.273855,1.500848,0.0,-1.056965,-0.029193,4548,8438
2,2019-04-29 09:55:00,ESN_1,Engine_series_1,15.898185,16,0.706729,-0.286324,0,Config_A_1,Config_B_1,...,0.0,0.0,-0.292673,0.193871,0.764293,0.0,0.149412,-0.029193,4548,8438
3,2019-04-29 11:36:53,ESN_1,Engine_series_1,16.493874,17,0.702078,0.430174,0,Config_A_1,Config_B_1,...,0.0,0.0,0.070056,0.273855,1.500848,0.0,-1.056965,-0.029193,4548,8438
4,2019-04-30 04:28:40,ESN_1,Engine_series_1,22.409543,18,0.645941,0.299420,0,Config_A_1,Config_B_1,...,0.0,0.0,-0.463185,0.193871,0.448627,0.0,0.601803,-0.029193,4548,8438
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2450270,2022-10-26 07:39:15,ESN_1369,Engine_series_6,34.993550,50,0.531868,-0.731730,0,Config_A_3,Config_B_1,...,0.0,0.0,0.728625,-0.356159,-0.708817,0.0,-1.207762,0.000968,34,576
2450271,2022-10-26 11:36:39,ESN_1369,Engine_series_6,35.190820,51,0.973045,0.364383,0,Config_A_3,Config_B_1,...,0.0,0.0,0.421328,-0.351647,-0.077484,0.0,0.199678,0.000968,34,576
2450272,2022-10-27 03:55:34,ESN_1369,Engine_series_6,36.001950,53,0.800778,0.949444,0,Config_A_3,Config_B_1,...,0.0,0.0,1.092857,-0.351647,0.343405,0.0,-0.403511,0.000968,34,576
2450273,2022-10-27 07:33:12,ESN_1369,Engine_series_6,36.182090,54,0.619281,-0.748008,0,Config_A_3,Config_B_1,...,0.0,0.0,1.383080,-0.356159,-0.708817,0.0,-0.705105,0.000968,34,576


### 2) L'échantillonnage avec 100 vols par intervalles

Après avoir réfléchi pour conserver à la fois beaucoup d'intervalles différents, tout en conservant le plus de vols possibles au sein de ces intervalles, nous avons choisi de conserver les intervalles avec plus de 100 vols.

In [14]:
df_mauvais_100 = safran_complete[safran_complete["Nb_vols_entre_event"].between(0, 99)] # Les vols qu'on supprime
df_keep_100 = safran_complete[~safran_complete["Nb_vols_entre_event"].between(0,99)] #Les vols qu'on garde
df_keep_100
#On conserve uniquement les intervalles où on a plus de 100 dans df_keep_100 données

Unnamed: 0,date,engine_serial_number,engine_series,cycles,cycles_counter,Interpolate_egt_margin,Interpolate_var_mot_1,event_rank,config_A,config_B,...,Interpolate_Config_B_rank,Interpolate_WW_rank,Interpolate_var_env_1,Interpolate_var_env_2,Interpolate_var_env_3,Interpolate_var_env_4,Interpolate_var_env_5,Interpolate_egt_slope,Nb_vols_entre_event,id_int
0,2019-04-29 06:29:58,ESN_1,Engine_series_1,14.699402,14,0.881646,-0.313549,0,Config_A_1,Config_B_1,...,0.0,0.0,-0.261068,0.193871,0.448627,0.0,0.601803,-0.029193,4548,8438
1,2019-04-29 08:10:00,ESN_1,Engine_series_1,15.284274,15,0.792029,0.006330,0,Config_A_1,Config_B_1,...,0.0,0.0,-0.064202,0.273855,1.500848,0.0,-1.056965,-0.029193,4548,8438
2,2019-04-29 09:55:00,ESN_1,Engine_series_1,15.898185,16,0.706729,-0.286324,0,Config_A_1,Config_B_1,...,0.0,0.0,-0.292673,0.193871,0.764293,0.0,0.149412,-0.029193,4548,8438
3,2019-04-29 11:36:53,ESN_1,Engine_series_1,16.493874,17,0.702078,0.430174,0,Config_A_1,Config_B_1,...,0.0,0.0,0.070056,0.273855,1.500848,0.0,-1.056965,-0.029193,4548,8438
4,2019-04-30 04:28:40,ESN_1,Engine_series_1,22.409543,18,0.645941,0.299420,0,Config_A_1,Config_B_1,...,0.0,0.0,-0.463185,0.193871,0.448627,0.0,0.601803,-0.029193,4548,8438
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2450219,2022-10-27 03:49:41,ESN_1360,Engine_series_1,254.604500,274,0.919735,0.869409,0,Config_A_3,Config_B_1,...,0.0,0.0,0.064519,-0.368762,0.238182,0.0,0.451006,-0.004884,210,3443
2450220,2022-10-27 12:43:08,ESN_1360,Engine_series_1,256.982600,275,0.686130,0.279454,0,Config_A_3,Config_B_1,...,0.0,0.0,-0.223848,-0.311284,-0.919261,0.0,-1.056965,-0.004884,210,3443
2450221,2022-10-27 15:30:42,ESN_1360,Engine_series_1,257.729600,276,0.721306,0.346712,0,Config_A_3,Config_B_1,...,0.0,0.0,-0.334779,-0.303992,-1.971483,0.0,0.551538,-0.004884,210,3443
2450222,2022-10-27 18:33:59,ESN_1360,Engine_series_1,258.546700,277,0.675019,0.313983,0,Config_A_3,Config_B_1,...,0.0,0.0,1.174560,-0.311284,0.132960,0.0,-2.363873,-0.004884,210,3443


In [15]:
#ATTENTION cette cellule peut prendre 2-3 minutes à s'éxécuter
df_ech_100 = df_keep_100.groupby("id_int").sample(100) #On utilise sample(100) pour avoir 100 données aléatoires par intervalle
df_ech_100

Unnamed: 0,date,engine_serial_number,engine_series,cycles,cycles_counter,Interpolate_egt_margin,Interpolate_var_mot_1,event_rank,config_A,config_B,...,Interpolate_Config_B_rank,Interpolate_WW_rank,Interpolate_var_env_1,Interpolate_var_env_2,Interpolate_var_env_3,Interpolate_var_env_4,Interpolate_var_env_5,Interpolate_egt_slope,Nb_vols_entre_event,id_int
568971,2022-10-10 03:07:37,ESN_139,Engine_series_1,4021.706000,3675,-2.293699,-1.234199,14,Config_A_2,Config_B_4,...,6.0,8.0,-0.445732,-0.190762,-1.129705,0.0,1.154726,-0.027761,100,1823
568972,2022-10-10 08:11:06,ESN_139,Engine_series_1,4022.806000,3676,-2.301393,-1.219844,14,Config_A_2,Config_B_4,...,6.0,8.0,-0.484910,-0.355901,1.185182,0.0,-0.805636,-0.027761,100,1823
569017,2022-10-19 06:18:51,ESN_139,Engine_series_1,4087.887000,3725,-2.358055,-0.666268,14,Config_A_2,Config_B_4,...,6.0,8.0,-0.557611,-0.190762,-1.445372,0.0,1.154726,-0.027761,100,1823
569041,2022-10-23 07:26:53,ESN_139,Engine_series_1,4128.604000,3751,-2.226996,-0.446696,14,Config_A_2,Config_B_4,...,6.0,8.0,-0.554110,-0.320570,0.238182,0.0,-0.755371,-0.027761,100,1823
569040,2022-10-23 04:36:47,ESN_139,Engine_series_1,4127.038000,3750,-2.230821,-0.453104,14,Config_A_2,Config_B_4,...,6.0,8.0,-0.552936,-0.190762,-0.603595,0.0,0.903397,-0.027761,100,1823
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
504,2019-06-27 16:01:06,ESN_1,Engine_series_1,517.216787,525,-0.009204,-0.541326,0,Config_A_1,Config_B_1,...,0.0,0.0,-0.487157,0.236917,0.448627,0.0,-1.861216,-0.029193,4548,8438
1994,2019-08-15 09:02:59,ESN_1,Engine_series_1,886.153130,898,-0.428497,-0.793369,0,Config_A_1,Config_B_1,...,0.0,0.0,-0.557007,0.056626,-0.814039,0.0,-1.911482,-0.029193,4548,8438
1402,2019-05-29 20:35:28,ESN_1,Engine_series_1,276.746233,282,0.387651,0.615565,0,Config_A_1,Config_B_1,...,0.0,0.0,-0.206949,0.236917,0.553849,0.0,-1.308293,-0.029193,4548,8438
437,2019-06-19 15:31:42,ESN_1,Engine_series_1,448.278051,457,0.104139,0.677757,0,Config_A_1,Config_B_1,...,0.0,0.0,-0.190495,0.236917,0.238183,0.0,-0.705105,-0.029193,4548,8438


In [16]:
print("En effectuant cette méthode, on aura", df_ech_100.shape[0], "lignes dans notre base de données, alors que la base contenait", safran_2.shape[0], "lignes.")

En effectuant cette méthode, on aura 661600 lignes dans notre base de données, alors que la base contenait 2450275 lignes.


In [17]:
df_ech_100.sort_values(by=["id_int", "Interpolate_egt_slope"])

Unnamed: 0,date,engine_serial_number,engine_series,cycles,cycles_counter,Interpolate_egt_margin,Interpolate_var_mot_1,event_rank,config_A,config_B,...,Interpolate_Config_B_rank,Interpolate_WW_rank,Interpolate_var_env_1,Interpolate_var_env_2,Interpolate_var_env_3,Interpolate_var_env_4,Interpolate_var_env_5,Interpolate_egt_slope,Nb_vols_entre_event,id_int
568971,2022-10-10 03:07:37,ESN_139,Engine_series_1,4021.706000,3675,-2.293699,-1.234199,14,Config_A_2,Config_B_4,...,6.0,8.0,-0.445732,-0.190762,-1.129705,0.0,1.154726,-0.027761,100,1823
568972,2022-10-10 08:11:06,ESN_139,Engine_series_1,4022.806000,3676,-2.301393,-1.219844,14,Config_A_2,Config_B_4,...,6.0,8.0,-0.484910,-0.355901,1.185182,0.0,-0.805636,-0.027761,100,1823
569017,2022-10-19 06:18:51,ESN_139,Engine_series_1,4087.887000,3725,-2.358055,-0.666268,14,Config_A_2,Config_B_4,...,6.0,8.0,-0.557611,-0.190762,-1.445372,0.0,1.154726,-0.027761,100,1823
569041,2022-10-23 07:26:53,ESN_139,Engine_series_1,4128.604000,3751,-2.226996,-0.446696,14,Config_A_2,Config_B_4,...,6.0,8.0,-0.554110,-0.320570,0.238182,0.0,-0.755371,-0.027761,100,1823
569040,2022-10-23 04:36:47,ESN_139,Engine_series_1,4127.038000,3750,-2.230821,-0.453104,14,Config_A_2,Config_B_4,...,6.0,8.0,-0.552936,-0.190762,-0.603595,0.0,0.903397,-0.027761,100,1823
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
504,2019-06-27 16:01:06,ESN_1,Engine_series_1,517.216787,525,-0.009204,-0.541326,0,Config_A_1,Config_B_1,...,0.0,0.0,-0.487157,0.236917,0.448627,0.0,-1.861216,-0.029193,4548,8438
1994,2019-08-15 09:02:59,ESN_1,Engine_series_1,886.153130,898,-0.428497,-0.793369,0,Config_A_1,Config_B_1,...,0.0,0.0,-0.557007,0.056626,-0.814039,0.0,-1.911482,-0.029193,4548,8438
1402,2019-05-29 20:35:28,ESN_1,Engine_series_1,276.746233,282,0.387651,0.615565,0,Config_A_1,Config_B_1,...,0.0,0.0,-0.206949,0.236917,0.553849,0.0,-1.308293,-0.029193,4548,8438
437,2019-06-19 15:31:42,ESN_1,Engine_series_1,448.278051,457,0.104139,0.677757,0,Config_A_1,Config_B_1,...,0.0,0.0,-0.190495,0.236917,0.238183,0.0,-0.705105,-0.029193,4548,8438


In [18]:
df_ech_100.to_csv('D:/Données/ENSAE/2A/S2/Séminaire de modélisation statistique/Echantillons100.csv')

### 3) Les derniers traitements numériques

Enfin, nous éluminons quelques variables : 
- **engine_serial_number** qui ne constitue pas une variable explicative 
- **engine_family** car dans notre base de donnés, tous les moteurs d'avions sont issus de la même famille de moteur
- **date** : car les informations sur la temporalité sont contenues dans cycle_counter
- **cycle** : pour la même raison que la variable "date"
- **event_rank** : car les informations que nous souhaitions ont déjà servi pour constituer la variable id_int qui identifie tous les intervalles
- SV_rank & Interpolate_WW_rank & Interpolate_Config_B_rank: elles sont prises en compte dans l'age du moteur 

In [19]:
#Echantillons100 = pd.read_csv(r'C:\Users\louis\OneDrive\Documents\ENSAE\2A\Safran\Echantillons100.csv')
Echantillons100 = pd.read_csv(r'D:/Données/ENSAE/2A/S2/Séminaire de modélisation statistique/Echantillons100.csv')

In [20]:
X_Y = Echantillons100[[ "engine_series", "cycles_counter",'config_A', 'config_B', "Interpolate_var_mot_1",
                'Interpolate_flight_leg_hours', 'Interpolate_var_env_1',
                'Interpolate_var_env_2','Interpolate_var_env_3', 
                'Interpolate_var_env_4','Interpolate_var_env_5', "id_int" , 'Interpolate_egt_slope']]

On classe les variables : 
- Catégorielles : 
    - engine_series
    - config_A
    - config_B
    - Interpolate_var_env_4
- Continues :
    - cycles_counter
    - Interpolate_var_mot_1
    - Interpolate_flight_leg_hours
    - Interpolate_var_env_1 
    - Interpolate_var_env_2
    - Interpolate_var_env_3
    - Interpolate_var_env_5
    
On traite différemment ces deux types de variables.

### Variables continues :

On traite les variables continues de manière à ce qu'elles soient toutes dans un même ordre de grandeur. 

In [21]:
X_Y[['cycles_counter', 'Interpolate_var_mot_1', 'Interpolate_flight_leg_hours' , 'Interpolate_var_env_1', 'Interpolate_var_env_2', 'Interpolate_var_env_3', 'Interpolate_var_env_5']].describe()

Unnamed: 0,cycles_counter,Interpolate_var_mot_1,Interpolate_flight_leg_hours,Interpolate_var_env_1,Interpolate_var_env_2,Interpolate_var_env_3,Interpolate_var_env_5
count,661600.0,661600.0,661600.0,661600.0,661600.0,661600.0,661600.0
mean,1651.444649,0.003127,2.130447,0.02435,-0.013693,-0.014204,0.037355
std,1267.3903,0.936138,1.185925,1.000114,0.959672,1.021932,0.973502
min,0.0,-3.178409,-0.506389,-0.609974,-0.37173,-5.233369,-3.57025
25%,635.0,-0.50009,1.201944,-0.530147,-0.357955,-0.708817,-0.504042
50%,1379.0,0.210394,1.892778,-0.329175,-0.316066,0.238182,0.199678
75%,2394.0,0.691026,2.761667,0.15835,-0.135441,0.764293,0.7526
max,7671.0,3.691059,13.90778,56.96611,21.16301,3.710513,21.763662


Toutes les variables ont été traitées en amont par Safran. Les échelles de grandeur sont similaires pour toutes les variables sauf pour l'age du moteur : cycles_counter. Nous applicons une fonction log pour réduire l'impact de ces données dans le CNN. 

In [22]:
# Moteurs problématiques pour appliquer la loss :
X_Y[X_Y['cycles_counter']==0]

Unnamed: 0,engine_series,cycles_counter,config_A,config_B,Interpolate_var_mot_1,Interpolate_flight_leg_hours,Interpolate_var_env_1,Interpolate_var_env_2,Interpolate_var_env_3,Interpolate_var_env_4,Interpolate_var_env_5,id_int,Interpolate_egt_slope
202074,Engine_series_1,0,Config_A_2,Config_B_1,1.042011,5.660556,0.274045,-0.356068,-1.234928,0.0,0.099146,3843,-0.014767
328633,Engine_series_1,0,Config_A_3,Config_B_1,-1.571258,2.81,-0.21815,-0.351178,0.869515,0.0,-0.353245,5109,-0.01229
487725,Engine_series_3,0,Config_A_2,Config_B_1,0.662787,1.578889,-0.39671,-0.29794,-0.603595,0.0,0.149412,6700,-0.022738


In [23]:
# On remplace les moteurs dont l'age est 0 par 1 
X_Y['cycles_counter'] = X_Y['cycles_counter'].replace(0,1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [24]:
X_Y.cycles_counter = np.log(X_Y.cycles_counter )

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[name] = value


In [25]:
X_Y.describe()

Unnamed: 0,cycles_counter,Interpolate_var_mot_1,Interpolate_flight_leg_hours,Interpolate_var_env_1,Interpolate_var_env_2,Interpolate_var_env_3,Interpolate_var_env_4,Interpolate_var_env_5,id_int,Interpolate_egt_slope
count,661600.0,661600.0,661600.0,661600.0,661600.0,661600.0,661600.0,661600.0,661600.0,661600.0
mean,7.002936,0.003127,2.130447,0.02435,-0.013693,-0.014204,0.114876,0.037355,5130.5,-0.033763
std,1.061285,0.936138,1.185925,1.000114,0.959672,1.021932,0.40524,0.973502,1909.876112,0.190287
min,0.0,-3.178409,-0.506389,-0.609974,-0.37173,-5.233369,0.0,-3.57025,1823.0,-5.852407
25%,6.453625,-0.50009,1.201944,-0.530147,-0.357955,-0.708817,0.0,-0.504042,3476.75,-0.042398
50%,7.229114,0.210394,1.892778,-0.329175,-0.316066,0.238182,0.0,0.199678,5130.5,-0.025694
75%,7.780721,0.691026,2.761667,0.15835,-0.135441,0.764293,0.0,0.7526,6784.25,-0.014029
max,8.945202,3.691059,13.90778,56.96611,21.16301,3.710513,3.0,21.763662,8438.0,9.27649


### Variables catégorielles : 

In [26]:
Cat = X_Y[['engine_series','config_A','config_B','Interpolate_var_env_4']]

In [27]:
Cat.Interpolate_var_env_4=Cat.Interpolate_var_env_4.astype(str)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[name] = value


In [28]:
Cat_dummies = pd.get_dummies(Cat[['engine_series','config_A','config_B','Interpolate_var_env_4']],drop_first=False)

In [29]:
Cat_dummies.describe()

Unnamed: 0,engine_series_Engine_series_1,engine_series_Engine_series_2,engine_series_Engine_series_3,engine_series_Engine_series_4,engine_series_Engine_series_5,engine_series_Engine_series_6,engine_series_Engine_series_7,config_A_Config_A_1,config_A_Config_A_2,config_A_Config_A_3,config_A_Config_A_4,config_A_Config_A_5,config_B_Config_B_1,config_B_Config_B_2,config_B_Config_B_3,config_B_Config_B_4,Interpolate_var_env_4_0.0,Interpolate_var_env_4_1.0,Interpolate_var_env_4_2.0,Interpolate_var_env_4_3.0
count,661600.0,661600.0,661600.0,661600.0,661600.0,661600.0,661600.0,661600.0,661600.0,661600.0,661600.0,661600.0,661600.0,661600.0,661600.0,661600.0,661600.0,661600.0,661600.0,661600.0
mean,0.891049,0.001394,0.07174,0.031434,0.002872,0.000907,0.000605,0.577526,0.391764,0.030552,0.000157,2e-06,0.711814,0.021907,0.045914,0.220364,0.91417,0.059007,0.024599,0.002223
std,0.311578,0.037305,0.258057,0.174489,0.053512,0.030101,0.024581,0.493954,0.488145,0.1721,0.012537,0.001229,0.452919,0.146382,0.2093,0.414493,0.280113,0.235638,0.154901,0.047101
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
50%,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
75%,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [30]:
# On revient dans un format plus commun pour pouvoir assigner à -1 les valeurs 0
def replace_int8 (serie) : 
    for i in serie.columns : 
        serie[i] = np.int8(serie[ i])
        #serie[i] = serie[i].replace(0,-1) 
    return serie

In [31]:
Cat_final = replace_int8 (Cat_dummies) 

In [32]:
Cat_final.describe()

Unnamed: 0,engine_series_Engine_series_1,engine_series_Engine_series_2,engine_series_Engine_series_3,engine_series_Engine_series_4,engine_series_Engine_series_5,engine_series_Engine_series_6,engine_series_Engine_series_7,config_A_Config_A_1,config_A_Config_A_2,config_A_Config_A_3,config_A_Config_A_4,config_A_Config_A_5,config_B_Config_B_1,config_B_Config_B_2,config_B_Config_B_3,config_B_Config_B_4,Interpolate_var_env_4_0.0,Interpolate_var_env_4_1.0,Interpolate_var_env_4_2.0,Interpolate_var_env_4_3.0
count,661600.0,661600.0,661600.0,661600.0,661600.0,661600.0,661600.0,661600.0,661600.0,661600.0,661600.0,661600.0,661600.0,661600.0,661600.0,661600.0,661600.0,661600.0,661600.0,661600.0
mean,0.891049,0.001394,0.07174,0.031434,0.002872,0.000907,0.000605,0.577526,0.391764,0.030552,0.000157,2e-06,0.711814,0.021907,0.045914,0.220364,0.91417,0.059007,0.024599,0.002223
std,0.311578,0.037305,0.258057,0.174489,0.053512,0.030101,0.024581,0.493954,0.488145,0.1721,0.012537,0.001229,0.452919,0.146382,0.2093,0.414493,0.280113,0.235638,0.154901,0.047101
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
50%,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
75%,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [33]:
Cont = X_Y.drop(['engine_series','config_A', 'config_B', 'Interpolate_var_env_4' ], axis=1)
Cont

Unnamed: 0,cycles_counter,Interpolate_var_mot_1,Interpolate_flight_leg_hours,Interpolate_var_env_1,Interpolate_var_env_2,Interpolate_var_env_3,Interpolate_var_env_5,id_int,Interpolate_egt_slope
0,8.209308,-1.234199,3.608333,-0.445732,-0.190762,-1.129705,1.154726,1823,-0.027761
1,8.209580,-1.219844,4.332500,-0.484910,-0.355901,1.185182,-0.805636,1823,-0.027761
2,8.222822,-0.666268,0.925000,-0.557611,-0.190762,-1.445372,1.154726,1823,-0.027761
3,8.229778,-0.446696,2.750556,-0.554110,-0.320570,0.238182,-0.755371,1823,-0.027761
4,8.229511,-0.453104,1.806667,-0.552936,-0.190762,-0.603595,0.903397,1823,-0.027761
...,...,...,...,...,...,...,...,...,...
661595,6.263398,-0.541326,0.929722,-0.487157,0.236917,0.448627,-1.861216,8438,-0.029193
661596,6.800170,-0.793369,2.673333,-0.557007,0.056626,-0.814039,-1.911482,8438,-0.029193
661597,5.641907,0.615565,1.478611,-0.206949,0.236917,0.553849,-1.308293,8438,-0.029193
661598,6.124683,0.677757,0.995833,-0.190495,0.236917,0.238183,-0.705105,8438,-0.029193


In [34]:
X_Y_treat = pd.concat([Cat_final, Cont],  axis=1)
X_Y_treat

Unnamed: 0,engine_series_Engine_series_1,engine_series_Engine_series_2,engine_series_Engine_series_3,engine_series_Engine_series_4,engine_series_Engine_series_5,engine_series_Engine_series_6,engine_series_Engine_series_7,config_A_Config_A_1,config_A_Config_A_2,config_A_Config_A_3,...,Interpolate_var_env_4_3.0,cycles_counter,Interpolate_var_mot_1,Interpolate_flight_leg_hours,Interpolate_var_env_1,Interpolate_var_env_2,Interpolate_var_env_3,Interpolate_var_env_5,id_int,Interpolate_egt_slope
0,1,0,0,0,0,0,0,0,1,0,...,0,8.209308,-1.234199,3.608333,-0.445732,-0.190762,-1.129705,1.154726,1823,-0.027761
1,1,0,0,0,0,0,0,0,1,0,...,0,8.209580,-1.219844,4.332500,-0.484910,-0.355901,1.185182,-0.805636,1823,-0.027761
2,1,0,0,0,0,0,0,0,1,0,...,0,8.222822,-0.666268,0.925000,-0.557611,-0.190762,-1.445372,1.154726,1823,-0.027761
3,1,0,0,0,0,0,0,0,1,0,...,0,8.229778,-0.446696,2.750556,-0.554110,-0.320570,0.238182,-0.755371,1823,-0.027761
4,1,0,0,0,0,0,0,0,1,0,...,0,8.229511,-0.453104,1.806667,-0.552936,-0.190762,-0.603595,0.903397,1823,-0.027761
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
661595,1,0,0,0,0,0,0,1,0,0,...,0,6.263398,-0.541326,0.929722,-0.487157,0.236917,0.448627,-1.861216,8438,-0.029193
661596,1,0,0,0,0,0,0,1,0,0,...,0,6.800170,-0.793369,2.673333,-0.557007,0.056626,-0.814039,-1.911482,8438,-0.029193
661597,1,0,0,0,0,0,0,1,0,0,...,0,5.641907,0.615565,1.478611,-0.206949,0.236917,0.553849,-1.308293,8438,-0.029193
661598,1,0,0,0,0,0,0,1,0,0,...,0,6.124683,0.677757,0.995833,-0.190495,0.236917,0.238183,-0.705105,8438,-0.029193


In [35]:
X_Y_treat.describe()

Unnamed: 0,engine_series_Engine_series_1,engine_series_Engine_series_2,engine_series_Engine_series_3,engine_series_Engine_series_4,engine_series_Engine_series_5,engine_series_Engine_series_6,engine_series_Engine_series_7,config_A_Config_A_1,config_A_Config_A_2,config_A_Config_A_3,...,Interpolate_var_env_4_3.0,cycles_counter,Interpolate_var_mot_1,Interpolate_flight_leg_hours,Interpolate_var_env_1,Interpolate_var_env_2,Interpolate_var_env_3,Interpolate_var_env_5,id_int,Interpolate_egt_slope
count,661600.0,661600.0,661600.0,661600.0,661600.0,661600.0,661600.0,661600.0,661600.0,661600.0,...,661600.0,661600.0,661600.0,661600.0,661600.0,661600.0,661600.0,661600.0,661600.0,661600.0
mean,0.891049,0.001394,0.07174,0.031434,0.002872,0.000907,0.000605,0.577526,0.391764,0.030552,...,0.002223,7.002936,0.003127,2.130447,0.02435,-0.013693,-0.014204,0.037355,5130.5,-0.033763
std,0.311578,0.037305,0.258057,0.174489,0.053512,0.030101,0.024581,0.493954,0.488145,0.1721,...,0.047101,1.061285,0.936138,1.185925,1.000114,0.959672,1.021932,0.973502,1909.876112,0.190287
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,-3.178409,-0.506389,-0.609974,-0.37173,-5.233369,-3.57025,1823.0,-5.852407
25%,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,6.453625,-0.50009,1.201944,-0.530147,-0.357955,-0.708817,-0.504042,3476.75,-0.042398
50%,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,7.229114,0.210394,1.892778,-0.329175,-0.316066,0.238182,0.199678,5130.5,-0.025694
75%,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,...,0.0,7.780721,0.691026,2.761667,0.15835,-0.135441,0.764293,0.7526,6784.25,-0.014029
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,8.945202,3.691059,13.90778,56.96611,21.16301,3.710513,21.763662,8438.0,9.27649


In [36]:
#X_Y_treat.to_csv('C:/Users/louis/OneDrive/Documents/ENSAE/2A/Safran/X_Y_treat.csv')
X_Y_treat.to_csv(r'D:/Données/ENSAE/2A/S2/Séminaire de modélisation statistique/X_Y_treat.csv')