In [33]:
import pandas as pd
import warnings
from skimpy import skim
import json



In [2]:
pd.set_option('display.max_columns', None)
pd.set_option("display.float_format", lambda x: f"{x :.2f}")
warnings.filterwarnings('ignore')

In [3]:
df_airplane = pd.read_csv('datasets_raw/aeronefs_2023-11-07.csv')
df_composants = pd.read_csv('datasets_raw/composants_2023-11-07.csv')
df_degradations = pd.read_csv('datasets_raw/degradations_2023-11-07.csv')
df_logs_vols = pd.read_csv('datasets_raw/logs_vols_2023-11-07.csv')

In [4]:
skim(df_airplane)

In [5]:
df_airplane.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 232 entries, 0 to 231
Data columns (total 6 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   ref_aero        232 non-null    object
 1   type_model      232 non-null    object
 2   debut_service   232 non-null    object
 3   last_maint      232 non-null    object
 4   en_maintenance  232 non-null    bool  
 5   end_maint       39 non-null     object
dtypes: bool(1), object(5)
memory usage: 9.4+ KB


In [6]:
#last_maint, debut_service and end_maint are object and not date columns
df_airplane['last_maint'] = pd.to_datetime(df_airplane['last_maint'], format='%Y-%m-%d %H:%M:%S', errors='coerce')
df_airplane['end_maint'] = pd.to_datetime(df_airplane['end_maint'], format='%Y-%m-%d %H:%M:%S', errors='coerce')
df_airplane['debut_service'] = pd.to_datetime(df_airplane['debut_service'], format='%Y-%m-%d', errors='coerce')
df_airplane.info()



<class 'pandas.core.frame.DataFrame'>
RangeIndex: 232 entries, 0 to 231
Data columns (total 6 columns):
 #   Column          Non-Null Count  Dtype         
---  ------          --------------  -----         
 0   ref_aero        232 non-null    object        
 1   type_model      232 non-null    object        
 2   debut_service   232 non-null    datetime64[ns]
 3   last_maint      220 non-null    datetime64[ns]
 4   en_maintenance  232 non-null    bool          
 5   end_maint       39 non-null     datetime64[ns]
dtypes: bool(1), datetime64[ns](3), object(2)
memory usage: 9.4+ KB


In [7]:
df_airplane.head()

Unnamed: 0,ref_aero,type_model,debut_service,last_maint,en_maintenance,end_maint
0,E175_4124,E175,2003-10-01,2023-11-01,False,NaT
1,E175_6334,E175,2010-02-18,2023-11-01,True,2023-11-08 14:13:00
2,B777_1214,B777,2003-10-21,2023-11-06,True,2023-11-13 14:13:00
3,CRJ900_0813,CRJ900,2002-08-01,2023-11-03,True,2023-11-10 14:13:00
4,A340_0268,A340,2003-01-12,2023-11-04,False,NaT


In [8]:
skim(df_composants)

In [11]:
df_composants.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10440 entries, 0 to 10439
Data columns (total 7 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   ref_compo          10440 non-null  object 
 1   categorie          10440 non-null  object 
 2   aero               10440 non-null  object 
 3   desc               10440 non-null  object 
 4   lifespan           10440 non-null  int64  
 5   taux_usure_actuel  10440 non-null  float64
 6   cout               10440 non-null  int64  
dtypes: float64(1), int64(2), object(4)
memory usage: 571.1+ KB


In [12]:
df_composants.head()

Unnamed: 0,ref_compo,categorie,aero,desc,lifespan,taux_usure_actuel,cout
0,REAE175-E175_4124-0,Composants Critiques,E175_4124,Réacteur gauche,11950,69.23,19845
1,REAE175-E175_4124-1,Composants Critiques,E175_4124,Réacteur droit,14014,65.18,17416
2,SYSE175-E175_4124-2,Composants Critiques,E175_4124,Système de navigation,12519,17.98,16184
3,ORDE175-E175_4124-3,Composants Critiques,E175_4124,Ordinateur de vol,11612,26.15,18574
4,AUTE175-E175_4124-4,Composants Critiques,E175_4124,Autopilote,10655,11.47,18769


In [9]:
skim(df_degradations)

In [15]:
# mesure_date is object and not date column
df_degradations['measure_day'] = pd.to_datetime(df_degradations['measure_day'], format='%Y-%m-%d', errors='coerce')
df_degradations.info()



<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10530 entries, 0 to 10529
Data columns (total 6 columns):
 #   Column            Non-Null Count  Dtype         
---  ------            --------------  -----         
 0   ref_deg           10530 non-null  object        
 1   linked_aero       10530 non-null  object        
 2   compo_concerned   10530 non-null  object        
 3   usure_nouvelle    10530 non-null  float64       
 4   measure_day       10395 non-null  datetime64[ns]
 5   need_replacement  10530 non-null  bool          
dtypes: bool(1), datetime64[ns](1), float64(1), object(3)
memory usage: 421.7+ KB


In [23]:
# there're duplicates in the degradations dataset
df2 = df_degradations[df_degradations.duplicated(subset=['compo_concerned'],keep=False)].sort_values('compo_concerned')
df2.head()
# looks like there duplicates or better mal functions in the reading of the composant


Unnamed: 0,ref_deg,linked_aero,compo_concerned,usure_nouvelle,measure_day,need_replacement
5690,D007032,B737_4325,AILB737-B737_4325-20,33.59,NaT,False
8390,D009374,B737_4325,AILB737-B737_4325-20,33.59,NaT,False
8345,D002442,B737_4325,AILB737-B737_4325-20,33.59,NaT,False
5645,D003529,B737_4325,AILB737-B737_4325-20,58.05,2023-11-07,False
8374,D004815,B737_4325,AUTB737-B737_4325-4,6.53,NaT,False


In [24]:
df3 = df_degradations[df_degradations.duplicated(subset=['ref_deg'],keep=False)].sort_values('ref_deg')
df3.head()
# duplicates in ref_deg but different airplane and composant. So every aiplane got his own ref_deg id's?

Unnamed: 0,ref_deg,linked_aero,compo_concerned,usure_nouvelle,measure_day,need_replacement
477,D000011,B757_0550,SYSB757-B757_0550-27,86.84,2023-11-02,False
1089,D000011,B737_3580,SYSB737-B737_3580-9,15.95,2023-11-06,False
9148,D000019,A320_3437,SYSA320-A320_3437-13,38.37,2023-11-06,False
8175,D000019,B767_4568,ECLB767-B767_4568-30,14.7,2023-11-07,False
10294,D000026,A380_6805,SYSA380-A380_6805-34,42.62,2023-11-06,False


In [29]:
df4 = df_degradations[(df_degradations['linked_aero'] == 'B757_0550') | (df_degradations['linked_aero'] == 'B737_3580')]
df4.head()
# so ref_deg can be the same but for different aiplanes. The only real duplicats are reading on the same composant and the date is missing


Unnamed: 0,ref_deg,linked_aero,compo_concerned,usure_nouvelle,measure_day,need_replacement
450,D001793,B757_0550,REAB757-B757_0550-0,14.72,2023-11-02,False
451,D004523,B757_0550,REAB757-B757_0550-1,72.69,2023-11-02,False
452,D003527,B757_0550,SYSB757-B757_0550-2,40.36,2023-11-02,False
453,D006591,B757_0550,ORDB757-B757_0550-3,37.35,2023-11-02,False
454,D000081,B757_0550,AUTB757-B757_0550-4,22.6,2023-11-02,False


In [10]:
skim(df_logs_vols)

In [30]:
df_logs_vols.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 53 entries, 0 to 52
Data columns (total 6 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   ref_vol      53 non-null     object 
 1   aero_linked  53 non-null     object 
 2   jour_vol     53 non-null     object 
 3   time_en_air  53 non-null     float64
 4   sensor_data  53 non-null     object 
 5   etat_voyant  53 non-null     int64  
dtypes: float64(1), int64(1), object(4)
memory usage: 2.6+ KB


In [31]:
# jour_vol isn't a date
df_logs_vols['jour_vol'] = pd.to_datetime(df_logs_vols['jour_vol'], format='%Y-%m-%d', errors='coerce')
df_logs_vols.info()




<class 'pandas.core.frame.DataFrame'>
RangeIndex: 53 entries, 0 to 52
Data columns (total 6 columns):
 #   Column       Non-Null Count  Dtype         
---  ------       --------------  -----         
 0   ref_vol      53 non-null     object        
 1   aero_linked  53 non-null     object        
 2   jour_vol     53 non-null     datetime64[ns]
 3   time_en_air  53 non-null     float64       
 4   sensor_data  53 non-null     object        
 5   etat_voyant  53 non-null     int64         
dtypes: datetime64[ns](1), float64(1), int64(1), object(3)
memory usage: 2.6+ KB


In [34]:
# sensor_data is as json. I will create news columns for each sensor
df_logs_vols.join(pd.json_normalize(df_logs_vols['sensor_data']))



Unnamed: 0,ref_vol,aero_linked,jour_vol,time_en_air,sensor_data,etat_voyant,temp,pressure,vibrations
0,V02208691,B777_5710,2023-11-07,2.7,"{'temp': '-8.8°C', 'pressure': '927.7 hPa', 'v...",0,-8.8°C,927.7 hPa,4.836935382797082 m/s²
1,V03716150,CRJ900_0709,2023-11-07,3.8,"{'temp': '16.9°C', 'pressure': '1014.4 hPa', '...",0,16.9°C,1014.4 hPa,1.083647934711815 m/s²
2,V03712533,A350_4582,2023-11-07,3.7,"{'temp': '29.4°C', 'pressure': '852.2 hPa', 'v...",1,29.4°C,852.2 hPa,2.861985122376307 m/s²
3,V06801291,CRJ700_2483,2023-11-07,5.8,"{'temp': '-11.4°C', 'pressure': '985.2 hPa', '...",0,-11.4°C,985.2 hPa,1.359547843206768 m/s²
4,V01739581,A380_1452,2023-11-07,7.8,"{'temp': '-7.7°C', 'pressure': '921.6 hPa', 'v...",0,-7.7°C,921.6 hPa,3.741090838035594 m/s²
5,V03886779,E170_5343,2023-11-07,4.1,"{'temp': '15.8°C', 'pressure': '1010.8 hPa', '...",0,15.8°C,1010.8 hPa,2.5981147898549133 m/s²
6,V02858442,B767_1669,2023-11-07,8.3,"{'temp': '-6.1°C', 'pressure': '910.4 hPa', 'v...",0,-6.1°C,910.4 hPa,3.9809165461920157 m/s²
7,V08192956,CRJ900_0692,2023-11-07,2.5,"{'temp': '-9.7°C', 'pressure': '969.1 hPa', 'v...",0,-9.7°C,969.1 hPa,2.211716806672661 m/s²
8,V01973109,B767_4568,2023-11-07,1.7,"{'temp': '-6.3°C', 'pressure': '1066.4 hPa', '...",0,-6.3°C,1066.4 hPa,2.268539706213476 m/s²
9,V08332258,CRJ900_3909,2023-11-07,2.7,"{'temp': '-6.6°C', 'pressure': '910.9 hPa', 'v...",0,-6.6°C,910.9 hPa,1.5170070915705463 m/s²
