# Introduction

Après avoir parcouru le fichier, nous allons commencer à le modifier

# Import des modules

In [None]:
import pandas as pd

# Lecture des données

In [None]:
df = pd.read_csv("datas.csv", index_col=0)

# Quelques rappels

### 10 première lignes

In [None]:
df.head(10)

### 10 dernière lignes

In [None]:
df.tail(10)

### Liste des colonnes

In [None]:
df.columns

### Dimensions du tableau

In [None]:
df.shape

### Type des données

In [None]:
df.dtypes

### Décompte des valeurs présentes

In [None]:
df.count()

### Première infos statistiques

In [None]:
df.describe()

### Mémoire occupée par le DataFrame

In [None]:
df.memory_usage() / 1024 / 1024

In [None]:
df.memory_usage().sum() / 1024 / 1024

# Suppression conditionnelle de lignes
Les vols déviés de leurs plans d'origine sont très rares et ne nous intéressent pas ici. on choisit donc de les retirer de notre DataFrame. On utilise pour cela l'extration conditionnelle

In [None]:
df.Diverted.value_counts()

In [None]:
df = df[df.Diverted == 0]

In [None]:
df.shape

### Exercice : faire la même chose avec les vols annulés

In [None]:
df.Cancelled.value_counts()

In [None]:
df = df[df.Cancelled == 0]

In [None]:
df.shape

# Suppression de colonnes
Certaines colonnes sont devenues inutiles du fait de notre extraction conditionnelle précédente

### Exercice : supprimer toutes les colonnes inutiles ainsi que taxiIn et taxiOut

In [None]:
import pandas as pd
df = pd.read_csv("datas.csv", index_col=0)
df = df[df.Cancelled == 0]
df = df[df.Diverted == 0]

In [None]:
df.drop(['Month','Year','Cancelled','Diverted','TaxiIn','TaxiOut','CancellationCode'],axis=1,inplace=True)
df.head()

# Mémoire occupée
On regarde l'évolution de la mémoire occupée par le DataFrame suite à la suppression de plusieurs colonnes

In [None]:
df.memory_usage().sum() / 1024 / 1024

# Optimisation des types de variable
La méthode astype nous permet de changer le dtype d'une variable, ceci afin d'économiser de la mémoire et potentiellement accélérer certaines opérations

### exemple de dayOfMonth

In [1]:
import pandas as pd
df = pd.read_csv("datas.csv", index_col=0)
df = df[df.Cancelled == 0]
df = df[df.Diverted == 0]

In [2]:
df.drop(['Month','Year','Cancelled','Diverted','TaxiIn','TaxiOut','CancellationCode'],axis=1,inplace=True)
df.head()

Unnamed: 0,DayofMonth,DayOfWeek,DepTime,CRSDepTime,ArrTime,CRSArrTime,UniqueCarrier,FlightNum,TailNum,ActualElapsedTime,...,ArrDelay,DepDelay,Origin,Dest,Distance,CarrierDelay,WeatherDelay,NASDelay,SecurityDelay,LateAircraftDelay
Vol_0,8,1,705.0,700,917.0,915,US,560,N405US,132.0,...,2.0,5.0,PBI,DCA,857,0,0,0,0,0
Vol_1,10,3,705.0,700,920.0,915,US,560,N784AU,135.0,...,5.0,5.0,PBI,DCA,857,0,0,0,0,0
Vol_2,12,5,700.0,700,922.0,915,US,560,N421US,142.0,...,7.0,0.0,PBI,DCA,857,0,0,0,0,0
Vol_3,14,7,700.0,700,910.0,915,US,560,N435US,130.0,...,-5.0,0.0,PBI,DCA,857,0,0,0,0,0
Vol_4,16,2,724.0,700,921.0,915,US,560,N426US,117.0,...,6.0,24.0,PBI,DCA,857,0,0,0,0,0


In [3]:
df.DayofMonth.memory_usage() / 1024 / 1024

4.53680419921875

In [4]:
print(df.DayofMonth.min(), df.DayofMonth.max())

1 31


In [5]:
df.DayofMonth = df.DayofMonth.astype('int8')
#changer de type , passer de int64 a int8 ( 2^64)/2 < <-2^64) /2 )

In [6]:
print(df.DayofMonth.head(), df.DayofMonth.tail())

Vol_0     8
Vol_1    10
Vol_2    12
Vol_3    14
Vol_4    16
Name: DayofMonth, dtype: int8 Vol_301455    31
Vol_301456    31
Vol_301457    31
Vol_301458    31
Vol_301459    31
Name: DayofMonth, dtype: int8


In [7]:
df.DayofMonth.memory_usage() / 1024 / 1024

2.551952362060547

### appliquer à DayOfWeek

In [8]:
print(df.DayOfWeek.min(), df.DayOfWeek.max())

1 7


In [9]:
df.DayOfWeek = df.DayOfWeek.astype('int8')
#changer de type , passer de int64 a int8 ( 2^64)/2 < <-2^64) /2 )

In [10]:
print(df.DayOfWeek.head(), df.DayOfWeek.tail())

Vol_0    1
Vol_1    3
Vol_2    5
Vol_3    7
Vol_4    2
Name: DayOfWeek, dtype: int8 Vol_301455    3
Vol_301456    3
Vol_301457    3
Vol_301458    3
Vol_301459    3
Name: DayOfWeek, dtype: int8


### appliquer à DepTime, ArrTime, CRSDepTime et CRSArrTime (attention, il y a un piège !)

In [11]:
print(df.CRSArrTime.min(), df.CRSArrTime.max())

1 2359


In [12]:
df.DepTime = df.DepTime.astype('int16')

In [13]:
print(df.DepTime.head(), df.DepTime.tail())

Vol_0    705
Vol_1    705
Vol_2    700
Vol_3    700
Vol_4    724
Name: DepTime, dtype: int16 Vol_301455    1838
Vol_301456     640
Vol_301457    1222
Vol_301458    1830
Vol_301459    1615
Name: DepTime, dtype: int16


In [14]:
df.ArrTime = df.ArrTime.astype('int16')

In [15]:
df.CRSDepTime = df.CRSDepTime.astype('int16')

In [16]:
df.CRSArrTime = df.CRSArrTime.astype('int16')

In [17]:
df.memory_usage().sum() /1024/1024

41.398338317871094

# Mémoire occupée
On regarde une dernière fois la mémoire occupée par le DataFrame suite aux changements de dtype

In [None]:
df.memory_usage().sum() / 1024 / 1024

# Création de colonnes
On souhaite créer une colonne renseignant, à partir des différents types de retard, le retard total de chaque vol

### Exercice : Chercher comment créer la colonne du retard total (3 méthodes possibles)

### méthode 1

In [None]:
df['TotalDelay'] = df.CarrierDelay+df.NASDelay

In [None]:
df.head()

In [None]:
df.tail

### méthode 2

In [None]:
df['TotalDelay2'] = df[['CarrierDelay','NASDelay','SecurityDelay']].sum(axis=1)

In [None]:
df.head()

In [None]:
df.tail()

### méthode 3

In [18]:
df['TotalDelay'] = [x+y+z+t+u for x, y, z, t, u in zip(df.CarrierDelay,df.NASDelay,df.LateAircraftDelay,df.SecurityDelay,df.WeatherDelay)]

In [None]:
df.head()

In [None]:
df.tail()

### Exercice : Chercher comment créer la colonne caractérisant si l'avion est en retard ou non (3 méthodes possibles)

### méthode 1

In [19]:
df['Delayed'] = [1 if x>0 else 0 for x in df.TotalDelay]

In [21]:
df.Delayed.dtypes

dtype('int64')

### méthode 2

In [20]:
df['Delayed2'] = df.TotalDelay>0

In [22]:
df.Delayed2.dtypes

dtype('bool')

### méthode 3

In [23]:
df['Delayed3'] = df[['CarrierDelay','NASDelay','SecurityDelay','LateAircraftDelay','WeatherDelay']].any(axis=1)

In [24]:
df.Delayed3.dtypes

dtype('bool')

In [25]:
df.tail()

Unnamed: 0,DayofMonth,DayOfWeek,DepTime,CRSDepTime,ArrTime,CRSArrTime,UniqueCarrier,FlightNum,TailNum,ActualElapsedTime,...,Distance,CarrierDelay,WeatherDelay,NASDelay,SecurityDelay,LateAircraftDelay,TotalDelay,Delayed,Delayed2,Delayed3
Vol_301455,31,3,1838,1842,1943,1945,DL,384,N997DL,65.0,...,215,0,0,0,0,0,0,0,False,False
Vol_301456,31,3,640,645,1307,1318,DL,385,N3745B,207.0,...,1569,0,0,0,0,0,0,0,False,False
Vol_301457,31,3,1222,1155,1422,1345,DL,387,N917DL,120.0,...,576,27,0,10,0,0,37,1,True,True
Vol_301458,31,3,1830,1800,2057,2037,DL,389,N988DL,147.0,...,859,0,0,17,0,3,20,1,True,True
Vol_301459,31,3,1615,1615,1755,1804,DL,392,N6700,100.0,...,581,0,0,0,0,0,0,0,False,False


# Supprimer les colonnes inutiles parmi celles que l'on vient de créer
TotalDelay2, TotalDelay3, Delayed2 et Delayed3 sont inutiles car identiques à TotalDelay et Delayed

In [29]:
df.drop(['Delayed2', 'Delayed3'], axis=1, inplace=True)

In [30]:
df.head()

Unnamed: 0,DayofMonth,DayOfWeek,DepTime,CRSDepTime,ArrTime,CRSArrTime,UniqueCarrier,FlightNum,TailNum,ActualElapsedTime,...,Origin,Dest,Distance,CarrierDelay,WeatherDelay,NASDelay,SecurityDelay,LateAircraftDelay,TotalDelay,Delayed
Vol_0,8,1,705,700,917,915,US,560,N405US,132.0,...,PBI,DCA,857,0,0,0,0,0,0,0
Vol_1,10,3,705,700,920,915,US,560,N784AU,135.0,...,PBI,DCA,857,0,0,0,0,0,0,0
Vol_2,12,5,700,700,922,915,US,560,N421US,142.0,...,PBI,DCA,857,0,0,0,0,0,0,0
Vol_3,14,7,700,700,910,915,US,560,N435US,130.0,...,PBI,DCA,857,0,0,0,0,0,0,0
Vol_4,16,2,724,700,921,915,US,560,N426US,117.0,...,PBI,DCA,857,0,0,0,0,0,0,0


# Supprimer les différentes colonnes détaillant le retard
On ne s'intéresse qu'au retard total, on peut donc supprimer les colonnes détaillant les raisons du retard

In [31]:
df.drop(['CarrierDelay', 'WeatherDelay', 'NASDelay', 'SecurityDelay', 'LateAircraftDelay'], axis=1, inplace=True)

In [33]:
df.tail()

Unnamed: 0,DayofMonth,DayOfWeek,DepTime,CRSDepTime,ArrTime,CRSArrTime,UniqueCarrier,FlightNum,TailNum,ActualElapsedTime,CRSElapsedTime,AirTime,ArrDelay,DepDelay,Origin,Dest,Distance,TotalDelay,Delayed
Vol_301455,31,3,1838,1842,1943,1945,DL,384,N997DL,65.0,63.0,37.0,-2.0,-4.0,SAV,ATL,215,0,0
Vol_301456,31,3,640,645,1307,1318,DL,385,N3745B,207.0,213.0,189.0,-11.0,-5.0,PHX,CVG,1569,0,0
Vol_301457,31,3,1222,1155,1422,1345,DL,387,N917DL,120.0,110.0,85.0,37.0,27.0,BWI,ATL,576,37,1
Vol_301458,31,3,1830,1800,2057,2037,DL,389,N988DL,147.0,157.0,113.0,20.0,30.0,BDL,ATL,859,20,1
Vol_301459,31,3,1615,1615,1755,1804,DL,392,N6700,100.0,109.0,80.0,-9.0,0.0,ATL,FLL,581,0,0


# Modifier une colonne
Les colonnes DepTime, CRSDepTime, ArrTime et CRSArrTime sont formatées de la façon suivant : hhmm. Par exemple, un DepTime de 724 correspond à 7h24 le matin. On souhaite modifier cette colonne pour avoir le temps en minutes. Par exemple, 7h24 devient 444.

### Exercice : Créer une colonne DepTime2 qui passe l'information de DepTime en minutes

In [39]:
df['DepTime2'] = [(x//100)*60+x%100 for x in  df.DepTime ]

In [40]:
df[['DepTime', 'DepTime2']].head()

Unnamed: 0,DepTime,DepTime2
Vol_0,705,425
Vol_1,705,425
Vol_2,700,420
Vol_3,700,420
Vol_4,724,444


In [None]:
df[['DepTime', 'DepTime2']].tail()

### Exercice : Supprimer la colonne DepTime devenue obsolète

In [48]:
df.drop(['DepTime'],axis=1, inplace=True)

KeyError: "labels ['DepTime'] not contained in axis"

Unnamed: 0,DayofMonth,DayOfWeek,CRSDepTime,ArrTime,CRSArrTime,UniqueCarrier,FlightNum,TailNum,ActualElapsedTime,CRSElapsedTime,AirTime,ArrDelay,DepDelay,Origin,Dest,Distance,TotalDelay,Delayed,DepTime
Vol_301455,31,3,1842,1943,1945,DL,384,N997DL,65.0,63.0,37.0,-2.0,-4.0,SAV,ATL,215,0,0,1118
Vol_301456,31,3,645,1307,1318,DL,385,N3745B,207.0,213.0,189.0,-11.0,-5.0,PHX,CVG,1569,0,0,400
Vol_301457,31,3,1155,1422,1345,DL,387,N917DL,120.0,110.0,85.0,37.0,27.0,BWI,ATL,576,37,1,742
Vol_301458,31,3,1800,2057,2037,DL,389,N988DL,147.0,157.0,113.0,20.0,30.0,BDL,ATL,859,20,1,1110
Vol_301459,31,3,1615,1755,1804,DL,392,N6700,100.0,109.0,80.0,-9.0,0.0,ATL,FLL,581,0,0,975


### Exercice : Renommer la colonne DepTime2 en DepTime (Je vous laisse chercher !)

In [50]:
df.rename(columns={'DepTime2':'DepTime'},inplace=True)
df.tail()

Unnamed: 0,DayofMonth,DayOfWeek,CRSDepTime,ArrTime,CRSArrTime,UniqueCarrier,FlightNum,TailNum,ActualElapsedTime,CRSElapsedTime,AirTime,ArrDelay,DepDelay,Origin,Dest,Distance,TotalDelay,Delayed,DepTime
Vol_301455,31,3,1842,1943,1945,DL,384,N997DL,65.0,63.0,37.0,-2.0,-4.0,SAV,ATL,215,0,0,1118
Vol_301456,31,3,645,1307,1318,DL,385,N3745B,207.0,213.0,189.0,-11.0,-5.0,PHX,CVG,1569,0,0,400
Vol_301457,31,3,1155,1422,1345,DL,387,N917DL,120.0,110.0,85.0,37.0,27.0,BWI,ATL,576,37,1,742
Vol_301458,31,3,1800,2057,2037,DL,389,N988DL,147.0,157.0,113.0,20.0,30.0,BDL,ATL,859,20,1,1110
Vol_301459,31,3,1615,1755,1804,DL,392,N6700,100.0,109.0,80.0,-9.0,0.0,ATL,FLL,581,0,0,975


### Exercice : Faire transformation pour CRSDepTime, ArrTime et CRSArrTime mais sans passer par une colonne intermédiaire

In [51]:
df['CRSDepTime'] =  [(x//100)*60+x%100 for x in  df.CRSDepTime ]

In [52]:
df['ArrTime'] = [(x//100)*60+x%100 for x in  df.ArrTime ]

In [53]:
df['CRSArrTime'] = [(x//100)*60+x%100 for x in  df.CRSArrTime ]

# Changer l'order des colonnes
La colonne DepTime2 a été créée en bout de DataFrame (tout à droite). Or, c'est celle que nous avons conservé en tant que DepTime suite à la suppression de la DepTime d'origne. On a donc changé l'ordre de nos colonnes

In [54]:
df.head()

Unnamed: 0,DayofMonth,DayOfWeek,CRSDepTime,ArrTime,CRSArrTime,UniqueCarrier,FlightNum,TailNum,ActualElapsedTime,CRSElapsedTime,AirTime,ArrDelay,DepDelay,Origin,Dest,Distance,TotalDelay,Delayed,DepTime
Vol_0,8,1,420,557,555,US,560,N405US,132.0,135.0,117.0,2.0,5.0,PBI,DCA,857,0,0,425
Vol_1,10,3,420,560,555,US,560,N784AU,135.0,135.0,120.0,5.0,5.0,PBI,DCA,857,0,0,425
Vol_2,12,5,420,562,555,US,560,N421US,142.0,135.0,123.0,7.0,0.0,PBI,DCA,857,0,0,420
Vol_3,14,7,420,550,555,US,560,N435US,130.0,135.0,120.0,-5.0,0.0,PBI,DCA,857,0,0,420
Vol_4,16,2,420,561,555,US,560,N426US,117.0,135.0,105.0,6.0,24.0,PBI,DCA,857,0,0,444


### Exercice : réordonner les colonnes pour revenir à l'ordre d'origine

In [None]:
cols=df.colums.totlist()[2:0]+Det

# Synthèse des retards par compagnie
La méthode groupby va nous permettre d'extraire facilement des informations par compagnie : https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.groupby.html

### Création d'un nouveau DataFrame

In [55]:
df_comp = pd.DataFrame(index=df.UniqueCarrier.unique())

In [56]:
df_comp

US
WN
YV
OH
OO
XE
TZ
UA
DL
EV
F9


In [57]:
df_comp.sort_index(inplace=True)

In [58]:
df_comp.head()

AA
AQ
AS
B6
CO


### Première colonne du DataFrame

In [59]:
df_comp['count'] = df.loc[:, ['UniqueCarrier', 'TotalDelay']].groupby('UniqueCarrier').count()

In [60]:
df_comp.head()

Unnamed: 0,count
AA,27349
AQ,1800
AS,6688
B6,5985
CO,12904


### Exercice : Continuer à remplir le DataFrame avec la moyenne, l'écart-type, le minimum, le quantile 25%, le quantile 50%, le quantile 75% et le maximum

In [61]:
df.describe()

Unnamed: 0,DayofMonth,DayOfWeek,CRSDepTime,ArrTime,CRSArrTime,FlightNum,ActualElapsedTime,CRSElapsedTime,AirTime,ArrDelay,DepDelay,Distance,TotalDelay,Delayed,DepTime
count,297324.0,297324.0,297324.0,297324.0,297324.0,297324.0,297324.0,297324.0,297324.0,297324.0,297324.0,297324.0,297324.0,297324.0,297324.0
mean,15.903449,3.746162,808.527169,903.665355,908.01679,2195.485692,125.134967,126.755432,102.644391,6.928381,8.548846,732.252496,10.789654,0.206993,814.659257
std,8.943123,1.987111,276.951876,297.849917,285.798374,2020.351055,71.050381,70.514544,75.888877,33.664616,30.402858,578.786571,31.205019,0.405151,283.953208
min,1.0,1.0,10.0,1.0,1.0,1.0,16.0,19.0,-1416.0,-240.0,-1020.0,24.0,0.0,0.0,1.0
25%,8.0,2.0,570.0,669.0,675.0,573.0,74.0,75.0,54.0,-9.0,-4.0,317.0,0.0,0.0,570.0
50%,16.0,4.0,800.0,916.0,919.0,1467.0,106.0,108.0,84.0,-2.0,0.0,576.0,0.0,0.0,807.0
75%,24.0,5.0,1038.0,1149.0,1144.0,3584.0,155.0,156.0,131.0,10.0,8.0,958.0,0.0,0.0,1050.0
max,31.0,7.0,1439.0,1550.0,1439.0,9501.0,1511.0,640.0,1484.0,1078.0,1070.0,4962.0,1078.0,1.0,1471.0


In [63]:
df_comp['moyenne'] = df.loc[:, ['UniqueCarrier', 'TotalDelay']].groupby('UniqueCarrier').mean()

In [64]:
df_comp['ecartType'] = df.loc[:, ['UniqueCarrier', 'TotalDelay']].groupby('UniqueCarrier').std()

In [65]:
df_comp['minimum'] = df.loc[:, ['UniqueCarrier', 'TotalDelay']].groupby('UniqueCarrier').min()

In [66]:
df_comp['quantile25'] = df.loc[:, ['UniqueCarrier', 'TotalDelay']].groupby('UniqueCarrier').quantile(q=0.25)

In [71]:
df_comp['quantile50'] = df.loc[:, ['UniqueCarrier', 'TotalDelay']].groupby('UniqueCarrier').quantile(0.5)

In [67]:
df_comp['quantile75'] = df.loc[:, ['UniqueCarrier', 'TotalDelay']].groupby('UniqueCarrier').quantile(q=0.75)

In [62]:
df_comp['maximum'] = df.loc[:, ['UniqueCarrier', 'TotalDelay']].groupby('UniqueCarrier').max()

In [75]:
df_comp

Unnamed: 0,count,maximum,moyenne,ecartType,minimum,quantile25,quantile75,quantile50
AA,27349,993,13.237486,36.670329,0,0.0,0.0,0.0
AQ,1800,235,3.396111,13.197027,0,0.0,0.0,0.0
AS,6688,523,7.711872,26.100351,0,0.0,0.0,0.0
B6,5985,653,8.839599,28.645614,0,0.0,0.0,0.0
CO,12904,592,13.964585,35.367171,0,0.0,15.0,0.0
DL,20666,560,7.618891,23.805501,0,0.0,0.0,0.0
EV,11776,1061,12.336872,32.868313,0,0.0,0.0,0.0
F9,3842,471,5.354243,18.497668,0,0.0,0.0,0.0
FL,9894,561,10.901759,29.223487,0,0.0,0.0,0.0
HA,2060,283,1.928155,12.563377,0,0.0,0.0,0.0


# Opérations plus complexes dans un groupby

### Application de la fonction finale à plusieurs colonnes

In [76]:
df.loc[:, ['UniqueCarrier', 'TotalDelay', 'CRSElapsedTime']].groupby('UniqueCarrier').max()

Unnamed: 0_level_0,TotalDelay,CRSElapsedTime
UniqueCarrier,Unnamed: 1_level_1,Unnamed: 2_level_1
AA,993,549.0
AQ,235,354.0
AS,523,414.0
B6,653,400.0
CO,592,640.0
DL,560,579.0
EV,1061,226.0
F9,471,325.0
FL,561,335.0
HA,283,385.0


### grouper selon plusieurs colonnes

In [79]:
df.loc[:, ['Origin', 'Dest', 'TotalDelay']].groupby(['Origin', 'Dest']).max()

Unnamed: 0_level_0,Unnamed: 1_level_0,TotalDelay
Origin,Dest,Unnamed: 2_level_1
ABE,ATL,394
ABE,CLE,60
ABE,CLT,0
ABE,CVG,0
ABE,JFK,0
ABE,ORD,200
ABI,DFW,87
ABQ,AMA,80
ABQ,ATL,98
ABQ,BWI,0


### Utilisation d'une lambda fonction

In [82]:
df.loc[:, ['UniqueCarrier', 'TotalDelay', 'CRSElapsedTime']].groupby('UniqueCarrier').apply(lambda x: 100*(x.TotalDelay / x.CRSElapsedTime).max())

UniqueCarrier
AA     844.827586
AQ     382.758621
AS     462.831858
B6     932.857143
CO     442.307692
DL     708.860759
EV    1583.582090
F9     336.428571
FL     623.333333
HA     282.758621
MQ    1015.942029
NW    1847.169811
OH     983.333333
OO    1320.000000
TZ     237.500000
UA     634.146341
US     659.375000
WN     611.111111
XE     738.333333
YV     603.703704
dtype: float64

### Exercice : Certains retards semblent énormes (+ de 1000% !). Isolez les lignes correspondant à ces vols et observez ce qu'il s'est passé

In [84]:
idmax=df.loc[:, ['UniqueCarrier', 'TotalDelay', 'CRSElapsedTime']].groupby('UniqueCarrier').apply(lambda x: (100*(x.TotalDelay / x.CRSElapsedTime)).idxmax())

In [91]:
df.loc[idmax]

Unnamed: 0,DayofMonth,DayOfWeek,CRSDepTime,ArrTime,CRSArrTime,UniqueCarrier,FlightNum,TailNum,ActualElapsedTime,CRSElapsedTime,AirTime,ArrDelay,DepDelay,Origin,Dest,Distance,TotalDelay,Delayed,DepTime
Vol_235600,13,6,582,1130,640,AA,937,N069AA,75.0,58.0,42.0,490.0,473.0,MCO,MIA,193,490,1,1055
Vol_253989,2,2,600,740,629,AQ,72,N821AL,26.0,29.0,19.0,111.0,114.0,LIH,HNL,102,111,1,714
Vol_261259,27,6,525,1221,698,AS,60,N782AS,105.0,113.0,91.0,523.0,531.0,KTN,SEA,680,523,1,1056
Vol_262694,3,3,1360,643,1430,B6,128,N580JB,77.0,70.0,45.0,653.0,646.0,JFK,BTV,267,653,1,566
Vol_280808,14,7,930,1353,1008,CO,323,N79402,74.0,78.0,41.0,345.0,349.0,IAH,MSY,305,345,1,1279
Vol_297807,25,4,990,129,1009,DL,1226,N907DL,104.0,79.0,89.0,560.0,535.0,CVG,ORD,264,560,1,85
Vol_162756,16,2,360,1548,487,EV,4900,N901EV,58.0,67.0,-1402.0,1061.0,1070.0,MEI,ATL,267,1061,1,1430
Vol_165813,23,2,380,1051,580,F9,420,N912FR,130.0,140.0,113.0,471.0,481.0,SAN,DEN,853,471,1,861
Vol_171396,14,7,420,1071,510,FL,120,N281AT,98.0,90.0,74.0,561.0,553.0,TPA,ATL,406,561,1,973
Vol_178364,27,6,1120,1231,1149,HA,314,N478HA,112.0,29.0,96.0,82.0,-1.0,LIH,HNL,102,82,1,1119


# Sauvegarde des données
la méthode to_csv permet d'écrire le DataFrame modifié au format csv. L'argument index=True précise que l'on souhaite écrire l'index dans le fichier

In [89]:
df.to_csv('datas_cleaned.csv', index=True)