# EDA Script to find patterns in delays, cancellations, or any contributing factors to delays

### Business Question: How can we enhance operational efficiency by identifying patterns in delays, cancellations, or specific contributing factors?

In [1]:
import pandas as pd
import os
import json
import requests

In [2]:
airline_df = pd.read_csv("../dataset/airline.csv")
airline_df.head()

Unnamed: 0,ActualElapsedTime,AirTime,ArrDelay,ArrTime,CRSArrTime,CRSDepTime,CRSElapsedTime,CancellationCode,Cancelled,CarrierDelay,...,Month,NASDelay,Origin,SecurityDelay,TailNum,TaxiIn,TaxiOut,UniqueCarrier,WeatherDelay,Year
0,154.0,122.0,90.0,1850.0,1720,1455,145.0,,0,0.0,...,6,23.0,ORD,0.0,N293AA,8.0,24.0,AA,0.0,2006
1,159.0,125.0,59.0,1703.0,1604,1510,114.0,,0,,...,9,,DTW,,N8921E,19.0,15.0,NW,,1997
2,,71.0,,,1140,1037,63.0,,1,,...,2,,CVG,,N331DL,4.0,15.0,DL,,1995
3,70.0,,80.0,20.0,2300,2100,60.0,,0,,...,2,,MDW,,,,,ML (1),,1991
4,150.0,135.0,5.0,2030.0,2025,1740,165.0,,0,,...,2,,PHL,,N512AU,3.0,12.0,US,,1997


In [3]:
airline_df["Distance"] = airline_df["Distance"] * 1.60934
airline_df.head()

Unnamed: 0,ActualElapsedTime,AirTime,ArrDelay,ArrTime,CRSArrTime,CRSDepTime,CRSElapsedTime,CancellationCode,Cancelled,CarrierDelay,...,Month,NASDelay,Origin,SecurityDelay,TailNum,TaxiIn,TaxiOut,UniqueCarrier,WeatherDelay,Year
0,154.0,122.0,90.0,1850.0,1720,1455,145.0,,0,0.0,...,6,23.0,ORD,0.0,N293AA,8.0,24.0,AA,0.0,2006
1,159.0,125.0,59.0,1703.0,1604,1510,114.0,,0,,...,9,,DTW,,N8921E,19.0,15.0,NW,,1997
2,,71.0,,,1140,1037,63.0,,1,,...,2,,CVG,,N331DL,4.0,15.0,DL,,1995
3,70.0,,80.0,20.0,2300,2100,60.0,,0,,...,2,,MDW,,,,,ML (1),,1991
4,150.0,135.0,5.0,2030.0,2025,1740,165.0,,0,,...,2,,PHL,,N512AU,3.0,12.0,US,,1997


In [4]:
airline_df.shape

(9882798, 29)

### Cleaning

In [5]:
cleaned_df = airline_df[airline_df['Cancelled'] == 0]
cleaned_df.head()

Unnamed: 0,ActualElapsedTime,AirTime,ArrDelay,ArrTime,CRSArrTime,CRSDepTime,CRSElapsedTime,CancellationCode,Cancelled,CarrierDelay,...,Month,NASDelay,Origin,SecurityDelay,TailNum,TaxiIn,TaxiOut,UniqueCarrier,WeatherDelay,Year
0,154.0,122.0,90.0,1850.0,1720,1455,145.0,,0,0.0,...,6,23.0,ORD,0.0,N293AA,8.0,24.0,AA,0.0,2006
1,159.0,125.0,59.0,1703.0,1604,1510,114.0,,0,,...,9,,DTW,,N8921E,19.0,15.0,NW,,1997
3,70.0,,80.0,20.0,2300,2100,60.0,,0,,...,2,,MDW,,,,,ML (1),,1991
4,150.0,135.0,5.0,2030.0,2025,1740,165.0,,0,,...,2,,PHL,,N512AU,3.0,12.0,US,,1997
5,110.0,,10.0,1450.0,1440,1300,100.0,,0,,...,2,,BDL,,,,,CO,,1989


In [6]:
cleaned_df.shape

(9698787, 29)

In [7]:
delay_factors = ["CarrierDelay", "LateAircraftDelay", "NASDelay", "SecurityDelay", "WeatherDelay"]

cleaned_df = cleaned_df.dropna(subset=delay_factors)
cleaned_df.shape

(2690088, 29)

In [8]:
cleaned_df.dropna(subset=['ActualElapsedTime', 'AirTime', 'ArrDelay', 'ArrTime'], inplace=True)

In [9]:
cleaned_df.rename(columns={'DayofMonth':'DayOfMonthDep', 'Month':'MonthDep', 'Year':'YearDep'}, inplace=True)
cleaned_df.head()

Unnamed: 0,ActualElapsedTime,AirTime,ArrDelay,ArrTime,CRSArrTime,CRSDepTime,CRSElapsedTime,CancellationCode,Cancelled,CarrierDelay,...,MonthDep,NASDelay,Origin,SecurityDelay,TailNum,TaxiIn,TaxiOut,UniqueCarrier,WeatherDelay,YearDep
0,154.0,122.0,90.0,1850.0,1720,1455,145.0,,0,0.0,...,6,23.0,ORD,0.0,N293AA,8.0,24.0,AA,0.0,2006
8,293.0,272.0,-1.0,1543.0,1544,750,294.0,,0,0.0,...,5,0.0,LAX,0.0,N788UA,8.0,13.0,UA,0.0,2007
13,121.0,101.0,-6.0,809.0,815,715,120.0,,0,0.0,...,3,0.0,HOU,0.0,N725SW,14.0,6.0,WN,0.0,2007
18,162.0,142.0,5.0,1001.0,956,820,156.0,,0,0.0,...,10,0.0,MEM,0.0,N970SW,9.0,11.0,OO,0.0,2006
19,113.0,92.0,-25.0,1534.0,1559,1345,134.0,,0,0.0,...,8,0.0,ATL,0.0,N17620,11.0,10.0,CO,0.0,2004


In [10]:
cleaned_df["Origin"].value_counts()

ATL    156260
ORD    137708
DFW    119750
LAX     87113
IAH     78388
        ...  
EAU         7
MKG         6
ITH         3
PIR         1
CKB         1
Name: Origin, Length: 320, dtype: int64

In [11]:
cleaned_df["DayOfMonthArr"] = cleaned_df["DayOfMonthDep"]
cleaned_df["MonthArr"] = cleaned_df["MonthDep"]
cleaned_df['YearArr'] = cleaned_df['YearDep']

cleaned_df['CRSDayOfMonthArr'] = cleaned_df["DayOfMonthDep"]
cleaned_df["CRSDayOfMonthDep"] = cleaned_df["DayOfMonthDep"]
cleaned_df["CRSMonthArr"] = cleaned_df["MonthDep"]
cleaned_df["CRSMonthDep"] = cleaned_df["MonthDep"]
cleaned_df['CRSYearArr'] = cleaned_df['YearDep']
cleaned_df['CRSYearDep'] = cleaned_df['YearDep']

In [12]:
cleaned_df = cleaned_df[[
    'DepTime', 
    'ArrTime', 
    'CRSDepTime', 
    'CRSArrTime', 
    'DayOfMonthDep', 
    'MonthDep', 
    'YearDep', 
    'DayOfMonthArr', 
    'MonthArr', 
    'YearArr',
    'CRSDayOfMonthArr', 
    'CRSMonthArr', 
    'CRSYearArr', 
    'CRSDayOfMonthDep', 
    'CRSMonthDep', 
    'CRSYearDep', 
    'Distance', 
    'TaxiIn', 
    'TaxiOut', 
    'UniqueCarrier', 
    'FlightNum', 
    'Origin'
]]

cleaned_df

Unnamed: 0,DepTime,ArrTime,CRSDepTime,CRSArrTime,DayOfMonthDep,MonthDep,YearDep,DayOfMonthArr,MonthArr,YearArr,...,CRSYearArr,CRSDayOfMonthDep,CRSMonthDep,CRSYearDep,Distance,TaxiIn,TaxiOut,UniqueCarrier,FlightNum,Origin
0,1616.0,1850.0,1455,1720,21,6,2006,21,6,2006,...,2006,21,6,2006,1290.69068,8.0,24.0,AA,2337,ORD
8,750.0,1543.0,750,1544,8,5,2007,8,5,2007,...,2007,8,5,2007,3682.16992,8.0,13.0,UA,946,LAX
13,708.0,809.0,715,815,16,3,2007,16,3,2007,...,2007,16,3,2007,1089.52318,14.0,6.0,WN,219,HOU
18,819.0,1001.0,820,956,22,10,2006,22,10,2006,...,2006,22,10,2006,1403.34448,9.0,11.0,OO,6607,MEM
19,1341.0,1534.0,1345,1559,15,8,2004,15,8,2004,...,2004,15,8,2004,1198.95830,11.0,10.0,CO,1160,ATL
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9882765,1317.0,1624.0,1310,1610,12,12,2006,12,12,2006,...,2006,12,12,2006,1153.89678,3.0,13.0,MQ,4324,ORD
9882768,1003.0,1651.0,955,1631,24,10,2007,24,10,2007,...,2007,24,10,2007,2554.02258,10.0,13.0,US,610,PHX
9882770,1401.0,1604.0,1405,1627,2,4,2004,2,4,2004,...,2004,2,4,2004,1987.53490,9.0,10.0,CO,1027,BWI
9882772,738.0,1042.0,745,1048,30,8,2006,30,8,2006,...,2006,30,8,2006,1293.90936,16.0,10.0,UA,1494,RNO


In [13]:
cleaned_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2684691 entries, 0 to 9882796
Data columns (total 22 columns):
 #   Column            Dtype  
---  ------            -----  
 0   DepTime           float64
 1   ArrTime           float64
 2   CRSDepTime        int64  
 3   CRSArrTime        int64  
 4   DayOfMonthDep     int64  
 5   MonthDep          int64  
 6   YearDep           int64  
 7   DayOfMonthArr     int64  
 8   MonthArr          int64  
 9   YearArr           int64  
 10  CRSDayOfMonthArr  int64  
 11  CRSMonthArr       int64  
 12  CRSYearArr        int64  
 13  CRSDayOfMonthDep  int64  
 14  CRSMonthDep       int64  
 15  CRSYearDep        int64  
 16  Distance          float64
 17  TaxiIn            float64
 18  TaxiOut           float64
 19  UniqueCarrier     object 
 20  FlightNum         int64  
 21  Origin            object 
dtypes: float64(5), int64(15), object(2)
memory usage: 471.1+ MB


In [14]:
cleaned_df[['DepTime', 'ArrTime', 'CRSDepTime', 'CRSArrTime']] = cleaned_df[['DepTime', 'ArrTime', 'CRSDepTime', 'CRSArrTime']].astype(int).astype(str)
cleaned_df['DepTime'] = cleaned_df['DepTime'].str.zfill(4)
cleaned_df['ArrTime'] = cleaned_df['ArrTime'].str.zfill(4)
cleaned_df['CRSDepTime'] = cleaned_df['CRSDepTime'].str.zfill(4)
cleaned_df['CRSArrTime'] = cleaned_df['CRSArrTime'].str.zfill(4)

In [15]:
cleaned_df.loc[cleaned_df['CRSArrTime'] == cleaned_df['CRSDepTime']]

Unnamed: 0,DepTime,ArrTime,CRSDepTime,CRSArrTime,DayOfMonthDep,MonthDep,YearDep,DayOfMonthArr,MonthArr,YearArr,...,CRSYearArr,CRSDayOfMonthDep,CRSMonthDep,CRSYearDep,Distance,TaxiIn,TaxiOut,UniqueCarrier,FlightNum,Origin
254,0803,0800,0800,0800,10,2,2007,10,2,2007,...,2007,10,2,2007,411.99104,2.0,7.0,WN,2743,PHX
1431,0719,0716,0720,0720,9,11,2005,9,11,2005,...,2005,9,11,2005,370.14820,4.0,12.0,OH,5085,CVG
10937,1525,1525,1500,1500,2,1,2006,2,1,2006,...,2006,2,1,2006,411.99104,5.0,10.0,WN,2348,PHX
11142,1515,1505,1515,1515,1,6,2006,1,6,2006,...,2006,1,6,2006,461.88058,3.0,5.0,WN,229,BOI
11527,1020,1028,1020,1020,3,9,2004,3,9,2004,...,2004,3,9,2004,457.05256,5.0,8.0,WN,1585,CMH
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9878909,1916,1942,1920,1920,6,4,2005,6,4,2005,...,2005,6,4,2005,370.14820,8.0,28.0,OH,5418,CVG
9880696,0800,0756,0800,0800,3,6,2003,3,6,2003,...,2003,3,6,2003,465.09926,2.0,6.0,WN,785,LBB
9881113,1715,1733,1705,1705,14,10,2004,14,10,2004,...,2004,14,10,2004,368.53886,3.0,31.0,WN,417,DTW
9881158,0710,0706,0715,0715,7,5,2007,7,5,2007,...,2007,7,5,2007,445.78718,3.0,7.0,WN,534,AMA


In [16]:
cleaned_df.drop(cleaned_df[cleaned_df['CRSArrTime'] == cleaned_df['CRSDepTime']].index, inplace=True)

In [17]:
cleaned_df = cleaned_df.drop(cleaned_df[cleaned_df['CRSArrTime'] == cleaned_df['CRSDepTime']].index)

### Converting dates and time

In [18]:
import time
import numpy as np

In [19]:
def adjust_dep_time(DepTime, DayOfMonthDep, MonthDep, YearDep):    
    month_day_dict = {
        1 : 31,
        2 : 28,
        3 : 31,
        4 : 30,
        5 : 31,
        6 : 30,
        7 : 31,
        8 : 31,
        9 : 30,
        10 : 31,
        11 : 30,
        12 : 31
    }            
            
    if int(DepTime) >= 2400:
        DepTime = str(int(DepTime) - 2400)
        if DayOfMonthDep == month_day_dict[MonthDep]:
            DayOfMonthDep = 1
            
            if MonthDep == 12:
                MonthDep = 1
                YearDep += 1
            else:
                MonthDep += 1
        else:
            DayOfMonthDep += 1
    
    return f"{DayOfMonthDep}-{MonthDep}-{YearDep} {DepTime.zfill(4)}"

v_adjust_time = np.vectorize(adjust_dep_time, otypes=[object])

cleaned_df['DepDateTime'] = v_adjust_time(cleaned_df['DepTime'], cleaned_df['DayOfMonthDep'], cleaned_df['MonthDep'], cleaned_df['YearDep'])

In [20]:
def add_extra_time(ArrTime, DepTime):
    ArrTime1=ArrTime
    if int(ArrTime) < int(DepTime):
        ArrTime = str(int(ArrTime) + 2400)
    return ArrTime

v_adjust_time = np.vectorize(add_extra_time, otypes=[object])

cleaned_df['ArrTimeAdded'] = v_adjust_time(cleaned_df['ArrTime'], cleaned_df['DepTime'])

In [21]:
def adjust_arr_time(ArrTime, DepTime, DayOfMonthArr, MonthArr, YearArr):    
    month_day_dict = {
        1 : 31,
        2 : 28,
        3 : 31,
        4 : 30,
        5 : 31,
        6 : 30,
        7 : 31,
        8 : 31,
        9 : 30,
        10 : 31,
        11 : 30,
        12 : 31
    }
        
    if int(ArrTime) >= 2400:
        ArrTime = str(int(ArrTime) - 2400)
        
        if (int(ArrTime) >= 2400):
            return f"ERROR {ArrTime}"
        
        if DayOfMonthArr == month_day_dict[MonthArr]:
            DayOfMonthArr = 1
            
            if MonthArr == 12:
                MonthArr = 1
                YearArr += 1
            else:
                MonthArr += 1
        else:
            DayOfMonthArr += 1
    
    
    return f"{DayOfMonthArr}-{MonthArr}-{YearArr} {ArrTime.zfill(4)}"

v_adjust_time = np.vectorize(adjust_arr_time, otypes=[object])

cleaned_df['ArrDateTime'] = v_adjust_time(cleaned_df['ArrTimeAdded'], cleaned_df['DepTime'], cleaned_df['DayOfMonthArr'], cleaned_df['MonthArr'], cleaned_df['YearArr'])

In [22]:
def adjust_CRS_dep_time(CRSDepTime, CRSDayOfMonthDep, CRSMonthDep, CRSYearDep):    
    
    return f"{CRSDayOfMonthDep}-{CRSMonthDep}-{CRSYearDep} {CRSDepTime.zfill(4)}"

v_adjust_time = np.vectorize(adjust_CRS_dep_time, otypes=[object])

cleaned_df['CRSDepDateTime'] = v_adjust_time(cleaned_df['CRSDepTime'], cleaned_df['CRSDayOfMonthDep'], cleaned_df['CRSMonthDep'], cleaned_df['CRSYearDep'])

In [23]:
def adjust_CRS_arr_time(CRSDepTime, CRSArrTime, CRSMonthDep, CRSDayOfMonthArr, CRSMonthArr, CRSYearArr):    
    month_day_dict = {
        1 : 31,
        2 : 28,
        3 : 31,
        4 : 30,
        5 : 31,
        6 : 30,
        7 : 31,
        8 : 31,
        9 : 30,
        10 : 31,
        11 : 30,
        12 : 31
    }

    if int(CRSDepTime) > int(CRSArrTime):
        
        if CRSDayOfMonthArr >= month_day_dict[CRSMonthDep]: 
            CRSDayOfMonthArr = 1
            
            if CRSMonthArr == 12:
                CRSMonthArr = 1
                CRSYearArr += 1
                
            else:
                CRSMonthArr += 1
        else:
            CRSDayOfMonthArr += 1
    
    return f"{CRSDayOfMonthArr}-{CRSMonthArr}-{CRSYearArr} {CRSArrTime.zfill(4)}"

v_adjust_time = np.vectorize(adjust_CRS_arr_time, otypes=[object])

cleaned_df['CRSArrDateTime'] = v_adjust_time(cleaned_df['CRSDepTime'], cleaned_df['CRSArrTime'], cleaned_df['CRSMonthDep'], cleaned_df['CRSDayOfMonthArr'], cleaned_df['CRSMonthArr'], cleaned_df['CRSYearArr'])

In [24]:
df_to_add = cleaned_df[['DepDateTime', 'ArrDateTime', 'CRSArrDateTime', 'CRSDepDateTime']]

cleaned_df = pd.concat([cleaned_df, df_to_add], axis=1)
cleaned_df

Unnamed: 0,DepTime,ArrTime,CRSDepTime,CRSArrTime,DayOfMonthDep,MonthDep,YearDep,DayOfMonthArr,MonthArr,YearArr,...,Origin,DepDateTime,ArrTimeAdded,ArrDateTime,CRSDepDateTime,CRSArrDateTime,DepDateTime.1,ArrDateTime.1,CRSArrDateTime.1,CRSDepDateTime.1
0,1616,1850,1455,1720,21,6,2006,21,6,2006,...,ORD,21-6-2006 1616,1850,21-6-2006 1850,21-6-2006 1455,21-6-2006 1720,21-6-2006 1616,21-6-2006 1850,21-6-2006 1720,21-6-2006 1455
8,0750,1543,0750,1544,8,5,2007,8,5,2007,...,LAX,8-5-2007 0750,1543,8-5-2007 1543,8-5-2007 0750,8-5-2007 1544,8-5-2007 0750,8-5-2007 1543,8-5-2007 1544,8-5-2007 0750
13,0708,0809,0715,0815,16,3,2007,16,3,2007,...,HOU,16-3-2007 0708,0809,16-3-2007 0809,16-3-2007 0715,16-3-2007 0815,16-3-2007 0708,16-3-2007 0809,16-3-2007 0815,16-3-2007 0715
18,0819,1001,0820,0956,22,10,2006,22,10,2006,...,MEM,22-10-2006 0819,1001,22-10-2006 1001,22-10-2006 0820,22-10-2006 0956,22-10-2006 0819,22-10-2006 1001,22-10-2006 0956,22-10-2006 0820
19,1341,1534,1345,1559,15,8,2004,15,8,2004,...,ATL,15-8-2004 1341,1534,15-8-2004 1534,15-8-2004 1345,15-8-2004 1559,15-8-2004 1341,15-8-2004 1534,15-8-2004 1559,15-8-2004 1345
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9882765,1317,1624,1310,1610,12,12,2006,12,12,2006,...,ORD,12-12-2006 1317,1624,12-12-2006 1624,12-12-2006 1310,12-12-2006 1610,12-12-2006 1317,12-12-2006 1624,12-12-2006 1610,12-12-2006 1310
9882768,1003,1651,0955,1631,24,10,2007,24,10,2007,...,PHX,24-10-2007 1003,1651,24-10-2007 1651,24-10-2007 0955,24-10-2007 1631,24-10-2007 1003,24-10-2007 1651,24-10-2007 1631,24-10-2007 0955
9882770,1401,1604,1405,1627,2,4,2004,2,4,2004,...,BWI,2-4-2004 1401,1604,2-4-2004 1604,2-4-2004 1405,2-4-2004 1627,2-4-2004 1401,2-4-2004 1604,2-4-2004 1627,2-4-2004 1405
9882772,0738,1042,0745,1048,30,8,2006,30,8,2006,...,RNO,30-8-2006 0738,1042,30-8-2006 1042,30-8-2006 0745,30-8-2006 1048,30-8-2006 0738,30-8-2006 1042,30-8-2006 1048,30-8-2006 0745


In [25]:
cleaned_df.drop(columns=[
    'ArrTime',
    'DepTime',
    'DayOfMonthDep',
    'DayOfMonthArr',
    'MonthDep',
    'MonthArr',
    'YearDep',
    'YearArr',
    'CRSDayOfMonthArr',
    'CRSDayOfMonthDep',
    'CRSMonthArr',
    'CRSMonthDep',
    'CRSYearArr',
    'CRSYearDep'
], inplace = True)

cleaned_df

Unnamed: 0,CRSDepTime,CRSArrTime,Distance,TaxiIn,TaxiOut,UniqueCarrier,FlightNum,Origin,DepDateTime,ArrTimeAdded,ArrDateTime,CRSDepDateTime,CRSArrDateTime,DepDateTime.1,ArrDateTime.1,CRSArrDateTime.1,CRSDepDateTime.1
0,1455,1720,1290.69068,8.0,24.0,AA,2337,ORD,21-6-2006 1616,1850,21-6-2006 1850,21-6-2006 1455,21-6-2006 1720,21-6-2006 1616,21-6-2006 1850,21-6-2006 1720,21-6-2006 1455
8,0750,1544,3682.16992,8.0,13.0,UA,946,LAX,8-5-2007 0750,1543,8-5-2007 1543,8-5-2007 0750,8-5-2007 1544,8-5-2007 0750,8-5-2007 1543,8-5-2007 1544,8-5-2007 0750
13,0715,0815,1089.52318,14.0,6.0,WN,219,HOU,16-3-2007 0708,0809,16-3-2007 0809,16-3-2007 0715,16-3-2007 0815,16-3-2007 0708,16-3-2007 0809,16-3-2007 0815,16-3-2007 0715
18,0820,0956,1403.34448,9.0,11.0,OO,6607,MEM,22-10-2006 0819,1001,22-10-2006 1001,22-10-2006 0820,22-10-2006 0956,22-10-2006 0819,22-10-2006 1001,22-10-2006 0956,22-10-2006 0820
19,1345,1559,1198.95830,11.0,10.0,CO,1160,ATL,15-8-2004 1341,1534,15-8-2004 1534,15-8-2004 1345,15-8-2004 1559,15-8-2004 1341,15-8-2004 1534,15-8-2004 1559,15-8-2004 1345
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9882765,1310,1610,1153.89678,3.0,13.0,MQ,4324,ORD,12-12-2006 1317,1624,12-12-2006 1624,12-12-2006 1310,12-12-2006 1610,12-12-2006 1317,12-12-2006 1624,12-12-2006 1610,12-12-2006 1310
9882768,0955,1631,2554.02258,10.0,13.0,US,610,PHX,24-10-2007 1003,1651,24-10-2007 1651,24-10-2007 0955,24-10-2007 1631,24-10-2007 1003,24-10-2007 1651,24-10-2007 1631,24-10-2007 0955
9882770,1405,1627,1987.53490,9.0,10.0,CO,1027,BWI,2-4-2004 1401,1604,2-4-2004 1604,2-4-2004 1405,2-4-2004 1627,2-4-2004 1401,2-4-2004 1604,2-4-2004 1627,2-4-2004 1405
9882772,0745,1048,1293.90936,16.0,10.0,UA,1494,RNO,30-8-2006 0738,1042,30-8-2006 1042,30-8-2006 0745,30-8-2006 1048,30-8-2006 0738,30-8-2006 1042,30-8-2006 1048,30-8-2006 0745


In [26]:
cleaned_df.to_csv("../dataset/cleaned_airlines_with_delays.csv", index=False)

### Sentiment Analysis of WMO Codes (00 - 99)

In [27]:
with open("../dataset/mwo_codes.json", "r") as file:
    wmo_codes = json.load(file)

print (wmo_codes)

{'00': 'Cloud development not observed or not observable (Characteristic change of the state of sky during the past hour)', '01': 'Clouds generally deissolving or becoming less developed (Characteristic change of the state of sky during the past hour)', '02': 'State of sky on the whole unchanged (Characteristic change of the state of sky during the past hour)', '03': 'Clouds generally forming or developing (Characteristic change of the state of sky during the past hour)', '04': 'Visibility reduced by smoke, e.g. veld or forest fires, industrial smoke or volcanic ashes', '05': 'Haze', '06': 'Widespread dust in suspension in the air, not raised by wind at or near the station at the time of observation', '07': 'Dust or sand raised by wind at or near the station at the time of observation, but no well-developed dust whirl(s) or sand whirl(s), and no duststorm or sandstorm seen', '08': 'well developed dust whirl(s) or sand whirl(s) seen at or near the station during the preceding hour or at

#### Start analysis

In [28]:
from transformers import pipeline
classifier = pipeline("zero-shot-classification")
candidate_labels = ["safe for flight departure", "dangerous for flight departure"]

No model was supplied, defaulted to facebook/bart-large-mnli and revision c626438 (https://huggingface.co/facebook/bart-large-mnli).
Using a pipeline without specifying a model name and revision in production is not recommended.


In [29]:
safe_codes = []
dangerous_codes = []

for code, description in wmo_codes.items():
    classification = classifier([description], candidate_labels=candidate_labels)
    predicted_label = classification[0].get("labels")[0]

    if predicted_label == "safe for flight departure":
        safe_codes.append(int(code))
    else:
        dangerous_codes.append(int(code))

wmo_classified_codes = {
    "safe_codes": safe_codes,
    "dangerous_codes": dangerous_codes
}

print (wmo_classified_codes)

{'safe_codes': [0, 1, 2, 3, 7, 11, 12, 13, 14, 16, 20, 21, 22, 30, 50, 51, 52, 53, 54, 55, 60, 61, 62, 63, 64, 70, 71, 72, 73, 74, 76, 77, 78, 80, 89, 90, 95], 'dangerous_codes': [4, 5, 6, 8, 9, 10, 15, 17, 18, 19, 23, 24, 25, 26, 27, 28, 29, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 56, 57, 58, 59, 65, 66, 67, 68, 69, 75, 79, 81, 82, 83, 84, 85, 86, 87, 88, 91, 92, 93, 94, 96, 97, 98, 99]}


In [30]:
with open ("../dataset/wmo_classfied_codes", "w") as output_file:
    json.dump(wmo_classified_codes, output_file, indent=4)

### Loading of endpoints

In [72]:
airport_coord_file = open("../dataset/airport_coord.json")

coord_data = json.load(airport_coord_file)

# for airport in cleaned_df["Origin"]:
#     for code, info in coord_data.items():
#         if code == airport:
#             cleaned_df["Lat"] = info.get("latitude")
#             cleaned_df["Long"] = info.get("longitude")
#             cleaned_df["Region"] = info.get("region")

def add_airport_info(cleaned_df, coord_data):
    airport_info_by_code = {code: info for code, info in coord_data.items()}

    if pd.api.types.is_string_dtype(cleaned_df['Origin']):
        mask = cleaned_df['Origin'].isin(airport_info_by_code.keys())
        def get_airport_info(airport):
            info = airport_info_by_code.get(airport)
            return info if info is not None else {'latitude': None, 'longitude': None, 'region': None}
        cleaned_df.loc[mask, 'Lat'] = cleaned_df[mask]['Origin'].apply(get_airport_info).apply(lambda x: x['latitude'])
        cleaned_df.loc[mask, 'Long'] = cleaned_df[mask]['Origin'].apply(get_airport_info).apply(lambda x: x['longitude'])
        cleaned_df.loc[mask, 'Region'] = cleaned_df[mask]['Origin'].apply(get_airport_info).apply(lambda x: x['region'])
    else:
        for i, airport in cleaned_df['Origin'].items():
          if airport in airport_info_by_code:
            cleaned_df.loc[i, 'Lat'] = airport_info_by_code[airport].get('latitude')
            cleaned_df.loc[i, 'Long'] = airport_info_by_code[airport].get('longitude')
        cleaned_df.loc[i, 'Region'] = airport_info_by_code[airport].get('region')

    return cleaned_df

cleaned_df = add_airport_info(cleaned_df.copy(), coord_data.copy())
print(cleaned_df)

airport_coord_file.close()

        CRSDepTime CRSArrTime    Distance  TaxiIn  TaxiOut UniqueCarrier  \
0             1455       1720  1290.69068     8.0     24.0            AA   
8             0750       1544  3682.16992     8.0     13.0            UA   
13            0715       0815  1089.52318    14.0      6.0            WN   
18            0820       0956  1403.34448     9.0     11.0            OO   
19            1345       1559  1198.95830    11.0     10.0            CO   
...            ...        ...         ...     ...      ...           ...   
9882765       1310       1610  1153.89678     3.0     13.0            MQ   
9882768       0955       1631  2554.02258    10.0     13.0            US   
9882770       1405       1627  1987.53490     9.0     10.0            CO   
9882772       0745       1048  1293.90936    16.0     10.0            UA   
9882796       2015       2345  1647.96416     6.0     27.0            F9   

         FlightNum Origin      DepDateTime ArrTimeAdded      ArrDateTime  \
0          

In [74]:
cleaned_df["Lat"].isnull().sum()

0

In [75]:
cleaned_df.to_csv("../dataset/delayed_flights_left_weatherData.csv")