In [1]:
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import re
import matplotlib.pyplot as plt # plotting
import numpy as np # linear algebra
import os # accessing directory structure
import csv
import math
import seaborn as sns


## Categorising data

<br>1. Making carrier df
<br>2. Making airport df
<br>3. Combining all df

In [3]:
df = pd.read_csv("flights_without_cancellation_data2.csv")

In [4]:
carrier_df = pd.read_csv("data/different_carriers.csv")

In [5]:
airport_df = pd.read_csv("data/airports_in_respective_states.csv")

In [13]:
airport_df.head()

Unnamed: 0,airport,airport_name,city,state
0,YAK,Yakutat Airport,Yakutat,AK
1,GST,Gustavus Airport,Gustavus,AK
2,AKN,King Salmon Airport,King Salmon,AK
3,DLG,Dillingham Airport,Dillingham,AK
4,WRG,Wrangell Airport,Wrangell,AK


In [7]:
df.drop(columns=['Cancelled', 'CancellationCode', 'DayOfWeek'], inplace = True)

In [8]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9674807 entries, 0 to 9674806
Data columns (total 24 columns):
 #   Column             Dtype  
---  ------             -----  
 0   ActualElapsedTime  float64
 1   AirTime            float64
 2   ArrDelay           float64
 3   ArrTime            float64
 4   CRSArrTime         int64  
 5   CRSDepTime         int64  
 6   CRSElapsedTime     float64
 7   CarrierDelay       float64
 8   DepDelay           float64
 9   DepTime            float64
 10  Dest               object 
 11  Distance           float64
 12  Diverted           int64  
 13  FlightNum          int64  
 14  LateAircraftDelay  float64
 15  NASDelay           float64
 16  Origin             object 
 17  SecurityDelay      float64
 18  TailNum            object 
 19  TaxiIn             float64
 20  TaxiOut            float64
 21  UniqueCarrier      object 
 22  WeatherDelay       float64
 23  Date               object 
dtypes: float64(15), int64(4), object(5)
memory usage: 

In [14]:
# Merge DataFrame A with DataFrame B for origin airport and rearrange columns
merged_on_origin = pd.merge(df, airport_df, left_on='Origin', right_on='airport', how='left')
merged_on_origin.drop(columns=['airport', 'city'], inplace = True)
merged_on_origin.rename(columns={'airport_name': 'OriginAirportName', 'state': 'OriginState'}, inplace=True)

In [15]:
# Check columns
merged_on_origin.columns.tolist()

['ActualElapsedTime',
 'AirTime',
 'ArrDelay',
 'ArrTime',
 'CRSArrTime',
 'CRSDepTime',
 'CRSElapsedTime',
 'CarrierDelay',
 'DepDelay',
 'DepTime',
 'Dest',
 'Distance',
 'Diverted',
 'FlightNum',
 'LateAircraftDelay',
 'NASDelay',
 'Origin',
 'SecurityDelay',
 'TailNum',
 'TaxiIn',
 'TaxiOut',
 'UniqueCarrier',
 'WeatherDelay',
 'Date',
 'OriginAirportName',
 'OriginState']

In [16]:
# Merge DataFrame A with DataFrame B for destination airport and rearrange columns
merged_on_dest = pd.merge(merged_on_origin, airport_df, left_on='Dest', right_on='airport', how='left')
merged_on_dest.drop(columns=['airport', 'city'], inplace = True)
merged_on_dest.rename(columns={'airport_name': 'DestAirportName', 'state': 'DestState'}, inplace=True)

In [17]:
# Check columns
merged_on_dest.columns.tolist()

['ActualElapsedTime',
 'AirTime',
 'ArrDelay',
 'ArrTime',
 'CRSArrTime',
 'CRSDepTime',
 'CRSElapsedTime',
 'CarrierDelay',
 'DepDelay',
 'DepTime',
 'Dest',
 'Distance',
 'Diverted',
 'FlightNum',
 'LateAircraftDelay',
 'NASDelay',
 'Origin',
 'SecurityDelay',
 'TailNum',
 'TaxiIn',
 'TaxiOut',
 'UniqueCarrier',
 'WeatherDelay',
 'Date',
 'OriginAirportName',
 'OriginState',
 'DestAirportName',
 'DestState']

In [18]:
desired_order = ['ActualElapsedTime',
 'AirTime',
 'ArrDelay',
 'DepDelay',
 'ArrTime',
 'DepTime',
 'CRSArrTime',
 'CRSDepTime',
 'CRSElapsedTime',
 'CarrierDelay',
 'DepDelay',
 'LateAircraftDelay',
 'SecurityDelay',
 'WeatherDelay',
 'NASDelay',
 'Distance',
 'Diverted',
 'Origin',
 'OriginAirportName',
 'OriginState',
 'Dest',
 'DestAirportName',
 'DestState',
 'TaxiIn',
 'TaxiOut',
 'TailNum',
 'UniqueCarrier',
 'FlightNum',
 'Date',
]

merged_on_dest = merged_on_dest[desired_order]

In [19]:
merged_on_dest

Unnamed: 0,ActualElapsedTime,AirTime,ArrDelay,DepDelay,ArrTime,DepTime,CRSArrTime,CRSDepTime,CRSElapsedTime,CarrierDelay,...,OriginState,Dest,DestAirportName,DestState,TaxiIn,TaxiOut,TailNum,UniqueCarrier,FlightNum,Date
0,19.0,,-2.0,0.0,1404.0,1345.0,1406,1345,21.0,,...,MT,BZN,Bozeman Yellowstone International,MT,,,,DL,1807,01/01/1988
1,49.0,,-6.0,0.0,1359.0,1310.0,1405,1310,55.0,,...,PA,ABE,Lehigh Valley International,PA,,,,US,312,01/01/1988
2,26.0,,2.0,0.0,1536.0,1510.0,1534,1510,24.0,,...,KY,SDF,Louisville Muhammad Ali International,KY,,,,DL,1060,01/01/1988
3,31.0,,-2.0,-3.0,1253.0,1222.0,1255,1225,30.0,,...,NY,SYR,Syracuse Hancock International,NY,,,,PI,874,01/01/1988
4,161.0,,19.0,0.0,1051.0,910.0,1032,910,142.0,,...,NY,STL,St Louis Lambert International,MO,,,,TW,245,01/01/1988
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9674802,87.0,63.0,-5.0,-8.0,749.0,622.0,754,630,84.0,,...,PA,CLE,Cleveland-Hopkins International,OH,15.0,9.0,N13123,XE,2651,31/12/2008
9674803,87.0,47.0,48.0,34.0,2011.0,1844.0,1923,1810,73.0,0.0,...,NY,BWI,Baltimore/Washington International Thurgood M...,MD,7.0,33.0,N952AT,FL,402,31/12/2008
9674804,141.0,116.0,-18.0,-1.0,1022.0,701.0,1040,702,158.0,,...,WA,DEN,Denver International,CO,10.0,15.0,N597UA,UA,386,31/12/2008
9674805,189.0,173.0,-12.0,4.0,1508.0,1059.0,1520,1055,205.0,,...,AZ,MDW,Chicago Midway International,IL,5.0,11.0,N476WN,WN,733,31/12/2008


In [20]:
# Merge on carrier names
merged_on_carrier = pd.merge(merged_on_dest, carrier_df, left_on='UniqueCarrier', right_on='carrier', how='left')
merged_on_carrier.drop(columns=['carrier'], inplace = True)
merged_on_carrier.rename(columns={'carrier_name': 'CarrierName'}, inplace=True)


In [21]:
# Check columns
merged_on_carrier.columns.tolist()

['ActualElapsedTime',
 'AirTime',
 'ArrDelay',
 'DepDelay',
 'ArrTime',
 'DepTime',
 'CRSArrTime',
 'CRSDepTime',
 'CRSElapsedTime',
 'CarrierDelay',
 'DepDelay',
 'LateAircraftDelay',
 'SecurityDelay',
 'WeatherDelay',
 'NASDelay',
 'Distance',
 'Diverted',
 'Origin',
 'OriginAirportName',
 'OriginState',
 'Dest',
 'DestAirportName',
 'DestState',
 'TaxiIn',
 'TaxiOut',
 'TailNum',
 'UniqueCarrier',
 'FlightNum',
 'Date',
 'CarrierName']

In [23]:
desired_order = ['ActualElapsedTime',
 'AirTime',
 'ArrDelay',
 'DepDelay',
 'ArrTime',
 'DepTime',
 'CRSArrTime',
 'CRSDepTime',
 'CRSElapsedTime',
 'CarrierDelay',
 'DepDelay',
 'LateAircraftDelay',
 'SecurityDelay',
 'WeatherDelay',
 'NASDelay',
 'Distance',
 'Diverted',
 'Origin',
 'OriginAirportName',
 'OriginState',
 'Dest',
 'DestAirportName',
 'DestState',
 'TaxiIn',
 'TaxiOut',
 'TailNum',
 'UniqueCarrier',
 'CarrierName',
 'FlightNum',
 'Date',
]

merged_on_carrier = merged_on_carrier[desired_order]
merged_on_carrier

Unnamed: 0,ActualElapsedTime,AirTime,ArrDelay,DepDelay,DepDelay.1,DepDelay.2,DepDelay.3,ArrTime,DepTime,CRSArrTime,...,Dest,DestAirportName,DestState,TaxiIn,TaxiOut,TailNum,UniqueCarrier,CarrierName,FlightNum,Date
0,19.0,,-2.0,0.0,0.0,0.0,0.0,1404.0,1345.0,1406,...,BZN,Bozeman Yellowstone International,MT,,,,DL,Delta Air Lines Inc.,1807,01/01/1988
1,49.0,,-6.0,0.0,0.0,0.0,0.0,1359.0,1310.0,1405,...,ABE,Lehigh Valley International,PA,,,,US,US Airways Inc.,312,01/01/1988
2,26.0,,2.0,0.0,0.0,0.0,0.0,1536.0,1510.0,1534,...,SDF,Louisville Muhammad Ali International,KY,,,,DL,Delta Air Lines Inc.,1060,01/01/1988
3,31.0,,-2.0,-3.0,-3.0,-3.0,-3.0,1253.0,1222.0,1255,...,SYR,Syracuse Hancock International,NY,,,,PI,,874,01/01/1988
4,161.0,,19.0,0.0,0.0,0.0,0.0,1051.0,910.0,1032,...,STL,St Louis Lambert International,MO,,,,TW,,245,01/01/1988
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9674802,87.0,63.0,-5.0,-8.0,-8.0,-8.0,-8.0,749.0,622.0,754,...,CLE,Cleveland-Hopkins International,OH,15.0,9.0,N13123,XE,,2651,31/12/2008
9674803,87.0,47.0,48.0,34.0,34.0,34.0,34.0,2011.0,1844.0,1923,...,BWI,Baltimore/Washington International Thurgood M...,MD,7.0,33.0,N952AT,FL,AirTran Airways Corporation,402,31/12/2008
9674804,141.0,116.0,-18.0,-1.0,-1.0,-1.0,-1.0,1022.0,701.0,1040,...,DEN,Denver International,CO,10.0,15.0,N597UA,UA,United Air Lines Inc.,386,31/12/2008
9674805,189.0,173.0,-12.0,4.0,4.0,4.0,4.0,1508.0,1059.0,1520,...,MDW,Chicago Midway International,IL,5.0,11.0,N476WN,WN,Southwest Airlines Co.,733,31/12/2008


In [24]:
merged_on_carrier.to_csv("data/categorised_NONcancelled_flights_incomplete.csv", index=False)
merged_on_carrier.shape

(9674807, 36)

In [25]:
merged_on_carrier.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 9674807 entries, 0 to 9674806
Data columns (total 36 columns):
 #   Column             Dtype  
---  ------             -----  
 0   ActualElapsedTime  float64
 1   AirTime            float64
 2   ArrDelay           float64
 3   DepDelay           float64
 4   DepDelay           float64
 5   DepDelay           float64
 6   DepDelay           float64
 7   ArrTime            float64
 8   DepTime            float64
 9   CRSArrTime         int64  
 10  CRSDepTime         int64  
 11  CRSElapsedTime     float64
 12  CarrierDelay       float64
 13  DepDelay           float64
 14  DepDelay           float64
 15  DepDelay           float64
 16  DepDelay           float64
 17  LateAircraftDelay  float64
 18  SecurityDelay      float64
 19  WeatherDelay       float64
 20  NASDelay           float64
 21  Distance           float64
 22  Diverted           int64  
 23  Origin             object 
 24  OriginAirportName  object 
 25  OriginState       