# Introduction
This notebook explores U.S. flight on-time performance data for January 2018.

# Set up Environment

In [1]:
import warnings

import pandas as pd

# Lift the column cap so wide DataFrames are displayed without truncation.
pd.options.display.max_columns = None

# Suppress every warning for the remainder of the session.
warnings.filterwarnings('ignore')

# Load Data

In [2]:
# Folder holding the raw monthly archives, relative to the notebook.
data_folder = '../data/raw/training_data/'

# January 2018 archive; read_csv is handed the .zip path directly.
raw_file_name = 'On_Time_Reporting_Carrier_On_Time_Performance_(1987_present)_2018_1.zip'
df = pd.read_csv(data_folder + raw_file_name)

# Process Data

In [3]:
# Preview the first three rows of the raw frame to check how the file was parsed.
df.head(3)

Unnamed: 0,Year,Quarter,Month,DayofMonth,DayOfWeek,FlightDate,Reporting_Airline,DOT_ID_Reporting_Airline,IATA_CODE_Reporting_Airline,Tail_Number,Flight_Number_Reporting_Airline,OriginAirportID,OriginAirportSeqID,OriginCityMarketID,Origin,OriginCityName,OriginState,OriginStateFips,OriginStateName,OriginWac,DestAirportID,DestAirportSeqID,DestCityMarketID,Dest,DestCityName,DestState,DestStateFips,DestStateName,DestWac,CRSDepTime,DepTime,DepDelay,DepDelayMinutes,DepDel15,DepartureDelayGroups,DepTimeBlk,TaxiOut,WheelsOff,WheelsOn,TaxiIn,CRSArrTime,ArrTime,ArrDelay,ArrDelayMinutes,ArrDel15,ArrivalDelayGroups,ArrTimeBlk,Cancelled,CancellationCode,Diverted,CRSElapsedTime,ActualElapsedTime,AirTime,Flights,Distance,DistanceGroup,CarrierDelay,WeatherDelay,NASDelay,SecurityDelay,LateAircraftDelay,FirstDepTime,TotalAddGTime,LongestAddGTime,DivAirportLandings,DivReachedDest,DivActualElapsedTime,DivArrDelay,DivDistance,Div1Airport,Div1AirportID,Div1AirportSeqID,Div1WheelsOn,Div1TotalGTime,Div1LongestGTime,Div1WheelsOff,Div1TailNum,Div2Airport,Div2AirportID,Div2AirportSeqID,Div2WheelsOn,Div2TotalGTime,Div2LongestGTime,Div2WheelsOff,Div2TailNum,Div3Airport,Div3AirportID,Div3AirportSeqID,Div3WheelsOn,Div3TotalGTime,Div3LongestGTime,Div3WheelsOff,Div3TailNum,Div4Airport,Div4AirportID,Div4AirportSeqID,Div4WheelsOn,Div4TotalGTime,Div4LongestGTime,Div4WheelsOff,Div4TailNum,Div5Airport,Div5AirportID,Div5AirportSeqID,Div5WheelsOn,Div5TotalGTime,Div5LongestGTime,Div5WheelsOff,Div5TailNum,Unnamed: 109
0,2018,1,1,27,6,2018-01-27,UA,19977,UA,N26232,369,11697,1169706,32467,FLL,"Fort Lauderdale, FL",FL,12,Florida,33,12266,1226603,31453,IAH,"Houston, TX",TX,48,Texas,74,615,602.0,-13.0,0.0,0.0,-1.0,0600-0659,19.0,621.0,749.0,7.0,808,756.0,-12.0,0.0,0.0,-1.0,0800-0859,0.0,,0.0,173.0,174.0,148.0,1.0,966.0,4,,,,,,,,,0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
1,2018,1,1,27,6,2018-01-27,UA,19977,UA,N477UA,368,14747,1474703,30559,SEA,"Seattle, WA",WA,53,Washington,93,14771,1477104,32457,SFO,"San Francisco, CA",CA,6,California,91,618,614.0,-4.0,0.0,0.0,-1.0,0600-0659,16.0,630.0,808.0,5.0,831,813.0,-18.0,0.0,0.0,-2.0,0800-0859,0.0,,0.0,133.0,119.0,98.0,1.0,679.0,3,,,,,,,,,0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
2,2018,1,1,27,6,2018-01-27,UA,19977,UA,N13720,367,11278,1127805,30852,DCA,"Washington, DC",VA,51,Virginia,38,12266,1226603,31453,IAH,"Houston, TX",TX,48,Texas,74,830,828.0,-2.0,0.0,0.0,-1.0,0800-0859,17.0,845.0,1055.0,13.0,1107,1108.0,1.0,1.0,0.0,0.0,1100-1159,0.0,,0.0,217.0,220.0,190.0,1.0,1208.0,5,,,,,,,,,0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,


In [4]:
# Print a summary of the raw frame: row count, column count, dtypes and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 570118 entries, 0 to 570117
Columns: 110 entries, Year to Unnamed: 109
dtypes: float64(70), int64(21), object(19)
memory usage: 478.5+ MB


In [5]:
# Boolean masks marking the rows flagged as cancelled and as diverted.
is_cancelled = df['Cancelled'] == 1
is_diverted = df['Diverted'] == 1

# Summing a boolean mask counts its True entries. This counts rows directly
# instead of counting non-null values of an unrelated column.
num_cancelled_flights = is_cancelled.sum()
num_diverted_flights = is_diverted.sum()
print('Number of cancelled flights in Jan 2018: ', num_cancelled_flights)
print('Number of diverted flights in Jan 2018: ', num_diverted_flights)

# OR the masks so that a row flagged as both cancelled and diverted is counted
# once; adding the two counts would count such a row twice.
total_rows = df.shape[0]
num_cancelled_or_diverted = (is_cancelled | is_diverted).sum()
percentage = round(num_cancelled_or_diverted / total_rows * 100, 2)
print('Percentage of cancelled or diverted flights: ', percentage, '%')

Number of cancelled flights in Jan 2018:  17169
Number of diverted flights in Jan 2018:  1249
Percentage of cancelled or diverted flights:  3.23 %


In [6]:
# A "normal" flight is one that was neither cancelled nor diverted.
not_cancelled = df['Cancelled'] == 0
not_diverted = df['Diverted'] == 0
normal_flights = not_cancelled & not_diverted

# Filter first, then copy only the selected rows. This avoids duplicating the
# entire raw frame just to slice it, and makes `flights` an independent frame
# so later changes to it cannot touch `df`.
flights = df.loc[normal_flights].copy()

Next, drop the columns related to cancellation or diversion, along with other irrelevant columns.

In [7]:
# Preview the first three rows of the filtered frame before any columns are dropped.
flights.head(3)

Unnamed: 0,Year,Quarter,Month,DayofMonth,DayOfWeek,FlightDate,Reporting_Airline,DOT_ID_Reporting_Airline,IATA_CODE_Reporting_Airline,Tail_Number,Flight_Number_Reporting_Airline,OriginAirportID,OriginAirportSeqID,OriginCityMarketID,Origin,OriginCityName,OriginState,OriginStateFips,OriginStateName,OriginWac,DestAirportID,DestAirportSeqID,DestCityMarketID,Dest,DestCityName,DestState,DestStateFips,DestStateName,DestWac,CRSDepTime,DepTime,DepDelay,DepDelayMinutes,DepDel15,DepartureDelayGroups,DepTimeBlk,TaxiOut,WheelsOff,WheelsOn,TaxiIn,CRSArrTime,ArrTime,ArrDelay,ArrDelayMinutes,ArrDel15,ArrivalDelayGroups,ArrTimeBlk,Cancelled,CancellationCode,Diverted,CRSElapsedTime,ActualElapsedTime,AirTime,Flights,Distance,DistanceGroup,CarrierDelay,WeatherDelay,NASDelay,SecurityDelay,LateAircraftDelay,FirstDepTime,TotalAddGTime,LongestAddGTime,DivAirportLandings,DivReachedDest,DivActualElapsedTime,DivArrDelay,DivDistance,Div1Airport,Div1AirportID,Div1AirportSeqID,Div1WheelsOn,Div1TotalGTime,Div1LongestGTime,Div1WheelsOff,Div1TailNum,Div2Airport,Div2AirportID,Div2AirportSeqID,Div2WheelsOn,Div2TotalGTime,Div2LongestGTime,Div2WheelsOff,Div2TailNum,Div3Airport,Div3AirportID,Div3AirportSeqID,Div3WheelsOn,Div3TotalGTime,Div3LongestGTime,Div3WheelsOff,Div3TailNum,Div4Airport,Div4AirportID,Div4AirportSeqID,Div4WheelsOn,Div4TotalGTime,Div4LongestGTime,Div4WheelsOff,Div4TailNum,Div5Airport,Div5AirportID,Div5AirportSeqID,Div5WheelsOn,Div5TotalGTime,Div5LongestGTime,Div5WheelsOff,Div5TailNum,Unnamed: 109
0,2018,1,1,27,6,2018-01-27,UA,19977,UA,N26232,369,11697,1169706,32467,FLL,"Fort Lauderdale, FL",FL,12,Florida,33,12266,1226603,31453,IAH,"Houston, TX",TX,48,Texas,74,615,602.0,-13.0,0.0,0.0,-1.0,0600-0659,19.0,621.0,749.0,7.0,808,756.0,-12.0,0.0,0.0,-1.0,0800-0859,0.0,,0.0,173.0,174.0,148.0,1.0,966.0,4,,,,,,,,,0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
1,2018,1,1,27,6,2018-01-27,UA,19977,UA,N477UA,368,14747,1474703,30559,SEA,"Seattle, WA",WA,53,Washington,93,14771,1477104,32457,SFO,"San Francisco, CA",CA,6,California,91,618,614.0,-4.0,0.0,0.0,-1.0,0600-0659,16.0,630.0,808.0,5.0,831,813.0,-18.0,0.0,0.0,-2.0,0800-0859,0.0,,0.0,133.0,119.0,98.0,1.0,679.0,3,,,,,,,,,0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
2,2018,1,1,27,6,2018-01-27,UA,19977,UA,N13720,367,11278,1127805,30852,DCA,"Washington, DC",VA,51,Virginia,38,12266,1226603,31453,IAH,"Houston, TX",TX,48,Texas,74,830,828.0,-2.0,0.0,0.0,-1.0,0800-0859,17.0,845.0,1055.0,13.0,1107,1108.0,1.0,1.0,0.0,0.0,1100-1159,0.0,,0.0,217.0,220.0,190.0,1.0,1208.0,5,,,,,,,,,0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,


In [8]:
# Columns to remove from `flights`. Line breaks inside the brackets need no
# backslash continuation.
field_list = [
    'FirstDepTime', 'TotalAddGTime', 'LongestAddGTime', 'DivAirportLandings', 'DivReachedDest',
    'DivActualElapsedTime', 'DivArrDelay', 'DivDistance',
    'Div1Airport', 'Div1AirportID', 'Div1AirportSeqID', 'Div1WheelsOn',
    'Div1TotalGTime', 'Div1LongestGTime', 'Div1WheelsOff', 'Div1TailNum',
    'Div2Airport', 'Div2AirportID', 'Div2AirportSeqID', 'Div2WheelsOn',
    'Div2TotalGTime', 'Div2LongestGTime', 'Div2WheelsOff', 'Div2TailNum',
    'Div3Airport', 'Div3AirportID', 'Div3AirportSeqID', 'Div3WheelsOn',
    'Div3TotalGTime', 'Div3LongestGTime', 'Div3WheelsOff', 'Div3TailNum',
    'Div4Airport', 'Div4AirportID', 'Div4AirportSeqID', 'Div4WheelsOn',
    'Div4TotalGTime', 'Div4LongestGTime', 'Div4WheelsOff', 'Div4TailNum',
    'Div5Airport', 'Div5AirportID', 'Div5AirportSeqID', 'Div5WheelsOn',
    'Div5TotalGTime', 'Div5LongestGTime', 'Div5WheelsOff', 'Div5TailNum',
    'Cancelled', 'CancellationCode', 'Diverted',
]

# Drop all listed columns in one call instead of one call per column, which
# would build a new intermediate DataFrame for every name. A missing column
# still raises KeyError, as it did before.
flights = flights.drop(columns=field_list)

# Also remove any column whose name contains "unnamed" (case-insensitive).
# The result is reassigned rather than mutated in place.
unnamed_columns = flights.columns[flights.columns.str.contains('unnamed', case=False)]
flights = flights.drop(columns=unnamed_columns)

In [9]:
# Show (rows, columns) of the frame after filtering rows and dropping columns.
flights.shape

(551700, 58)

Find the airport with the most arriving flights:

In [10]:
# Count flights per destination airport ID, largest first, and show the top five.
arrivals_per_airport = flights['DestAirportID'].value_counts(ascending=False)
arrivals_per_airport.head(5)

10397    29709
13930    24997
11298    21754
11292    17980
12892    17545
Name: DestAirportID, dtype: int64

In [11]:
# Look up the airport code used by the first flight whose destination ID is 10397.
to_airport_10397 = flights['DestAirportID'] == 10397
flights.loc[to_airport_10397, 'Dest'].head(1)

35    ATL
Name: Dest, dtype: object

The busiest destination airport has airport ID 10397, whose code is ATL. Select all flights arriving at or departing from ATL in Jan 2018:

In [12]:
# Airport ID 10397 is the one whose 'Dest' code was shown to be ATL.
atl_airport_id = 10397

# Row masks for flights leaving from and landing at that airport.
departures = flights['OriginAirportID'] == atl_airport_id
arrivals = flights['DestAirportID'] == atl_airport_id

# Keep every flight that touches the airport at either end.
all_flights = arrivals | departures
flights_ATL = flights[all_flights]

In [13]:
# Show (rows, columns) of the subset of flights arriving at or departing from airport 10397.
flights_ATL.shape

(59401, 58)