This notebook is used to preprocess data collected from Kaggle so that they can be used
to visualize.

In [26]:
import pandas as pd
# import data from csv
# 469968 rows
flights = pd.read_csv("rawdata/flights_JAN.csv")
airports = pd.read_csv("rawdata/airports.csv")

# take out those records for AA, UA, DL
# result: 146875 rows
mask = flights['AIRLINE'].isin(['AA', 'UA', 'DL'])
flights = flights[mask]
flights = flights.reset_index(drop=True)

In [27]:
# combine month and day into a single column
flights["DATE"] = pd.to_datetime(flights[['YEAR', 'MONTH', 'DAY']])
# convert CANCELLED column into a boolean column
flights["CANCELLED"] = flights["CANCELLED"].astype(bool)
len(flights[flights["CANCELLED"] == True]) # 2545 are cancelled

2545

In [28]:
flights

Unnamed: 0,YEAR,MONTH,DAY,DAY_OF_WEEK,AIRLINE,FLIGHT_NUMBER,TAIL_NUMBER,ORIGIN_AIRPORT,DESTINATION_AIRPORT,SCHEDULED_DEPARTURE,...,ARRIVAL_DELAY,DIVERTED,CANCELLED,CANCELLATION_REASON,AIR_SYSTEM_DELAY,SECURITY_DELAY,AIRLINE_DELAY,LATE_AIRCRAFT_DELAY,WEATHER_DELAY,DATE
0,2015,1,1,4,AA,2336,N3KUAA,LAX,PBI,10,...,-9.0,0,False,,,,,,,2015-01-01
1,2015,1,1,4,AA,258,N3HYAA,LAX,MIA,20,...,-9.0,0,False,,,,,,,2015-01-01
2,2015,1,1,4,DL,806,N3730B,SFO,MSP,25,...,8.0,0,False,,,,,,,2015-01-01
3,2015,1,1,4,AA,1112,N3LAAA,SFO,DFW,30,...,-13.0,0,False,,,,,,,2015-01-01
4,2015,1,1,4,DL,1173,N826DN,LAS,ATL,30,...,-15.0,0,False,,,,,,,2015-01-01
5,2015,1,1,4,DL,2336,N958DN,DEN,ATL,30,...,-30.0,0,False,,,,,,,2015-01-01
6,2015,1,1,4,AA,1674,N853AA,LAS,MIA,35,...,-10.0,0,False,,,,,,,2015-01-01
7,2015,1,1,4,DL,1434,N547US,LAX,MSP,35,...,-4.0,0,False,,,,,,,2015-01-01
8,2015,1,1,4,DL,2324,N3751B,SLC,ATL,40,...,-22.0,0,False,,,,,,,2015-01-01
9,2015,1,1,4,DL,2440,N651DL,SEA,MSP,40,...,8.0,0,False,,,,,,,2015-01-01


In [29]:
# take airport location information from airports.csv
orig_data = airports[["IATA_CODE", "LATITUDE", "LONGITUDE"]]
orig_data.columns = ["ORIGIN_AIRPORT", "ORIGIN_LATITUDE", "ORIGIN_LONGITUDE"]
merged = pd.merge(flights, orig_data, left_on='ORIGIN_AIRPORT', right_on='ORIGIN_AIRPORT', how='left')
dest_data = airports[["IATA_CODE", "LATITUDE", "LONGITUDE"]]
dest_data.columns = ["DESTINATION_AIRPORT", "DESTINATION_LATITUDE", "DESTINATION_LONGITUDE"]
merged = pd.merge(merged, dest_data, left_on='DESTINATION_AIRPORT', right_on='DESTINATION_AIRPORT', how='left')
merged.columns

Index(['YEAR', 'MONTH', 'DAY', 'DAY_OF_WEEK', 'AIRLINE', 'FLIGHT_NUMBER',
       'TAIL_NUMBER', 'ORIGIN_AIRPORT', 'DESTINATION_AIRPORT',
       'SCHEDULED_DEPARTURE', 'DEPARTURE_TIME', 'DEPARTURE_DELAY', 'TAXI_OUT',
       'WHEELS_OFF', 'SCHEDULED_TIME', 'ELAPSED_TIME', 'AIR_TIME', 'DISTANCE',
       'WHEELS_ON', 'TAXI_IN', 'SCHEDULED_ARRIVAL', 'ARRIVAL_TIME',
       'ARRIVAL_DELAY', 'DIVERTED', 'CANCELLED', 'CANCELLATION_REASON',
       'AIR_SYSTEM_DELAY', 'SECURITY_DELAY', 'AIRLINE_DELAY',
       'LATE_AIRCRAFT_DELAY', 'WEATHER_DELAY', 'DATE', 'ORIGIN_LATITUDE',
       'ORIGIN_LONGITUDE', 'DESTINATION_LATITUDE', 'DESTINATION_LONGITUDE'],
      dtype='object')

In [35]:
final_data = merged[["AIRLINE", 'FLIGHT_NUMBER', 'DATE', 'ORIGIN_AIRPORT', 'ORIGIN_LATITUDE',
                    'ORIGIN_LONGITUDE', 'DESTINATION_AIRPORT', 'DESTINATION_LATITUDE', 'DESTINATION_LONGITUDE',
                    'SCHEDULED_DEPARTURE', 'DEPARTURE_TIME', 'DEPARTURE_DELAY', 'DISTANCE',
                    'SCHEDULED_ARRIVAL', 'ARRIVAL_TIME', 'ARRIVAL_DELAY', 'CANCELLED']]
mask = final_data['ORIGIN_LATITUDE'].notna()
final_data = final_data[mask]
final_data = final_data.reset_index(drop=True)
final_data.to_csv("cleaned.csv")

In [91]:
mask = final_data['ARRIVAL_DELAY'] >= 30
delayed = final_data[mask]
delayed = delayed[["FLIGHT_NUMBER", 'ORIGIN_LATITUDE','ORIGIN_LONGITUDE', 'ORIGIN_AIRPORT']]
delayed.groupby(['ORIGIN_AIRPORT']).agg({'FLIGHT_NUMBER': 'count', "ORIGIN_LATITUDE": 'first', "ORIGIN_LONGITUDE": 'first'})

Unnamed: 0_level_0,FLIGHT_NUMBER,ORIGIN_LATITUDE,ORIGIN_LONGITUDE
ORIGIN_AIRPORT,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
ABE,4,40.65236,-75.44040
ABQ,18,35.04022,-106.60919
ALB,15,42.74812,-73.80298
ANC,12,61.17432,-149.99619
ATL,1061,33.64044,-84.42694
ATW,2,44.25741,-88.51948
AUS,101,30.19453,-97.66987
AVL,4,35.43619,-82.54181
BDL,47,41.93887,-72.68323
BHM,26,33.56294,-86.75355


In [60]:
import numpy as np

mask = final_data['ARRIVAL_DELAY'] >= 30
delayed = final_data[mask]
delayed = delayed[["FLIGHT_NUMBER", 'ORIGIN_LATITUDE','ORIGIN_LONGITUDE']]
delayed = delayed.groupby(['ORIGIN_LATITUDE','ORIGIN_LONGITUDE']).count()
all = final_data[["FLIGHT_NUMBER", 'ORIGIN_LATITUDE','ORIGIN_LONGITUDE']]
all = all.groupby(['ORIGIN_LATITUDE','ORIGIN_LONGITUDE']).count()
new = pd.merge(all, delayed, how="outer", left_index=True, right_index=True)
new = new.fillna(0)
new["ratio"] = new["FLIGHT_NUMBER_y"] / new["FLIGHT_NUMBER_x"]

Unnamed: 0_level_0,Unnamed: 1_level_0,FLIGHT_NUMBER_x,FLIGHT_NUMBER_y,ratio
ORIGIN_LATITUDE,ORIGIN_LONGITUDE,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
13.48345,-144.79598,31,4.0,0.129032
17.70189,-64.79856,55,2.0,0.036364
18.33731,-64.97336,279,40.0,0.143369
18.43942,-66.00183,843,105.0,0.124555
18.49486,-67.12944,24,4.0,0.166667
19.72026,-155.04847,13,4.0,0.307692
19.73877,-156.04563,179,14.0,0.078212
20.89865,-156.43046,364,41.0,0.112637
21.31869,-157.92241,839,69.0,0.082241
21.97598,-159.33896,151,12.0,0.079470


In [61]:
new = pd.merge(new, airports["LATITUDE", "LONGITUDE", ], left_index=True, right_on=["LATITUDE", "LONGITUDE"])
new

Unnamed: 0,FLIGHT_NUMBER_x,FLIGHT_NUMBER_y,ratio,IATA_CODE,AIRPORT,CITY,STATE,COUNTRY,LATITUDE,LONGITUDE
137,31,4.0,0.129032,GUM,Guam International Airport,Agana,GU,USA,13.48345,-144.79598
296,55,2.0,0.036364,STX,Henry E. Rohlsen Airport,Christiansted,VI,USA,17.70189,-64.79856
295,279,40.0,0.143369,STT,Cyril E. King Airport,Charlotte Amalie,VI,USA,18.33731,-64.97336
285,843,105.0,0.124555,SJU,Luis Muñoz Marín International Airport,San Juan,PR,USA,18.43942,-66.00183
42,24,4.0,0.166667,BQN,Rafael Hernández Airport,Aguadilla,PR,USA,18.49486,-67.12944
162,13,4.0,0.307692,ITO,Hilo International Airport,Hilo,HI,USA,19.72026,-155.04847
170,179,14.0,0.078212,KOA,Kona International Airport at Keahole,Kailua/Kona,HI,USA,19.73877,-156.04563
223,364,41.0,0.112637,OGG,Kahului Airport,Kahului,HI,USA,20.89865,-156.43046
141,839,69.0,0.082241,HNL,Honolulu International Airport,Honolulu,HI,USA,21.31869,-157.92241
184,151,12.0,0.079470,LIH,Lihue Airport,Lihue,HI,USA,21.97598,-159.33896


In [85]:
mask = final_data["CANCELLED"] == True
cancelled = final_data[mask][["AIRLINE", "FLIGHT_NUMBER"]].reset_index(drop=True).groupby("AIRLINE").count()
cancelled.columns = ["cancelled_cnt"]
cancelled

mask = final_data['ARRIVAL_DELAY'] >= 30
delayed = final_data[mask][["AIRLINE", "FLIGHT_NUMBER"]].reset_index(drop=True).groupby("AIRLINE").count()
delayed.columns = ["delayed_cnt"]
delayed

total = final_data[["AIRLINE", "FLIGHT_NUMBER"]].groupby("AIRLINE").count()
total.columns = ["normal"]

airline_comp = pd.merge(cancelled, delayed, left_index=True, right_index=True)
airline_comp = pd.merge(airline_comp, total, left_index=True, right_index=True)
airline_comp["normal"] = airline_comp["normal"] - airline_comp["delayed_cnt"] - airline_comp["cancelled_cnt"]
airline_comp

Unnamed: 0_level_0,cancelled_cnt,delayed_cnt,normal
AIRLINE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
AA,900,5689,37470
DL,677,4564,59037
UA,967,5183,32245


In [83]:
sample = final_data[["CANCELLED", "AIRLINE", "FLIGHT_NUMBER", 'ARRIVAL_DELAY']]

sample["status"] = np.where(sample["CANCELLED"] == True, "cancelled", np.where(sample["ARRIVAL_DELAY"]>=30, "delayed", "normal"))

import plotly.express as px
fig = px.pie(sample[sample["AIRLINE"] == "AA"], names='status')
fig.show()



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy



AttributeError: module 'plotly.express' has no attribute 'pie'