*Downloading and cleaning data on flights and airports*

In [1]:
# imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import networkx as nx
import urllib.request as req

# Download data on flights and airports

### Flights

Data comes from http://stat-computing.org/dataexpo/2009/the-data.html. Flights data relates to the year 2007 as it's the last year with complete information.

In [18]:
# Flights
filename = '2007.csv.bz2'
url = 'http://stat-computing.org/dataexpo/2009/'+str(filename)
%time req.urlretrieve(url,filename)
%time Flights = pd.read_csv(filename, compression='bz2')

Flights = Flights[["Year","Month","DayofMonth","DayOfWeek","DepTime","UniqueCarrier","Origin","Dest","Distance"]]

# Pickle un-cleaned: too big for Github, as >100 MB
# Flights.to_pickle("Flights.pkl")

print('Rows:', len(Flights.index))

Wall time: 5min 52s
Wall time: 52.1 s
Shape: (7453215, 9)


***Unique journeys for price scraping***

In [34]:
scrape_data = Flights.copy()
scrape_data["Journey"] = scrape_data["Origin"]+scrape_data["Dest"]
scrape_data = scrape_data.drop_duplicates(subset = "Journey", keep = "first").reset_index(drop=True)

# pickle it for price scraping
scrape_data.to_pickle("scrape_data.pkl")

scrape_data.tail(2)

Unnamed: 0,Year,Month,DayofMonth,DayOfWeek,DepTime,UniqueCarrier,Origin,Dest,Distance,Journey
5340,2007,12,3,1,1045.0,9E,PIR,JAX,1395,PIRJAX
5341,2007,12,9,7,1519.0,AS,ANC,HNL,2777,ANCHNL


### Airports

Data comes from https://openflights.org/data.html

In [32]:
# Load data
airports_raw = pd.read_csv('https://raw.githubusercontent.com/jpatokal/openflights/master/data/airports.dat', header = None) #load data

# Assign column names
airports_raw.columns = ["Airport_id","Name","City","Country","3DigitId","4DigitId","Lat","Lon","Altitude_ft","Timezone","DST","TZ","Type","Source"] 

# Pick relevant columns for analysis
airports_df = airports_raw[["Airport_id","Name","City","Country","3DigitId","Lat","Lon","Altitude_ft"]]

# Limit to US Airports
airports_US = airports_df[airports_df['Country']=='United States'].reset_index(drop=True)

# Pickle the dataframe for future use
airports_US.to_pickle("Airports.pkl")

print('Rows, all:', len(airports_df.index))
print('Rows, US: ', len(airports_US.index))
print('Missing 3DigitId, US:', airports_US['3DigitId'].value_counts(dropna=False).nlargest(1)[0])
airports_US.tail(2)

Rows, all: 7698
Rows, US:  1512
Missing 3DigitId, US: 261


Unnamed: 0,Airport_id,Name,City,Country,3DigitId,Lat,Lon,Altitude_ft
1510,13759,Oswego County Airport,Fulton,United States,\N,43.3508,-76.3881,475
1511,13803,Mitchell Municipal Airport,Mitchell,United States,MHE,43.774799,-98.038597,1304


In [5]:
# Data on US airports from stat-computing.org
# filename = 'airports.csv'
# url = 'http://stat-computing.org/dataexpo/2009/'+str(filename)
# req.urlretrieve(url,filename)
# Airports = pd.read_csv(filename)

# print('Shape:', Airports.shape)
# Airports.head(2)

# Data cleaning for flights data

### Count number of flights between airports

In [19]:
# Select variables from which to create network. Must contain two columns of nodes, since 
# nx.from_pandas_edgelist() views each row as an edge. Can also contains attributes, e.g. Distance. 
FlightsNx = Flights[["Origin","Dest","Distance"]]

# Take counts (i.e. how many origin to destination flights in the dataset)
counts = pd.DataFrame(Flights.groupby(["Origin","Dest"])["Year"].count())
counts.columns = ["count"]
FlightsNx = FlightsNx.merge(counts, how = 'left', on = ["Origin","Dest"])

# Drop duplicates 
FlightsNx = FlightsNx.drop_duplicates()

# Count number of take-offs from origin: 
orig_takeoff = pd.DataFrame(FlightsNx.groupby(["Origin"])["count"].sum())
orig_takeoff.columns = ["origin_takeoff"]
# Merge to network dataset: 
FlightsNx = FlightsNx.merge(orig_takeoff, how = "left", on = "Origin")

# Count number of take-offs from dest
dest_takeoff = pd.DataFrame(FlightsNx.groupby(["Origin"])["count"].sum())
dest_takeoff.columns = ["dest_takeoff"]
FlightsNx = FlightsNx.merge(dest_takeoff, how = "left", left_on = "Dest", right_on = "Origin")

# Count number of landings at destination
dest_lands = pd.DataFrame(FlightsNx.groupby(["Dest"])["count"].sum())
dest_lands.columns = ["dest_landing"]
FlightsNx = FlightsNx.merge(dest_lands, how = "left", on = "Dest")

orig_lands = pd.DataFrame(FlightsNx.groupby(["Dest"])["count"].sum())
orig_lands.columns = ["orig_landing"]
FlightsNx = FlightsNx.merge(orig_lands, how = "left", left_on = "Origin", right_on = "Dest")

# Construct totals (flights)
FlightsNx["Origin_flights"] = FlightsNx["origin_takeoff"]+FlightsNx["orig_landing"]
FlightsNx["Destination_flights"] = FlightsNx["dest_takeoff"]+FlightsNx["dest_landing"]

# Drop auxiliary vars
FlightsNx.drop(["origin_takeoff","dest_takeoff","dest_landing","orig_landing"], axis = 1, inplace = True)

print('Rows:', len(FlightsNx.index))
FlightsNx.head(2)

5342


Unnamed: 0,Origin,Dest,Distance,count,Origin_flights,Destination_flights
0,SMF,ONT,389,4093,116158,83287.0
1,SMF,PDX,479,2721,116158,117237.0


### Implement as network
For function documentation, see:  
https://networkx.github.io/documentation/stable/reference/generated/networkx.convert_matrix.from_pandas_edgelist.html?highlight=from_pandas_edgelist#networkx.convert_matrix.from_pandas_edgelist

In [7]:
Network = nx.from_pandas_edgelist(FlightsNx, source = "Origin", target = "Dest", edge_attr = True)

### Degree distribution

In [8]:
# docs: https://networkx.github.io/documentation/stable/reference/algorithms/generated/networkx.algorithms.centrality.degree_centrality.html#networkx.algorithms.centrality.degree_centrality 
# Find degree: 
degree_dict = nx.algorithms.centrality.degree_centrality(Network) 
degree_values = list(degree_dict.values())
# Create dataframe
degrees = pd.DataFrame(degree_values)
degrees.columns = ["degree"]
degrees["degree_unnormalized"] = degrees["degree"]*(len(Network.nodes)-1)
degrees["airport"] = degree_dict.keys()

# Save to pickle for descriptive
degrees.to_pickle("Degrees.pkl")

In [9]:
# Add degree to FlightsNx data
FlightsNx = FlightsNx.merge(degrees, how = "left", left_on = "Origin", right_on = "airport")
FlightsNx = FlightsNx.drop("airport", axis = 1)
FlightsNx = FlightsNx.rename(columns = {"degree": "origin_degree", "degree_unnormalized": "origin_degree_unn"})

FlightsNx = FlightsNx.merge(degrees, how = "left", left_on = "Dest", right_on = "airport")
FlightsNx = FlightsNx.drop("airport", axis = 1)
FlightsNx = FlightsNx.rename(columns = {"degree": "dest_degree", "degree_unnormalized": "dest_degree_unn"})

FlightsNx.head(2)

Unnamed: 0,Origin,Dest,Distance,count,Origin_flights,Destination_flights,origin_degree,origin_degree_unn,dest_degree,dest_degree_unn
0,SMF,ONT,389,4093,116158,83287.0,0.142395,44.0,0.12945,40.0
1,SMF,PDX,479,2721,116158,117237.0,0.142395,44.0,0.139159,43.0


In [10]:
# Save cleaned to pickle
FlightsNx.to_pickle("FlightsNx.pkl") # only 0.4 MB