In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from shapely.geometry import Point
import geopandas as gpd
import copy
import networkx as nx

### Dataset for prediction
Each observation/row is a connection (edge in the network). 
We want the following **route-specific** information: 
* Number of flights ("count") **done**
* Distance (average distance?) **done**
* Time (average elapsed time?) **done**


We want the following **airport-specific** information: 
* Degree **done** 
* Clustering coefficient **done**
* Betweenness centrality **done** 
* State? (administrative border?) **excluded**


## Load data 
Data comes from http://stat-computing.org/dataexpo/2009/. 
Flight data relates to the year 2007 (see file StatComp_MER) 

In [2]:
# Load the flight-level data (one row per flight).
# NOTE: unpickling can execute arbitrary code -- only load pickle files you trust.
Flights = pd.read_pickle("Flights98.pkl")
#Airports = pd.read_pickle("airports.pkl")

# Collapse the flights to ONE row per route (Origin, Dest). nx.from_pandas_edgelist()
# views each row as an edge, so the frame must hold the two node columns; any further
# column (e.g. Distance) can be carried along as an edge attribute.
#
# Aggregating directly (instead of merging the counts back onto every flight and then
# calling drop_duplicates()) guarantees a single row per route even when the recorded
# Distance differs or is missing on some flights of the same route.
#   sort=False     -> routes stay in order of first appearance in Flights
#   observed=True  -> no empty groups if Origin/Dest are stored as categoricals
route_groups = Flights.groupby(["Origin", "Dest"], sort=False, observed=True)

# Number of flights per route (non-null "Year" entries).
counts = route_groups["Year"].count().to_frame(name="count")

# Mean distance per route.
route_distance = route_groups["Distance"].mean().to_frame()

FlightsNx = route_distance.join(counts).reset_index()

In [3]:
# Mean of ActualElapsedTime over all flights of each (Origin, Dest) route,
# joined onto the route table as a new column.
TimeMeans = (
    Flights
    .groupby(["Origin", "Dest"])["ActualElapsedTime"]
    .mean()
    .to_frame()
)
FlightsNx = FlightsNx.merge(TimeMeans, on=["Origin", "Dest"], how="left")

In [4]:
# Number of distinct UniqueCarrier values seen on each (Origin, Dest) route,
# joined onto the route table as column "companies".
companies = (
    Flights
    .groupby(["Origin", "Dest"])["UniqueCarrier"]
    .nunique()
    .to_frame(name="companies")
)
FlightsNx = FlightsNx.merge(companies, on=["Origin", "Dest"], how="left")

In [5]:
# Airport-level traffic totals (take-offs + landings) for both endpoints of each route.

# Take-offs per airport: flights summed over all routes that start at the airport.
airport_takeoffs = FlightsNx.groupby("Origin")["count"].sum()
# Landings per airport: flights summed over all routes that end at the airport.
airport_landings = FlightsNx.groupby("Dest")["count"].sum()

# An airport that appears only as an origin (or only as a destination) is missing from
# one of the two series. fill_value=0 counts the missing side as zero flights, so the
# total is the one-sided count instead of NaN.
airport_flights = airport_takeoffs.add(airport_landings, fill_value=0)

# Look the total up once for the route's origin and once for its destination.
FlightsNx = FlightsNx.assign(
    Origin_flights=FlightsNx["Origin"].map(airport_flights),
    Destination_flights=FlightsNx["Dest"].map(airport_flights),
)

### Degree distribution

In [6]:
# Build the route graph: one edge per row of FlightsNx, every remaining column kept as an
# edge attribute. No create_using is passed, so networkx builds its default graph type.
Network = nx.from_pandas_edgelist(
    FlightsNx,
    source="Origin",
    target="Dest",
    edge_attr=True,
)

# Degree centrality per airport.
# docs: https://networkx.github.io/documentation/stable/reference/algorithms/generated/networkx.algorithms.centrality.degree_centrality.html#networkx.algorithms.centrality.degree_centrality
degree_dict = nx.degree_centrality(Network)
degree_values = list(degree_dict.values())

# One row per airport: the centrality value, the same value multiplied by (number of
# nodes - 1), and the airport code.
degrees = pd.DataFrame({"degree": degree_values})
degrees["degree_unnormalized"] = degrees["degree"] * (Network.number_of_nodes() - 1)
degrees["airport"] = list(degree_dict.keys())

In [7]:
# Attach the airport degree columns to each route twice: first for the origin airport,
# then for the destination airport. The helper "airport" key column is removed after
# each join and the degree columns are renamed with the endpoint prefix.
degree_joins = (
    ("Origin", {"degree": "origin_degree", "degree_unnormalized": "origin_degree_unn"}),
    ("Dest", {"degree": "dest_degree", "degree_unnormalized": "dest_degree_unn"}),
)
for endpoint_col, new_names in degree_joins:
    FlightsNx = (
        FlightsNx
        .merge(degrees, how="left", left_on=endpoint_col, right_on="airport")
        .drop(columns="airport")
        .rename(columns=new_names)
    )


#### Betweenness centrality

In [8]:
# Betweenness centrality of every airport in the route graph, as a two-column table.
btwn_dict = nx.betweenness_centrality(Network)
btwns = pd.DataFrame(
    {
        "betweenness": list(btwn_dict.values()),
        "airport": list(btwn_dict.keys()),
    }
)

# Attach the score to each route: once for the origin airport, once for the destination.
FlightsNx = (
    FlightsNx
    .merge(btwns, how="left", left_on="Origin", right_on="airport")
    .drop(columns="airport")
    .rename(columns={"betweenness": "origin_btwns"})
    .merge(btwns, how="left", left_on="Dest", right_on="airport")
    .drop(columns="airport")
    .rename(columns={"betweenness": "dest_btwns"})
)

In [9]:
#FlightsNx.drop(["origin_degree","dest_degree"], axis = 1, inplace = True)

In [10]:
# Give the per-route mean of "ActualElapsedTime" a clearer column name.
# Disabled alternative (kept for reference): additionally rename
#   "origin_degree_unn" -> "origin_degree" and "dest_degree_unn" -> "dest_degree".
time_column_names = {"ActualElapsedTime": "avg_time_mins"}
FlightsNx = FlightsNx.rename(columns=time_column_names)

#### Clustering Coefficients 

In [11]:
# Clustering coefficient of every airport in the route graph, as a two-column table.
clusters = nx.clustering(Network)
clusters_df = pd.DataFrame(
    {
        "clustering": list(clusters.values()),
        "airport": list(clusters.keys()),
    }
)

# Attach the coefficient to each route for the origin airport, then for the destination.
clustering_joins = (
    ("Origin", {"clustering": "origin_clustcoef"}),
    ("Dest", {"clustering": "dest_clustcoef"}),
)
for endpoint_col, new_names in clustering_joins:
    FlightsNx = (
        FlightsNx
        .merge(clusters_df, how="left", left_on=endpoint_col, right_on="airport")
        .drop(columns="airport")
        .rename(columns=new_names)
    )



In [12]:
# Write the finished route table to disk under both file names, then preview it.
for output_path in ("FlightsML98.pkl", "FlightsNx98.pkl"):
    FlightsNx.to_pickle(output_path)
FlightsNx.head()

Unnamed: 0,Origin,Dest,Distance,count,avg_time_mins,companies,Origin_flights,Destination_flights,origin_degree,origin_degree_unn,dest_degree,dest_degree_unn,origin_btwns,dest_btwns,origin_clustcoef,dest_clustcoef
0,TYS,ORD,475.0,1045,90.543564,1,16494.0,589806.0,0.028846,6.0,0.456731,95.0,0.0,0.07908,1.0,0.26495
1,ATL,ORF,516.0,2180,93.10663,1,510667.0,32654.0,0.475962,99.0,0.081731,17.0,0.118966,7e-05,0.238714,0.919118
2,BOS,ORF,468.0,308,93.453608,1,196837.0,32654.0,0.221154,46.0,0.081731,17.0,0.019007,7e-05,0.611594,0.919118
3,BWI,ORF,159.0,3,42.333333,1,139975.0,32654.0,0.221154,46.0,0.081731,17.0,0.003668,7e-05,0.597101,0.919118
4,CLT,ORF,290.0,2142,62.160936,1,249947.0,32654.0,0.326923,68.0,0.081731,17.0,0.073496,7e-05,0.332309,0.919118


In [13]:
# Airports ranked by origin_degree (highest first), one row per distinct
# (Origin, origin_degree) pair.
# NOTE(review): as this notebook is written, origin_degree holds the normalized degree
# centrality, yet the saved output shows values such as 99.0 -- presumably it was
# produced by an earlier variant of the rename cell; re-run to confirm.
origin_degree_pairs = FlightsNx[["Origin", "origin_degree"]]
origin_degree_pairs.sort_values(by="origin_degree", ascending=False).drop_duplicates()

Unnamed: 0,Origin,origin_degree
1207,ATL,99.0
562,ORD,95.0
1107,MSP,88.0
2036,DFW,87.0
685,DTW,81.0
2125,STL,77.0
2169,PIT,71.0
1671,IAH,71.0
43,CVG,70.0
1159,CLT,68.0


In [14]:
## Homophily? 
## "Do these features contribute, i.e. raise predictive performance?"
## Number of different companies?