# Basic description of the data sets
Data comes from http://stat-computing.org/dataexpo/2009/the-data.html.
* Variable descriptions are also available at that URL.
* As shown below, there are 7453215 observations of 29 variables.
* All variable names are printed below.
* The dataset has 304 unique departure airports and 310 unique destination airports, which seems relatively few.

In [1]:
# Imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import networkx as nx

## US domestic flights data, cleaned

In [None]:
# Load the flights table and summarise its size and airport coverage.
# NOTE: unpickling can execute arbitrary code; only load a file you trust.
FlightsNx = pd.read_pickle("FlightsNx.pkl")

print('Rows:', len(FlightsNx.index))
print(FlightsNx.columns)

origin_codes = FlightsNx["Origin"]
dest_codes = FlightsNx["Dest"]
print('There are {} unique departure airports in the dataset'.format(origin_codes.nunique()))
print('There are {} unique destination airports in the dataset'.format(dest_codes.nunique()))

# A journey is identified by the origin code concatenated with the destination code.
FlightsNx["Journey"] = origin_codes + dest_codes
print('There are {} unique journeys in the dataset'.format(FlightsNx["Journey"].nunique()))

### Using NetworkX

In [None]:
# Build the airport graph: one node per airport code, one edge per
# Origin/Dest pair; edge_attr=True attaches the remaining columns as
# edge attributes.
Network = nx.from_pandas_edgelist(FlightsNx, source="Origin", target="Dest", edge_attr=True)

# nx.info() was removed in NetworkX 3.0, so print the summary figures
# explicitly using graph methods that exist in every release.
n_nodes = Network.number_of_nodes()
n_edges = Network.number_of_edges()
print("Type: {}".format(type(Network).__name__))
print("Number of nodes: {}".format(n_nodes))
print("Number of edges: {}".format(n_edges))
if n_nodes:
    # Average degree = sum of degrees / number of nodes.
    avg_degree = sum(d for _, d in Network.degree()) / n_nodes
    print("Average degree: {:.4f}".format(avg_degree))

### Sparse

In [None]:
# Calculate L_max, the largest possible number of links between N nodes
# in an undirected graph: N * (N - 1) / 2.
n_nodes = len(Network.nodes)
l_max = n_nodes * (n_nodes - 1) * (1/2)
print("L_max = {}".format(l_max))

# Share of possible links that actually exist. Computed from the graph
# rather than from hard-coded counts, so it stays correct if the data changes.
print(Network.number_of_edges() / l_max if l_max else 0.0)

### Degree distribution

In [None]:
# Load the per-airport degree table. The cells below read its
# "airport", "degree" and "degree_unnormalized" columns.
# NOTE: unpickling can execute arbitrary code; only load a file you trust.
degrees = pd.read_pickle("Degrees.pkl")

In [None]:
# Count the airports at both extremes of the (unnormalized) degree range.
raw_degree = degrees["degree_unnormalized"]
print('No. airports w. degree < 5:  ', int((raw_degree < 5).sum()))
print('No. airports w. degree > 100:', int((raw_degree > 100).sum()))

# Show the ten most connected airports (highest unnormalized degree first).
degrees.sort_values(by="degree_unnormalized", ascending=False).iloc[:10]

In [None]:
# Histogram of the unnormalized degree of every airport.
fig, ax = plt.subplots(figsize=(6, 6))
ax.hist(degrees["degree_unnormalized"], bins=70)
# The y-axis is capped at 80, so any taller bar is clipped at the top.
ax.set_ylim(0, 80)
plt.show()

### Betweenness centrality

In [None]:
# Betweenness centrality of every airport, returned as {airport: value}.
btwn = nx.betweenness_centrality(Network)
betweenness = list(btwn.values())

# Build the table in one step; keys and values of the same dict line up.
btwn_df = pd.DataFrame({"betweenness": betweenness, "airport": list(btwn.keys())})

# sort_values returns a new frame, so the result must be assigned to take effect.
btwn_df = btwn_df.sort_values(by="betweenness", ascending=False)

# Attach betweenness to the degree table; airports missing from either
# side are dropped (default inner join).
Node_char = degrees.merge(btwn_df, on="airport")

Node_char["betweenness"].hist(bins=100)

#### (node) betweenness centrality vs degree

In [None]:
# Scatter plot: one point per airport, degree on x and betweenness on y.
fig, ax = plt.subplots(figsize=(7, 7))
ax.scatter(Node_char["degree"], Node_char["betweenness"])
ax.set_xlabel("Normalized Degree")
ax.set_ylabel("Betweenness")
plt.show()

### Measures for entire network

In [None]:
# Whole-network measures, each computed and printed in turn.
network_measures = (
    ("The average shortest path is: {}", nx.average_shortest_path_length),
    ("The average clustering coefficient is: {}", nx.average_clustering),
    ("The diameter of the network is: {}", nx.diameter),
    ("The global efficiency of the network is: {}", nx.global_efficiency),
)
for template, measure in network_measures:
    print(template.format(measure(Network)))

### Shortest path length: Atlanta example

In [None]:
# Average shortest path length of the full network, kept in a variable
# as the baseline for the Atlanta example.
avg_shrt_path_len = nx.average_shortest_path_length(Network)
print(avg_shrt_path_len)

In [None]:
# Test: Atlanta
# Drop every flight that departs from or arrives at ATL, rebuild the graph
# without edge attributes, and recompute the average shortest path length.
touches_atl = (FlightsNx["Origin"] == "ATL") | (FlightsNx["Dest"] == "ATL")
test = FlightsNx.loc[~touches_atl]
Nettest = nx.from_pandas_edgelist(test, "Origin", "Dest")
ATL_avg = nx.average_shortest_path_length(Nettest)
print(ATL_avg)

## US airports data

In [None]:
# Load the airports table.
# NOTE: unpickling can execute arbitrary code; only load a file you trust.
Airports = pd.read_pickle("Airports.pkl")

# Get the list of network nodes and add Lat/Lon from the Airports dataset.
# The left join keeps every node, even one with no match in Airports.
nodes = pd.DataFrame({"3DigitId": list(Network.nodes())})
positions = nodes.merge(Airports[["3DigitId","Lat","Lon"]], how = "left", on = "3DigitId")

print('No. airports in the openflights data:', len(Airports))
# NOTE(review): this prints the count of the single most frequent 3DigitId
# value (NaN included), on the assumption that this value is the missing
# marker -- confirm. .iloc[0] replaces [0], which relied on the deprecated
# positional fallback of Series.__getitem__ on a label index.
id_counts = Airports['3DigitId'].value_counts(dropna=False)
print('- of which missing 3DigitId:', id_counts.nlargest(1).iloc[0])
print('- of which containing 3DigitId:', Airports['3DigitId'].nunique())
print('No. airports within the network:', positions['3DigitId'].nunique())
positions.head(2)