*Downloading and cleaning data on airports and flights (years 1998, 2008, and 2018)*

In [1]:
# imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import networkx as nx
import urllib.request as req

# Download data on flights and airports

### Airports

Airport data for comes from https://openflights.org/data.html and is being update continously.

In [2]:
# Load data
airports_raw = pd.read_csv('https://raw.githubusercontent.com/jpatokal/openflights/master/data/airports.dat', header = None) #load data

# Assign column names
airports_raw.columns = ["Airport_id","Name","City","Country","3DigitId","4DigitId","Lat","Lon","Altitude_ft","Timezone","DST","TZ","Type","Source"] 

# Pick relevant columns for analysis
airports_df = airports_raw[["Airport_id","Name","City","Country","3DigitId","Lat","Lon","Altitude_ft"]]

# Limit to US Airports
airports_US = airports_df[airports_df['Country']=='United States'].reset_index(drop=True)

# Pickle the dataframe for future use
airports_US.to_pickle("Data/Airports.pkl")
airports_US = pd.read_pickle("Data/Airports.pkl")

print('Rows, all:', len(airports_df.index))
print('Rows, US: ', len(airports_US.index))
print('- of which missing 3DigitId:', airports_US['3DigitId'].value_counts(dropna=False).nlargest(1)[0])
print('- of which containing 3DigitId:', airports_US['3DigitId'].nunique())
airports_US.tail(2)

Rows, all: 7698
Rows, US:  1512
- of which missing 3DigitId: 261
- of which containing 3DigitId: 1252


Unnamed: 0,Airport_id,Name,City,Country,3DigitId,Lat,Lon,Altitude_ft
1510,13759,Oswego County Airport,Fulton,United States,\N,43.3508,-76.3881,475
1511,13803,Mitchell Municipal Airport,Mitchell,United States,MHE,43.774799,-98.038597,1304


### Flights data for 1998, 2008, and 2018

We have collected flights data from Bureau of Transportation Statistics (BTS) for [1998](https://github.com/Morten-Esketveit/TSDS-gruppe-2019/tree/master/Exam/Data/1998) , [2008](https://github.com/Morten-Esketveit/TSDS-gruppe-2019/tree/master/Exam/Data/2008) and [2018](https://github.com/Morten-Esketveit/TSDS-gruppe-2019/tree/master/Exam/Data/2018).

Data for other years can be downloaded manually from https://www.transtats.bts.gov/DL_SelectFields.asp?Table_ID=236

To do so, first tick the following "Field Names":

* "Year"
* "Month"
* "DayofMonth"
* "DayOfWeek"
* "Flight date"
* "Reporting_Airline" (UniqueCarrier)
* "Origin"
* "Dest"
* "DepTime"
* "Actual elapsed time"
* "Distance"

Then for each month of the desired year:

1. Press download
2. Locate (or create) the folder "Data" within the cd of this .ipynb-file
3. Name the download after the number of the month, i.e. from 1-12
4. Save it in the folder

# Data cleaning for flights data

Setting up a function and running it for each of the years 1998, 2008, and 2018.

In [3]:
def clean(year):
    """ Insert year, e.g. 1998
        Reading downloaded zip files
        Preparing data for scraping, descriptive statistics, networks, and machine learning
    """
    s = str(year)
    
    # Read zip files downloaded from BTS
    data = []
    for m in range(1,13):
        df = pd.read_csv("Data/"+s+"/"+str(m)+".zip")
        data.append(df)
    Flights = pd.concat(data, axis=0).drop('Unnamed: 11', axis=1).reset_index(drop=True)
    Flights.columns = ["Year","Month","DayofMonth","DayOfWeek","Date","UniqueCarrier","Origin","Dest","DepTime","ActualElapsedTime","Distance"]

    
    ### Unique journeys for price scraping (only used 2007 flights data) ###
    scrape_data = Flights.copy()
    scrape_data["Journey"] = scrape_data["Origin"]+Flights["Dest"]
    scrape_data = scrape_data.drop_duplicates(subset = "Journey", keep = "first").reset_index(drop=True)

    # Pickle it for price scraping
    scrape_data.to_pickle("Scraping/scrape_data"+s[2:]+".pkl")
    
    
    ### Count number of flights between airports ###
    # Select variables from which to create network. Must contain two columns of nodes, since 
    # nx.from_pandas_edgelist() views each row as an edge. Can also contains attributes, e.g. Distance. 
    FlightsNx = Flights[["Origin","Dest","Distance"]]

    # Take counts (i.e. how many origin to destination flights in the dataset)
    counts = pd.DataFrame(Flights.groupby(["Origin","Dest"])["Year"].count())
    counts.columns = ["count"]
    FlightsNx = FlightsNx.merge(counts, how = 'left', on = ["Origin","Dest"])

    # Drop duplicates 
    FlightsNx = FlightsNx.drop_duplicates()

    # Create average time for each route
    TimeMeans = pd.DataFrame(Flights.groupby(["Origin","Dest"])["ActualElapsedTime"].mean())
    # Merge to FlightsNx
    FlightsNx = FlightsNx.merge(TimeMeans, how = "left", on = ["Origin", "Dest"])

    # Create number of distinct companies on each route 
    companies = pd.DataFrame(Flights.groupby(["Origin","Dest"])["UniqueCarrier"].nunique())
    companies.columns = ["companies"]
    FlightsNx = FlightsNx.merge(companies, how = "left", on = ["Origin", "Dest"])

    # Count number of take-offs from origin: 
    orig_takeoff = pd.DataFrame(FlightsNx.groupby(["Origin"])["count"].sum())
    orig_takeoff.columns = ["origin_takeoff"]
    # Merge to network dataset: 
    FlightsNx = FlightsNx.merge(orig_takeoff, how = "left", on = "Origin")

    # Count number of take-offs from dest
    dest_takeoff = pd.DataFrame(FlightsNx.groupby(["Origin"])["count"].sum())
    dest_takeoff.columns = ["dest_takeoff"]
    FlightsNx = FlightsNx.merge(dest_takeoff, how = "left", left_on = "Dest", right_on = "Origin")

    # Count number of landings at destination
    dest_lands = pd.DataFrame(FlightsNx.groupby(["Dest"])["count"].sum())
    dest_lands.columns = ["dest_landing"]
    FlightsNx = FlightsNx.merge(dest_lands, how = "left", on = "Dest")

    orig_lands = pd.DataFrame(FlightsNx.groupby(["Dest"])["count"].sum())
    orig_lands.columns = ["orig_landing"]
    FlightsNx = FlightsNx.merge(orig_lands, how = "left", left_on = "Origin", right_on = "Dest")

    # Construct totals (flights)
    FlightsNx["Origin_flights"] = FlightsNx["origin_takeoff"]+FlightsNx["orig_landing"]
    FlightsNx["Destination_flights"] = FlightsNx["dest_takeoff"]+FlightsNx["dest_landing"]

    # Drop auxiliary vars
    FlightsNx.drop(["origin_takeoff","dest_takeoff","dest_landing","orig_landing"], axis = 1, inplace = True)


    ### Implement as network ###
    Network = nx.from_pandas_edgelist(FlightsNx, source = "Origin", target = "Dest", edge_attr = True)


    ### Degree distribution ###
    # docs: https://networkx.github.io/documentation/stable/reference/algorithms/generated/networkx.algorithms.centrality.degree_centrality.html#networkx.algorithms.centrality.degree_centrality 
    # Find degree: 
    degree_dict = nx.algorithms.centrality.degree_centrality(Network) 
    degree_values = list(degree_dict.values())
    # Create dataframe
    degrees = pd.DataFrame(degree_values)
    degrees.columns = ["degree"]
    degrees["degree_unnormalized"] = degrees["degree"]*(len(Network.nodes)-1)
    degrees["airport"] = degree_dict.keys()
    # Save for descriptive
    degrees.to_pickle("Data/Degrees"+s[2:]+".pkl")
    
    # Add degree to FlightsNx data
    FlightsNx = FlightsNx.merge(degrees, how = "left", left_on = "Origin", right_on = "airport").drop("airport", axis = 1)
    FlightsNx = FlightsNx.rename(columns = {"degree": "origin_degree", "degree_unnormalized": "origin_degree_unn"})

    FlightsNx = FlightsNx.merge(degrees, how = "left", left_on = "Dest", right_on = "airport").drop("airport", axis = 1)
    FlightsNx = FlightsNx.rename(columns = {"degree": "dest_degree", "degree_unnormalized": "dest_degree_unn"})


    ### Betweenness centrality ###
    btwn_dict = nx.betweenness_centrality(Network)
    btwns = pd.DataFrame(list(btwn_dict.values()))
    btwns.columns = ["betweenness"]
    btwns["airport"] = btwn_dict.keys()

    FlightsNx = FlightsNx.merge(btwns, how = "left", left_on = "Origin", right_on = "airport")
    FlightsNx = FlightsNx.drop("airport", axis = 1)
    FlightsNx = FlightsNx.rename(columns = ({"betweenness":"origin_btwns"}))

    FlightsNx = FlightsNx.merge(btwns, how = "left", left_on = "Dest", right_on = "airport")
    FlightsNx = FlightsNx.drop("airport", axis = 1)
    FlightsNx = FlightsNx.rename(columns = ({"betweenness":"dest_btwns"}))

    FlightsNx.drop(["origin_degree","dest_degree"], axis = 1, inplace = True)

    FlightsNx = FlightsNx.rename(columns = ({"ActualElapsedTime":"avg_time_mins",
                                 "origin_degree_unn":"origin_degree",
                                 "dest_degree_unn":"dest_degree"}))


    ### Clustering Coefficients ###
    clusters = nx.algorithms.cluster.clustering(Network)
    clusters_df = pd.DataFrame(list(clusters.values()))
    clusters_df.columns = ["clustering"]
    clusters_df["airport"] = clusters.keys()

    FlightsNx = FlightsNx.merge(clusters_df, how = "left", left_on = "Origin", right_on = "airport")
    FlightsNx = FlightsNx.drop("airport", axis = 1)
    FlightsNx = FlightsNx.rename(columns = ({"clustering":"origin_clustcoef"}))

    FlightsNx = FlightsNx.merge(clusters_df, how = "left", left_on = "Dest", right_on = "airport")
    FlightsNx = FlightsNx.drop("airport", axis = 1)
    FlightsNx = FlightsNx.rename(columns = ({"clustering":"dest_clustcoef"}))


    # Save cleaned to pickle and try to read it
    FlightsNx.to_pickle("Data/FlightsNx"+s[2:]+".pkl")

In [4]:
clean(1998)
FlightsNx98 = pd.read_pickle("Data/FlightsNx98.pkl")
FlightsNx98.tail(2)

Unnamed: 0,Origin,Dest,Distance,count,avg_time_mins,companies,Origin_flights,Destination_flights,origin_degree,dest_degree,origin_btwns,dest_btwns,origin_clustcoef,dest_clustcoef
3155,EGE,SFO,847.0,9,154.125,1,2559.0,279737.0,12.0,51.0,3e-05,0.049559,0.969697,0.465098
3156,CLE,SRQ,967.0,17,155.0,1,122145.0,11971.0,42.0,10.0,0.003055,5e-06,0.706156,0.977778


In [5]:
clean(2008)
FlightsNx08 = pd.read_pickle("Data/FlightsNx08.pkl")
FlightsNx08.tail(2)

Unnamed: 0,Origin,Dest,Distance,count,avg_time_mins,companies,Origin_flights,Destination_flights,origin_degree,dest_degree,origin_btwns,dest_btwns,origin_clustcoef,dest_clustcoef
5245,LGB,SFO,354.0,84,77.357143,1,30764.0,283855.0,20.0,74.0,0.000449,0.046775,0.652632,0.327286
5246,SFO,LGB,354.0,81,83.592593,1,283855.0,30764.0,74.0,20.0,0.046775,0.000449,0.327286,0.652632


In [6]:
clean(2018)
FlightsNx18 = pd.read_pickle("Data/FlightsNx18.pkl")
FlightsNx18.tail(2)

Unnamed: 0,Origin,Dest,Distance,count,avg_time_mins,companies,Origin_flights,Destination_flights,origin_degree,dest_degree,origin_btwns,dest_btwns,origin_clustcoef,dest_clustcoef
6223,LGA,SBN,651.0,3,195.0,1,339632,13894,73.0,13.0,0.006823,0.000534,0.350457,0.461538
6224,ORD,ABE,654.0,225,118.458333,2,737786,8295,172.0,10.0,0.141068,0.000229,0.138923,0.511111
