#INDEX
##Dataset import and conversion to table
1)[Load Data](#notebook/2590900347992364/command/2590900347992366)  
2)[Export to Table](#notebook/2590900347992364/command/2590900347992369)
##Data Aggregation and preparation
1) [Data Documentation](#notebook/1041232494344796/command/1041232494344797)  
2) [Load and prepare data](#notebook/1041232494344796/command/300252100231806)  
3) [Export data to tables](#notebook/1041232494344796/command/300252100231807)  
##Script to get airports coordinates
[Script](#notebook/1041232494344773/command/300252100231796)
##Data analysis
1) [Load data](#notebook/1041232494344861/command/1041232494344867)  
2) [Plot with the number of flights per airport](#notebook/1041232494344861/command/1041232494344881)  
3) [Plot DataFrame with the highest and lowest numbers of flights per month](#notebook/1041232494344861/command/300252100231802)  
4) [Plot the number of canceled flights, delayed flights and average delay per state](#notebook/1041232494344861/command/1041232494344884)  
5) [Plot the rate of canceled flights and delayed flights per state](#notebook/1041232494344861/command/2740486504779300)  
6) [Network from the routes](#notebook/1041232494344861/command/1041232494344908)  
7) [Map the network of the 350 routes with the most flights between 2017 and 2020](#notebook/1041232494344861/command/1041232494344988)
##App to find the shortest path between two airports
1) [Load Data](#notebook/1646276694320946/command/1646276694320998)  
2) [Shortest path between two airports](#notebook/1646276694320946/command/1646276694320982)

In [0]:
#!pip install networkx

In [0]:
import networkx as nx
import pandas as pd
import plotly.graph_objects as go
import matplotlib.pyplot as plt
%matplotlib inline

##Load Data

In [0]:
# Pull the airport lookup table into pandas and key it by airport code so
# rows can be addressed directly by the IDs used in the route table.
airport_info = spark.table("airports_info").toPandas().set_index('Code')
airport_info.head()

Unnamed: 0_level_0,AIPORT_LOC,AIRPORT_NAME,LAT,LON
Code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
14842,"San Angelo, TX",San Angelo Regional/Mathis Field,31.464836,-100.439844
14843,"San Juan, PR",Luis Munoz Marin International,-25.420676,-49.268743
14869,"Salt Lake City, UT",Salt Lake City International,40.75962,-111.886798
14877,"Salina, KS",Salina Regional,38.84028,-97.611424
14893,"Sacramento, CA",Sacramento International,38.581061,-121.493895


In [0]:
# Route-level table: one row per origin/destination airport pair with the
# route distance and the total number of flights on that route.
network_data = spark.table("network_data").toPandas()
network_data.head(5)

Unnamed: 0,ORIGIN_AIRPORT_ID,DEST_AIRPORT_ID,DISTANCE,TOTAL_FLIGHTS
0,13495,12892,1670.0,8537
1,14747,14828,861.0,849
2,14747,14771,679.0,32449
3,14771,13830,2338.0,7597
4,14747,13487,1399.0,11399


In [0]:
# Nested lookup keyed by airport code: name_dict[code]['AIRPORT_NAME'], etc.
name_dict = airport_info.to_dict(orient='index')

##Shortest path between two airports

In [0]:
# Both dropdowns offer the same sorted list of airport names and start on the
# same default selection.
airport_choices = [str(name) for name in sorted(airport_info['AIRPORT_NAME'])]
default_airport = 'Lehigh Valley International'

dbutils.widgets.dropdown('1.Origin Airport', default_airport, airport_choices)
dbutils.widgets.dropdown('2.Destination Airport', default_airport, airport_choices)

In [0]:
# Build an undirected route graph: nodes are airport names, and every edge
# stores the route distance in its 'length' attribute.
G = nx.Graph()
G.add_nodes_from(
    name_dict[code]['AIRPORT_NAME']
    for code in set(network_data['ORIGIN_AIRPORT_ID'])
)

# Walk the three columns in lockstep rather than indexing each Series by
# position label, which only works while the frame has a default RangeIndex.
for origin_id, dest_id, distance in zip(network_data['ORIGIN_AIRPORT_ID'],
                                        network_data['DEST_AIRPORT_ID'],
                                        network_data['DISTANCE']):
    G.add_edge(name_dict[origin_id]['AIRPORT_NAME'],
               name_dict[dest_id]['AIRPORT_NAME'],
               length=distance)

# Read the user's selection from the notebook widgets and find the route that
# minimises total distance (sum of the 'length' edge attribute).
origin = dbutils.widgets.get('1.Origin Airport')
dest = dbutils.widgets.get('2.Destination Airport')
path = nx.shortest_path(G, origin, dest, weight='length')

# One row per leg of the route. Built in a single constructor call because
# DataFrame.append was removed in pandas 2.0; this also yields the
# ORIGIN/DEST columns even when origin == dest and there are no legs.
path_df = pd.DataFrame(list(zip(path, path[1:])), columns=['ORIGIN', 'DEST'])

# Get node position dict: airport name -> (longitude, latitude).
x, y = airport_info['LON'].values, airport_info['LAT'].values
pos_dict = dict(zip(airport_info['AIRPORT_NAME'], zip(x, y)))

# Attach coordinates to the graph nodes. Airports that appear in airport_info
# but on no route are not nodes of G, so skip them instead of raising KeyError.
for airport_name, coordinate in pos_dict.items():
    if airport_name in G:
        G.nodes[airport_name]['pos'] = coordinate

In [0]:
# Display the computed route, one row per leg (ORIGIN -> DEST).
path_df

Unnamed: 0,ORIGIN,DEST
0,Pellston Regional Airport of Emmet County,Cherry Capital
1,Cherry Capital,Chicago O'Hare International
2,Chicago O'Hare International,Tulsa International
3,Tulsa International,Roswell International Air Center


In [0]:
# NOTE(review): access token is hard-coded in the notebook. Consider loading
# it from a secret scope / environment variable instead of committing it.
mapbox_access_token = 'pk.eyJ1IjoiYW5kcmV2aWFuYS1pc2VnIiwiYSI6ImNrbnJyOWUybTJhbWgycW54bWFieXFxbG8ifQ.CU9H6Ws-MmsszrDWjidJ8A'
fig = go.Figure()

# Marker sizes must have one value per airport, in the same order as the
# lat/lon arrays. network_data['TOTAL_FLIGHTS'] is one value per *route*, so
# aggregate it to flights per origin airport and align it to airport_info's
# index (airports with no outgoing route get 0).
flights_per_airport = (
    network_data.groupby('ORIGIN_AIRPORT_ID')['TOTAL_FLIGHTS']
    .sum()
    .reindex(airport_info.index, fill_value=0)
)

# Add nodes
fig.add_trace(go.Scattermapbox(
    lat=airport_info['LAT'],
    lon=airport_info['LON'],
    mode='markers',
    marker=go.scattermapbox.Marker(
        size=flights_per_airport,
        sizeref=350,
        sizemode="area",
        color = '#FFA15A'
    ),
    hoverinfo='text',
    text=airport_info['AIRPORT_NAME']+'<br>'+'('+airport_info['LON'].astype(str)+', '+airport_info['LAT'].astype(str)+')'+'<br>'
))

# Add edges: one red line segment per leg of the shortest path.
# pos_dict values are (longitude, latitude) tuples.
for leg in path_df.itertuples(index=False):
    origin_lon, origin_lat = pos_dict[leg.ORIGIN]
    dest_lon, dest_lat = pos_dict[leg.DEST]
    fig.add_trace(
        go.Scattermapbox(
            lon=[origin_lon, dest_lon],
            lat=[origin_lat, dest_lat],
            mode='lines',
            line=dict(width=0.5, color='red'),
            hoverinfo='text',
            text=(leg.ORIGIN + ' ---> ' + leg.DEST),
        )
    )

fig.update_layout(
    title='Flights from US',
    hovermode='closest',
    # NOTE(review): `geo` configures geo subplots, while the traces above are
    # Scattermapbox; presumably this has no visible effect - confirm before
    # removing.
    geo=dict(
        projection_type = "equirectangular",
        showland = True,
    ),
    mapbox=dict(
        accesstoken=mapbox_access_token,
        bearing=0,
        # Initial view: roughly centred on the continental US.
        center=dict(
            lat=40,
            lon=-100
        ),
        pitch=0,
        zoom=3
    ),
    showlegend=False,
)

fig.show()