# Explore Methods

In [1]:
import pandas as pd
import numpy as np

import timeit

## Load Data

In [2]:
df = pd.read_csv("flights_sample.csv", index_col = 0)
print(df.shape)
df.head(10)

(1000, 2)


Unnamed: 0_level_0,origin,destination
day,Unnamed: 1_level_1,Unnamed: 2_level_1
2019-01-16 00:00:00+00:00,KEWR,KDFW
2019-01-01 00:00:00+00:00,KONT,KPHX
2019-01-09 00:00:00+00:00,LSZH,LTBA
2019-01-15 00:00:00+00:00,KSFO,KSAN
2019-01-01 00:00:00+00:00,PANC,MS65
2019-01-01 00:00:00+00:00,EBAW,EBMB
2019-01-09 00:00:00+00:00,KSLC,KOAK
2019-01-21 00:00:00+00:00,KDEN,KAUS
2019-01-22 00:00:00+00:00,KLAX,KRNO
2019-01-28 00:00:00+00:00,LTBA,DAAG


## Create Adjacency Matrix

In [3]:
airports = np.unique(np.append(df["origin"], df["destination"]))
print(airports.shape)
airports

(664,)


array(['01FA', '02XS', '06FA', '06TE', '0GA1', '0GA2', '0NY0', '0WI5',
       '13AZ', '14FA', '16FA', '16TX', '1AZ2', '1CO4', '1LA1', '1OR3',
       '1WI6', '1XS1', '20GA', '20II', '21XS', '27CA', '2AZ7', '2KS9',
       '2TS2', '32II', '35MN', '38MO', '3GE9', '3IN7', '3MD4', '3PS9',
       '44PA', '4FL8', '4NY4', '4TS8', '54MI', '57FA', '59TX', '5TA4',
       '60CO', '60IN', '60MI', '64NC', '6NJ9', '71MI', '72FL', '73WA',
       '77TS', '78MI', '79NY', '7CO0', '7KS6', '7MI6', '7PS7', '80TX',
       '82FL', '82PA', '86MI', '8CA8', '8MI4', '8OR6', '8TX7', '96WA',
       '97PN', '97TS', '98NY', '9IN3', 'AK67', 'AL18', 'AL72', 'AZ28',
       'BIKF', 'BKPR', 'CA35', 'CA39', 'CA66', 'CL36', 'CMN3', 'CNC4',
       'CO12', 'CO17', 'CO80', 'CSS3', 'CYHU', 'CYKZ', 'CYOW', 'CYTN',
       'CYUL', 'CYVR', 'CYXX', 'CYYG', 'CYYZ', 'DAAG', 'EBAW', 'EBBR',
       'EBCI', 'EBLG', 'EBMB', 'EDBA', 'EDDB', 'EDDE', 'EDDF', 'EDDH',
       'EDDK', 'EDDL', 'EDDM', 'EDDN', 'EDDP', 'EDDS', 'EDDT', 'EDDV',
      

In [4]:
# Create nxn-matrix of zeros
A = np.zeros((len(airports), len(airports)), dtype = int)
print(A.shape)
A

(664, 664)


array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [5]:
# Map airports in dataframe to their graph number
mapping_dict = {k:i for i,k in enumerate(airports)}

df_mapped = df.applymap(lambda x: mapping_dict[x])
print(df_mapped.shape)
df_mapped.head(10)

(1000, 2)


Unnamed: 0_level_0,origin,destination
day,Unnamed: 1_level_1,Unnamed: 2_level_1
2019-01-16 00:00:00+00:00,284,272
2019-01-01 00:00:00+00:00,372,385
2019-01-09 00:00:00+00:00,522,525
2019-01-15 00:00:00+00:00,415,408
2019-01-01 00:00:00+00:00,572,537
2019-01-01 00:00:00+00:00,94,98
2019-01-09 00:00:00+00:00,419,368
2019-01-21 00:00:00+00:00,271,236
2019-01-22 00:00:00+00:00,331,403
2019-01-28 00:00:00+00:00,525,93


In [6]:
# Fill adjacency matrix
for date, flight in df_mapped.iterrows():
    i, j = flight["origin"], flight["destination"]
    if A[i,j] == 0:
        A[i,j], A[j,i] = 1,1

A

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 1]])

## Analysis 1: Can you fly from A to B directly?

### Conventional Pandas Method

Can you fly from a to b directly? <br>
How many destinations can you reach from a directly?

In [70]:
def exists_direct_path(df, node1, node2):
    
    # Loop through each row
    for i, row in df.iterrows():
        
        # Check whether (node1, node2) or (node2, node1) is in the dataset
        if {row["origin"], row["destination"]} == {node1,node2}:
            return True
        
    return False

In [97]:
node1, node2 = "KEWR", "KDFW"
exists_direct_path(df, node1, node2)

True

In [108]:
test_size = 100

np.random.seed(10)
test_airports = np.random.choice(airports,test_size)

start = timeit.default_timer()

for i in range(0,test_size,2):
    exists_direct_path(df, test_airports[i], test_airports[i+1])
    
stop = timeit.default_timer()
print("Time:", round(stop-start,2), "sec")

Time: 3.24 sec


### Graph Method

In [98]:
node1, node2 = mapping_dict["KEWR"], mapping_dict["KDFW"]
A[node1, node2] == 1

True

In [107]:
test_size = 100

np.random.seed(10)
test_airports = np.random.choice(airports,test_size)

start = timeit.default_timer()

for i in range(0,test_size,2):
    node1, node2 = mapping_dict[test_airports[i]], mapping_dict[test_airports[i+1]]
    A[node1, node2] == 1
    
stop = timeit.default_timer()
print("Time:", round(stop-start,2), "sec")

Time: 0.0 sec


## Analysis 2: How many destinations is A directly linked to?

### Conventional Pandas Method

In [None]:
def degree(df, node):
    
    # Setup empty list
    flights = []
    
    # Loop through every row
    for i, row in df.iterrows():
        
        # If the node is either an origin or a destination, there must be a direct path
        if row["origin"] == node:
            flights.append(row["destination"])
        elif row["destination"] == node:
            flights.append(row["origin"])
    
    # Remove duplicates
    flights = list(set(flights))
    
    return len(flights)

In [105]:
node = "KEWR"
degree(df, node)

13

In [110]:
test_size = 100

np.random.seed(10)
test_airports = np.random.choice(airports,test_size)

start = timeit.default_timer()

for i in range(0,test_size,2):
    degree(df, test_airports[i])
    
stop = timeit.default_timer()
print("Time:", round(stop-start,2), "sec")

Time: 3.23 sec


## Graph Method

In [106]:
node = mapping_dict["KEWR"]
A[node].sum()

13

In [111]:
test_size = 100

np.random.seed(10)
test_airports = np.random.choice(airports,test_size)

start = timeit.default_timer()

for i in range(0,test_size,2):
    node = mapping_dict[test_airports[i]]
    A[node].sum()
    
stop = timeit.default_timer()
print("Time:", round(stop-start,2), "sec")

Time: 0.0 sec


## Analysis 3 : Shortest Path from A to B

### Graph Method

In [60]:
def shortest_path(A, x, y, iterations = 10):
    
    M = A.copy()
    
    i, j = mapping_dict[x], mapping_dict[y]
    iterations = 10

    for k in range(iterations):
        if M[i,j] == 0:
            M = np.matmul(M,A)
        else:
            return (x,y,k)

In [13]:
home_airport = "LFPO"
vacation_destinations = ["YWHA", "LTAC", "LIRF", "EVRA", "KRFI"]
max_changes = 2

n_changes = [shortest_path(A, home_airport, destination) for destination in vacation_destinations]
acceptable = [destination for location, destination, changes in n_changes if changes <= 2]
acceptable

['LIRF', 'EVRA']

In [48]:
np.random.seed(10)
test_airports = np.random.choice(airports,20)

start = timeit.default_timer()

for i in range(0,20,2):
    shortest_path(A, test_airports[i], test_airports[i+1], iterations = 10)
    
stop = timeit.default_timer()
print("Time:", round(stop-start,2), "sec")

Time: 18.06 sec
