# Homework 5 - USA Airport Flight Analysis

In [None]:
import pandas as pd
from typing import Dict, Any, List, Tuple, Set
from collections import defaultdict


## EDA

In [3]:

file_path = "archive_usa_airport\Airports2.csv"
df = pd.read_csv(file_path)
df.isna().sum()

Origin_airport               0
Destination_airport          0
Origin_city                  0
Destination_city             0
Passengers                   0
Seats                        0
Flights                      0
Distance                     0
Fly_date                     0
Origin_population            0
Destination_population       0
Org_airport_lat           6954
Org_airport_long          6954
Dest_airport_lat          6807
Dest_airport_long         6807
dtype: int64

In [4]:

def find_missing_coordinates(df: pd.DataFrame) -> Tuple[Set[Tuple[str, str]], int]:
    """
    Find the airports with missing data.
    """
    missing_org = df[(df['Org_airport_lat'].isna() & df['Org_airport_long'].isna())]
    missing_dest = df[(df['Dest_airport_lat'].isna() & df['Dest_airport_long'].isna())]
    
    unique_origin = set(zip(missing_org['Origin_airport'], missing_org['Origin_city']))
    unique_dest = set(zip(missing_dest['Destination_airport'], missing_dest['Destination_city']))
    
    unique_airports = unique_origin.union(unique_dest)
    return unique_airports, len(unique_airports)
unique_airports, missing_count = find_missing_coordinates(df)
print(f"Numero di aeroporti con coordinate mancanti: {missing_count}")




Numero di aeroporti con coordinate mancanti: 243


In [6]:
def export_missing_airports(airports: Set[Tuple[str, str]], output_file: str) -> None:
    """
    Export the missing airports to a CSV file. In order to fetch coordinates through API
    """
    df_missing = pd.DataFrame(list(airports), columns=['Airport Code', 'City'])
    df_missing.to_csv(output_file, index=False, sep=';')

export_missing_airports(unique_airports, 'archive_usa_airport/airports_without_coordinates.csv')



In [8]:

coordinates_file = "archive_usa_airport/airport_coordinates_.csv"
coordinates_df = pd.read_csv(coordinates_file, sep=';')

def create_airport_mapping(coordinates_df: pd.DataFrame) -> Dict:
    """
    A mapping of airport codes to their corresponding coordinates.
    """
    return coordinates_df.set_index('airport_code')[['latitude', 'longitude']].to_dict(orient='index')

airport_mapping = create_airport_mapping(coordinates_df)

def fill_missing_coordinates(df: pd.DataFrame, airport_mapping: Dict) -> pd.DataFrame:
   
   
    df['Org_airport_lat'] = df['Org_airport_lat'].fillna(
        df['Origin_airport'].map(lambda x: airport_mapping.get(x, {}).get('latitude'))
    )
    df['Org_airport_long'] = df['Org_airport_long'].fillna(
        df['Origin_airport'].map(lambda x: airport_mapping.get(x, {}).get('longitude'))
    )
    
    df['Dest_airport_lat'] = df['Dest_airport_lat'].fillna(
        df['Destination_airport'].map(lambda x: airport_mapping.get(x, {}).get('latitude'))
    )
    df['Dest_airport_long'] = df['Dest_airport_long'].fillna(
        df['Destination_airport'].map(lambda x: airport_mapping.get(x, {}).get('longitude'))
    )
    
    return df


df_final = fill_missing_coordinates(df, airport_mapping)
df_final.isna().sum()

Origin_airport            0
Destination_airport       0
Origin_city               0
Destination_city          0
Passengers                0
Seats                     0
Flights                   0
Distance                  0
Fly_date                  0
Origin_population         0
Destination_population    0
Org_airport_lat           0
Org_airport_long          0
Dest_airport_lat          0
Dest_airport_long         0
dtype: int64

In [9]:
df_final

Unnamed: 0,Origin_airport,Destination_airport,Origin_city,Destination_city,Passengers,Seats,Flights,Distance,Fly_date,Origin_population,Destination_population,Org_airport_lat,Org_airport_long,Dest_airport_lat,Dest_airport_long
0,MHK,AMW,"Manhattan, KS","Ames, IA",21,30,1,254,2008-10-01,122049,86219,39.140999,-96.670799,42.026757,-93.617045
1,EUG,RDM,"Eugene, OR","Bend, OR",41,396,22,103,1990-11-01,284093,76034,44.124599,-123.211998,44.254101,-121.150002
2,EUG,RDM,"Eugene, OR","Bend, OR",88,342,19,103,1990-12-01,284093,76034,44.124599,-123.211998,44.254101,-121.150002
3,EUG,RDM,"Eugene, OR","Bend, OR",11,72,4,103,1990-10-01,284093,76034,44.124599,-123.211998,44.254101,-121.150002
4,MFR,RDM,"Medford, OR","Bend, OR",0,18,1,156,1990-02-01,147300,76034,42.374199,-122.873001,44.254101,-121.150002
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3606798,STL,TBN,"St. Louis, MO","Fort Leonard Wood, MO",281,969,51,119,2009-02-01,2828990,46457,38.748699,-90.370003,37.741600,-92.140701
3606799,STL,TBN,"St. Louis, MO","Fort Leonard Wood, MO",245,1026,54,119,2009-11-01,2828990,46457,38.748699,-90.370003,37.741600,-92.140701
3606800,STL,TBN,"St. Louis, MO","Fort Leonard Wood, MO",363,1273,67,119,2009-08-01,2828990,46457,38.748699,-90.370003,37.741600,-92.140701
3606801,CGI,TBN,"Cape Girardeau, MO","Fort Leonard Wood, MO",2,19,1,146,2009-08-01,93712,46457,37.225300,-89.570801,37.741600,-92.140701


## 1. Flight Network Analysis (Q1)

In [11]:
import sys
import os

# Add the 'auxiliary files' folder to the system path
sys.path.append(os.path.join(os.getcwd(), 'auxiliary files'))

from flight_network import FlightNetwork

def analyze_graph(flight_network: FlightNetwork) -> Dict[str, Any]:
    n_airports = len(flight_network.nodes)
    n_flights = len(flight_network.edges)
    
    density = (2 * n_flights) / (n_airports * (n_airports - 1)) if n_airports > 1 else 0.0
    
    in_degrees = {node: flight_network.in_degree(node) for node in flight_network.nodes}
    out_degrees = {node: flight_network.out_degree(node) for node in flight_network.nodes}
    
    hubs = [node for node in flight_network.nodes if flight_network.in_degree(node) + flight_network.out_degree(node) > 2]
    
    return {
        'n_airports': n_airports,
        'n_flights': n_flights,
        'density': density,
        'in_degrees': in_degrees,
        'out_degrees': out_degrees,
        'hubs': hubs
    }

df 
network = FlightNetwork()
network.add_nodes_and_edges(df_final['Origin_airport'], df_final['Destination_airport'])
results = analyze_graph(network)
print("Network Analysis Results:")
print(results)



Network Analysis Results:
{'n_airports': 727, 'n_flights': 36719, 'density': 0.1391392984490396, 'in_degrees': {'HKS': 82, 'CLE': 56031, 'ERI': 2161, 'MKC': 165, 'ABQ': 21806, 'GMU': 25, 'VIS': 240, 'RIV': 499, 'STK': 1, 'IRK': 104, 'GSB': 11, 'FFM': 9, 'MOT': 1271, 'CPR': 1418, 'AR1': 1, 'SFB': 2812, 'FYV': 873, 'FFT': 6, 'EAU': 566, 'DMA': 20, 'OCF': 75, 'RBG': 231, 'OSU': 3, 'ART': 219, 'PHD': 1, 'BTR': 5625, 'PNC': 81, 'OAJ': 852, 'ICT': 8999, 'DAL': 13542, 'SHV': 8353, 'GWO': 10, 'CSG': 1301, 'MRC': 105, 'YKM': 1724, 'NZY': 133, 'GRR': 11120, 'WRB': 21, 'ACT': 1746, 'MHR': 1011, 'MXF': 7, 'ISM': 7, 'ODW': 249, 'AIY': 1, 'DNE': 2, 'PDK': 92, 'CEV': 3, 'DMO': 0, 'RBL': 1, 'LCI': 3, 'MKT': 2, 'SYI': 8, 'DRO': 985, 'BWG': 308, 'IKK': 1, 'ULS': 1, 'TYS': 10273, 'AYS': 1, 'ASN': 32, 'ELN': 9, 'SVH': 2, 'PHL': 82498, 'LAL': 4, 'LSV': 176, 'EAT': 903, 'TUP': 307, 'CNW': 69, 'CWI': 2, 'NPA': 12, 'MOB': 3615, 'DWH': 1, 'FSM': 1339, 'PIH': 826, 'DVT': 9, 'MGW': 270, 'MZZ': 1, 'BFF': 446, 'BF