# OpenFlights Datasets

In [1]:
# If running in a fresh environment, uncomment the next line to install pandas and requests
# %pip install -q pandas requests

import pandas as pd
from io import StringIO
import requests

# Download locations of the three OpenFlights data files used in this notebook.
AIRPORTS_URL = "https://raw.githubusercontent.com/jpatokal/openflights/master/data/airports.dat"
AIRLINES_URL = "https://raw.githubusercontent.com/jpatokal/openflights/master/data/airlines.dat"
ROUTES_URL = "https://raw.githubusercontent.com/jpatokal/openflights/master/data/routes.dat"

# Explicit schemas based on OpenFlights documentation.
# The files are parsed with header=None and these lists passed as names=, so the
# names are applied positionally: the order here must match the field order in the file.
airport_columns = [
    "airport_id", "name", "city", "country", "iata", "icao",
    "latitude", "longitude", "altitude", "timezone", "dst",
    "tz_database_time_zone", "type", "source"
]

# Column names for airlines.dat (8 fields per record).
airline_columns = [
    "airline_id", "name", "alias", "iata", "icao", "callsign",
    "country", "active"
]

# Column names for routes.dat (9 fields per record).
route_columns = [
    "airline", "airline_id", "source_airport", "source_airport_id",
    "destination_airport", "destination_airport_id", "codeshare",
    "stops", "equipment"
]


def fetch_csv(url: str) -> str:
    """Download ``url`` and return the response body as decoded text.

    The request is abandoned after 30 seconds, and ``raise_for_status()``
    turns an HTTP error status into an exception instead of returning the
    error page as if it were data.
    """
    reply = requests.get(url, timeout=30)
    reply.raise_for_status()
    body = reply.text
    return body


def load_openflights_table(url: str, columns: list[str]) -> pd.DataFrame:
    """Download one OpenFlights ``.dat`` file and parse it into a DataFrame.

    Args:
        url: Location of the header-less, comma-separated data file.
        columns: Column names, applied positionally to the parsed fields.

    Returns:
        A DataFrame with one row per record. Empty fields become NaN; every
        other value (including the ``\\N`` null marker) is kept as read so the
        cleaning cells can decide how to treat it.
    """
    raw_text = fetch_csv(url)
    # OpenFlights files are comma-separated but may include quoted fields.
    # pandas' default NA list contains strings such as "NA", "NULL" and "None",
    # which would silently erase legitimate values (e.g. the IATA code "NA").
    # Only a truly empty field is treated as missing here.
    df = pd.read_csv(
        StringIO(raw_text),
        header=None,
        names=columns,
        keep_default_na=False,
        na_values=[""],
    )
    return df


# Download and parse the three OpenFlights tables
airports_df = load_openflights_table(AIRPORTS_URL, airport_columns)
airlines_df = load_openflights_table(AIRLINES_URL, airline_columns)
routes_df = load_openflights_table(ROUTES_URL, route_columns)

# Basic type coercions for numeric-like columns.
# errors="coerce" turns anything that cannot be parsed as a number into NaN.
numeric_columns_by_table = (
    (airports_df, ["airport_id", "altitude", "timezone", "latitude", "longitude"]),
    (airlines_df, ["airline_id"]),
    (routes_df, ["stops"]),
)
for table, numeric_columns in numeric_columns_by_table:
    for col in numeric_columns:
        if col not in table.columns:
            continue
        table[col] = pd.to_numeric(table[col], errors="coerce")

airports.dat

Airport ID, Name, City, Country, IATA, ICAO,
Latitude, Longitude, Altitude, Timezone, DST,
Tz database time zone, Type, Source

In [2]:
airports_df

Unnamed: 0,airport_id,name,city,country,iata,icao,latitude,longitude,altitude,timezone,dst,tz_database_time_zone,type,source
0,1,Goroka Airport,Goroka,Papua New Guinea,GKA,AYGA,-6.081690,145.391998,5282,10.0,U,Pacific/Port_Moresby,airport,OurAirports
1,2,Madang Airport,Madang,Papua New Guinea,MAG,AYMD,-5.207080,145.789001,20,10.0,U,Pacific/Port_Moresby,airport,OurAirports
2,3,Mount Hagen Kagamuga Airport,Mount Hagen,Papua New Guinea,HGU,AYMH,-5.826790,144.296005,5388,10.0,U,Pacific/Port_Moresby,airport,OurAirports
3,4,Nadzab Airport,Nadzab,Papua New Guinea,LAE,AYNZ,-6.569803,146.725977,239,10.0,U,Pacific/Port_Moresby,airport,OurAirports
4,5,Port Moresby Jacksons International Airport,Port Moresby,Papua New Guinea,POM,AYPY,-9.443380,147.220001,146,10.0,U,Pacific/Port_Moresby,airport,OurAirports
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7693,14106,Rogachyovo Air Base,Belaya,Russia,\N,ULDA,71.616699,52.478298,272,,\N,\N,airport,OurAirports
7694,14107,Ulan-Ude East Airport,Ulan Ude,Russia,\N,XIUW,51.849998,107.737999,1670,,\N,\N,airport,OurAirports
7695,14108,Krechevitsy Air Base,Novgorod,Russia,\N,ULLK,58.625000,31.385000,85,,\N,\N,airport,OurAirports
7696,14109,Desierto de Atacama Airport,Copiapo,Chile,CPO,SCAT,-27.261200,-70.779198,670,,\N,\N,airport,OurAirports


airlines.dat

Airline ID, Name, Alias, IATA, ICAO, Callsign, Country, Active

In [3]:
airlines_df

Unnamed: 0,airline_id,name,alias,iata,icao,callsign,country,active
0,-1,Unknown,\N,-,,\N,\N,Y
1,1,Private flight,\N,-,,,,Y
2,2,135 Airways,\N,,GNL,GENERAL,United States,N
3,3,1Time Airline,\N,1T,RNX,NEXTIME,South Africa,Y
4,4,2 Sqn No 1 Elementary Flying Training School,\N,,WYT,,United Kingdom,N
...,...,...,...,...,...,...,...,...
6157,21248,GX Airlines,,,CBG,SPRAY,China,Y
6158,21251,Lynx Aviation (L3/SSX),,,SSX,Shasta,United States,N
6159,21268,Jetgo Australia,,JG,\N,,Australia,Y
6160,21270,Air Carnival,,2S,\N,,India,Y


routes.dat

Airline, Airline ID, Source airport, Source airport ID,
Destination airport, Destination airport ID, Codeshare,
Stops, Equipment

In [4]:
routes_df

Unnamed: 0,airline,airline_id,source_airport,source_airport_id,destination_airport,destination_airport_id,codeshare,stops,equipment
0,2B,410,AER,2965,KZN,2990,,0,CR2
1,2B,410,ASF,2966,KZN,2990,,0,CR2
2,2B,410,ASF,2966,MRV,2962,,0,CR2
3,2B,410,CEK,2968,KZN,2990,,0,CR2
4,2B,410,CEK,2968,OVB,4078,,0,CR2
...,...,...,...,...,...,...,...,...,...
67658,ZL,4178,WYA,6334,ADL,3341,,0,SF3
67659,ZM,19016,DME,4029,FRU,2912,,0,734
67660,ZM,19016,FRU,2912,DME,4029,,0,734
67661,ZM,19016,FRU,2912,OSS,2913,,0,734


In [5]:
# Write the raw (as-downloaded) tables to CSV files
from datetime import datetime
import os

# The data folder sits at the same level as the notebook folder; create it if needed
os.makedirs('../data/raw', exist_ok=True)

# Run timestamp (deliberately not part of the filenames, so they stay stable)
timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')

# One CSV per table, written without the positional index
raw_exports = (
    (airports_df, '../data/raw/airports.csv'),
    (airlines_df, '../data/raw/airlines.csv'),
    (routes_df, '../data/raw/routes.csv'),
)
for raw_table, raw_csv_path in raw_exports:
    raw_table.to_csv(raw_csv_path, index=False, encoding='utf-8')


# Data Cleaning 

In [6]:
# FINAL DATA CLEANING PROCESS
# Create cleaned datasets from original data

# Airports cleaning (works on a copy so airports_df stays as loaded)
airports_cleaned = airports_df.copy()

# 1. Map null markers and placeholder strings to a real missing value
airports_cleaned = airports_cleaned.replace({
    '\\N': pd.NA,
    'nan': pd.NA,
    'NaN': pd.NA,
    '': pd.NA,
    'Unknown': pd.NA,
    'unknown': pd.NA
})

# 2. Clean IATA and ICAO codes ('-' is treated as "no code")
airports_cleaned['iata'] = airports_cleaned['iata'].replace(['-', 'nan', 'NaN'], pd.NA)
airports_cleaned['icao'] = airports_cleaned['icao'].replace(['-', 'nan', 'NaN'], pd.NA)

# 3. Coordinate validation: drop rows without coordinates or outside valid ranges
airports_cleaned = airports_cleaned.dropna(subset=['latitude', 'longitude'])
airports_cleaned = airports_cleaned[
    (airports_cleaned['latitude'] >= -90) & (airports_cleaned['latitude'] <= 90) &
    (airports_cleaned['longitude'] >= -180) & (airports_cleaned['longitude'] <= 180)
]

# 4. Clean other columns
airports_cleaned['altitude'] = pd.to_numeric(airports_cleaned['altitude'], errors='coerce').fillna(0)
airports_cleaned['timezone'] = pd.to_numeric(airports_cleaned['timezone'], errors='coerce')
# Identity mapping: recognised DST codes pass through, anything else becomes NaN and then 'U'
dst_mapping = {'E': 'E', 'A': 'A', 'S': 'S', 'O': 'O', 'Z': 'Z', 'N': 'N', 'U': 'U'}
airports_cleaned['dst'] = airports_cleaned['dst'].map(dst_mapping).fillna('U')
airports_cleaned['type'] = airports_cleaned['type'].fillna('airport')
airports_cleaned['source'] = airports_cleaned['source'].fillna('Unknown')
airports_cleaned = airports_cleaned.drop_duplicates(subset=['airport_id'], keep='first')

# Clean string columns.
# astype(str) renders pd.NA as the literal text '<NA>' (and NaN as 'nan'), which would
# turn missing codes into fake string values. Strip the stringified values, then put a
# real missing marker back wherever the original value was missing.
string_columns = ['name', 'city', 'country', 'iata', 'icao', 'tz_database_time_zone', 'type', 'source']
for col in string_columns:
    if col in airports_cleaned.columns:
        original_values = airports_cleaned[col]
        stripped_values = original_values.astype(str).str.strip()
        airports_cleaned[col] = stripped_values.where(original_values.notna(), pd.NA)
        airports_cleaned[col] = airports_cleaned[col].replace('nan', pd.NA)

print("Airports cleaned successfully!")
print(f"Airports: {len(airports_cleaned)} rows")


Airports cleaned successfully!
Airports: 7698 rows


In [7]:
# Airlines cleaning (works on a copy so airlines_df stays as loaded)
airlines_cleaned = airlines_df.copy()

# 1. Map null markers and placeholder strings to a real missing value
airlines_cleaned = airlines_cleaned.replace({
    '\\N': pd.NA,
    'nan': pd.NA,
    'NaN': pd.NA,
    '': pd.NA,
    'Unknown': pd.NA,
    'unknown': pd.NA,
    '-': pd.NA
})

# 2. Clean airline_id - remove invalid IDs (non-numeric, missing, zero or negative)
airlines_cleaned['airline_id'] = pd.to_numeric(airlines_cleaned['airline_id'], errors='coerce')
airlines_cleaned = airlines_cleaned[airlines_cleaned['airline_id'] > 0]

# 3. Clean IATA and ICAO codes
airlines_cleaned['iata'] = airlines_cleaned['iata'].replace(['-', 'nan', 'NaN'], pd.NA)
airlines_cleaned['icao'] = airlines_cleaned['icao'].replace(['-', 'nan', 'NaN'], pd.NA)

# 4. Remove airlines without any valid codes (both IATA and ICAO missing)
airlines_cleaned = airlines_cleaned[
    ~(airlines_cleaned['iata'].isna() & airlines_cleaned['icao'].isna())
]

# 5. Clean other columns
airlines_cleaned['callsign'] = airlines_cleaned['callsign'].replace(['-', 'nan', 'NaN'], pd.NA)
# A missing "active" flag is treated as not active
airlines_cleaned['active'] = airlines_cleaned['active'].replace(['\\N', 'nan', 'NaN'], 'N').fillna('N')
airlines_cleaned['country'] = airlines_cleaned['country'].fillna('Unknown')
airlines_cleaned = airlines_cleaned.drop_duplicates(subset=['airline_id'], keep='first')

# Clean string columns.
# astype(str) renders pd.NA as the literal text '<NA>' (and NaN as 'nan'), which would
# turn missing aliases/codes into fake string values. Strip the stringified values, then
# put a real missing marker back wherever the original value was missing.
string_columns = ['name', 'alias', 'iata', 'icao', 'callsign', 'country']
for col in string_columns:
    if col in airlines_cleaned.columns:
        original_values = airlines_cleaned[col]
        stripped_values = original_values.astype(str).str.strip()
        airlines_cleaned[col] = stripped_values.where(original_values.notna(), pd.NA)
        airlines_cleaned[col] = airlines_cleaned[col].replace('nan', pd.NA)

print("Airlines cleaned successfully!")
print(f"Airlines: {len(airlines_cleaned)} rows")


Airlines cleaned successfully!
Airlines: 6159 rows


In [8]:
# Routes cleaning (works on a copy so routes_df stays as loaded)
routes_cleaned = routes_df.copy()

# 1. Map null markers and placeholder strings to a real missing value
routes_cleaned = routes_cleaned.replace({
    '\\N': pd.NA,
    'nan': pd.NA,
    'NaN': pd.NA,
    '': pd.NA,
    'Unknown': pd.NA,
    'unknown': pd.NA,
    '-': pd.NA
})

# 2. Clean airline_id - remove invalid IDs (non-numeric, missing, zero or negative)
routes_cleaned['airline_id'] = pd.to_numeric(routes_cleaned['airline_id'], errors='coerce')
routes_cleaned = routes_cleaned[routes_cleaned['airline_id'] > 0]

# 3. Clean airport IDs
routes_cleaned['source_airport_id'] = pd.to_numeric(routes_cleaned['source_airport_id'], errors='coerce')
routes_cleaned['destination_airport_id'] = pd.to_numeric(routes_cleaned['destination_airport_id'], errors='coerce')

# 4. Remove routes with missing critical information
routes_cleaned = routes_cleaned.dropna(subset=['source_airport', 'destination_airport'])
routes_cleaned = routes_cleaned.dropna(subset=['source_airport_id', 'destination_airport_id'])

# 5. Clean other columns
routes_cleaned['stops'] = pd.to_numeric(routes_cleaned['stops'], errors='coerce').fillna(0)
routes_cleaned['codeshare'] = routes_cleaned['codeshare'].fillna('N')
routes_cleaned['equipment'] = routes_cleaned['equipment'].fillna('Unknown')

# 6. Remove routes where source and destination are the same
routes_cleaned = routes_cleaned[
    routes_cleaned['source_airport'] != routes_cleaned['destination_airport']
]

# 7. Remove duplicate routes (same airline flying the same airport pair)
routes_cleaned = routes_cleaned.drop_duplicates(
    subset=['airline_id', 'source_airport_id', 'destination_airport_id'],
    keep='first'
)

# 8. Validate references: keep only routes whose airline and both airports survived cleaning
valid_airline_ids = set(airlines_cleaned['airline_id'].dropna())
valid_airport_ids = set(airports_cleaned['airport_id'].dropna())
routes_cleaned = routes_cleaned[routes_cleaned['airline_id'].isin(valid_airline_ids)]
routes_cleaned = routes_cleaned[routes_cleaned['source_airport_id'].isin(valid_airport_ids)]
routes_cleaned = routes_cleaned[routes_cleaned['destination_airport_id'].isin(valid_airport_ids)]

# Clean string columns.
# astype(str) renders pd.NA as the literal text '<NA>' (and NaN as 'nan'), which would
# turn a missing value into a fake string. Strip the stringified values, then put a real
# missing marker back wherever the original value was missing.
string_columns = ['airline', 'source_airport', 'destination_airport', 'codeshare', 'equipment']
for col in string_columns:
    if col in routes_cleaned.columns:
        original_values = routes_cleaned[col]
        stripped_values = original_values.astype(str).str.strip()
        routes_cleaned[col] = stripped_values.where(original_values.notna(), pd.NA)
        routes_cleaned[col] = routes_cleaned[col].replace('nan', pd.NA)

print("Routes cleaned successfully!")
print(f"Routes: {len(routes_cleaned)} rows")


Routes cleaned successfully!
Routes: 66315 rows


In [9]:
# EXPORT FINAL CLEANED DATASETS
from datetime import datetime
import os

# The data folder sits at the same level as the notebook folder; create it if needed
os.makedirs('../data/cleaned', exist_ok=True)

# Run timestamp (deliberately not part of the filenames, so they stay stable)
timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')

# One CSV per cleaned table, written without the positional index
cleaned_exports = (
    (airports_cleaned, '../data/cleaned/airports_cleaned.csv'),
    (airlines_cleaned, '../data/cleaned/airlines_cleaned.csv'),
    (routes_cleaned, '../data/cleaned/routes_cleaned.csv'),
)
for cleaned_table, cleaned_csv_path in cleaned_exports:
    cleaned_table.to_csv(cleaned_csv_path, index=False, encoding='utf-8')

# Flight Route Analysis & Visualization


In [10]:

# %pip install -q networkx streamlit folium streamlit-folium matplotlib

import networkx as nx
import matplotlib.pyplot as plt
import math
from typing import Tuple, Dict, Any


In [11]:
# Distance calculation functions
def haversine_distance(lat1: float, lon1: float, lat2: float, lon2: float,
                       radius_km: float = 6371.0) -> float:
    """Calculate the great circle distance between two points on a sphere.

    Args:
        lat1, lon1: Latitude and longitude of the first point, in decimal degrees.
        lat2, lon2: Latitude and longitude of the second point, in decimal degrees.
        radius_km: Sphere radius in kilometers; defaults to 6371, the value
            this notebook uses for Earth.

    Returns:
        The distance along the sphere's surface, in the same unit as ``radius_km``.
        NaN coordinates yield NaN.
    """
    # Convert decimal degrees to radians
    lat1, lon1, lat2, lon2 = map(math.radians, [lat1, lon1, lat2, lon2])

    # Haversine formula
    dlat = lat2 - lat1
    dlon = lon2 - lon1
    a = math.sin(dlat / 2) ** 2 + math.cos(lat1) * math.cos(lat2) * math.sin(dlon / 2) ** 2

    # Floating-point rounding can push `a` a hair above 1 for near-antipodal points,
    # which would make asin() raise a domain error. The comparison is False for NaN,
    # so NaN inputs still propagate as NaN.
    if a > 1.0:
        a = 1.0
    c = 2 * math.asin(math.sqrt(a))

    return c * radius_km

def calculate_route_distances(routes_df, airports_df):
    """Calculate distances for all routes using airport coordinates.

    Args:
        routes_df: DataFrame with 'source_airport_id' and 'destination_airport_id' columns.
        airports_df: DataFrame with 'airport_id', 'latitude' and 'longitude' columns.

    Returns:
        A copy of ``routes_df`` with an added 'distance_km' column. Routes whose
        source or destination ID has no entry in ``airports_df`` get a missing value.
    """
    # Lookup of airport_id -> (latitude, longitude). Iterating the columns directly
    # avoids iterrows(), which builds a Series per row and is slow on large frames.
    # If an ID occurs more than once, the last occurrence wins.
    airport_coords = dict(zip(
        airports_df['airport_id'],
        zip(airports_df['latitude'], airports_df['longitude']),
    ))

    # Calculate distances in route order so the list lines up with the rows
    distances = []
    endpoint_ids = zip(routes_df['source_airport_id'], routes_df['destination_airport_id'])
    for source_id, dest_id in endpoint_ids:
        if source_id in airport_coords and dest_id in airport_coords:
            lat1, lon1 = airport_coords[source_id]
            lat2, lon2 = airport_coords[dest_id]
            distances.append(haversine_distance(lat1, lon1, lat2, lon2))
        else:
            distances.append(None)  # Missing coordinate data

    routes_with_distance = routes_df.copy()
    routes_with_distance['distance_km'] = distances

    return routes_with_distance

# Confirmation message so the cell output shows the definitions above were executed
print("Distance calculation functions defined!")


Distance calculation functions defined!


In [12]:
# Attach a great-circle distance to every cleaned route
print("Calculating route distances...")
routes_with_distance = calculate_route_distances(routes_cleaned, airports_cleaned)

# Report how many routes exist and how many received a distance
print(f"Routes with distance calculated: {len(routes_with_distance)}")
print(f"Routes with valid distance: {routes_with_distance['distance_km'].notna().sum()}")

# Preview the first ten routes (the last expression is rendered as the cell output)
print("\nSample routes with distances:")
sample_columns = ['source_airport', 'destination_airport', 'distance_km']
routes_with_distance.head(10)[sample_columns]


Calculating route distances...
Routes with distance calculated: 66315
Routes with valid distance: 66315

Sample routes with distances:


Unnamed: 0,source_airport,destination_airport,distance_km
0,AER,KZN,1506.825604
1,ASF,KZN,1040.43832
2,ASF,MRV,448.164909
3,CEK,KZN,770.5085
4,CEK,OVB,1338.631467
5,DME,KZN,715.64935
6,DME,NBC,892.382788
8,DME,UUA,951.432198
9,EGO,KGD,1171.881495
10,EGO,KZN,1008.25311


In [13]:
# Build flight network graph
print("Building flight network graph...")

# Undirected graph: nodes are airport IDs, edges are routes
G = nx.Graph()

# Add nodes (airports); each tuple is (attribute/column name, fallback if the column is absent)
node_fields = (
    ('iata', ''),
    ('name', ''),
    ('city', ''),
    ('country', ''),
    ('latitude', 0),
    ('longitude', 0),
)
for _, airport in airports_cleaned.iterrows():
    node_attributes = {field: airport.get(field, fallback) for field, fallback in node_fields}
    G.add_node(airport['airport_id'], **node_attributes)

# Add edges (routes); routes without a computed distance are skipped.
# Adding an edge between the same airport pair again updates the existing edge's attributes.
for _, route in routes_with_distance.iterrows():
    if pd.isna(route.get('distance_km')):
        continue
    G.add_edge(
        route['source_airport_id'],
        route['destination_airport_id'],
        distance=route['distance_km'],
        airline_id=route.get('airline_id', ''),
        stops=route.get('stops', 0)
    )

print(f"Graph created with {G.number_of_nodes()} airports and {G.number_of_edges()} routes")
print(f"Network density: {nx.density(G):.4f}")
print(f"Is connected: {nx.is_connected(G)}")


Building flight network graph...
Graph created with 7698 airports and 18679 routes
Network density: 0.0006
Is connected: False


In [14]:
# Flight route analysis functions
def find_shortest_path(G, airports_df, source_iata, dest_iata):
    """Find the shortest path (by total 'distance' edge weight) between two airports.

    Args:
        G: NetworkX graph whose nodes are airport IDs and whose edges carry a
            'distance' attribute.
        airports_df: DataFrame with at least 'airport_id' and 'iata' columns.
        source_iata: IATA code of the departure airport.
        dest_iata: IATA code of the arrival airport.

    Returns:
        On success, a dict with keys 'source', 'destination', 'path' (IATA codes
        along the route), 'total_distance_km', 'legs' and 'num_stops'.
        On failure, a dict with a single 'error' key.
    """
    # Find airport IDs from IATA codes (first match wins)
    source_airport = airports_df[airports_df['iata'] == source_iata]
    dest_airport = airports_df[airports_df['iata'] == dest_iata]

    if source_airport.empty or dest_airport.empty:
        return {"error": "Airport not found"}

    source_id = source_airport.iloc[0]['airport_id']
    dest_id = dest_airport.iloc[0]['airport_id']

    try:
        path = nx.shortest_path(G, source_id, dest_id, weight='distance')
    except nx.NetworkXNoPath:
        return {"error": "No path found between airports"}
    except nx.NodeNotFound:
        # The airport is listed in airports_df but is not a node of G
        return {"error": "Airport not found in network"}

    # Resolve the IATA code of every airport on the path with a single lookup
    # (first row per airport_id), instead of filtering the frame once per node.
    on_path = airports_df[airports_df['airport_id'].isin(path)]
    on_path = on_path.drop_duplicates(subset='airport_id', keep='first')
    iata_by_id = dict(zip(on_path['airport_id'], on_path['iata']))
    path_iata = [iata_by_id.get(airport_id, '') for airport_id in path]

    # Walk consecutive node pairs to collect per-leg and total distance
    total_distance = 0
    legs = []
    for i in range(len(path) - 1):
        distance = G[path[i]][path[i + 1]].get('distance', 0)
        total_distance += distance
        legs.append({
            "from": path_iata[i],
            "to": path_iata[i + 1],
            "distance_km": round(distance, 2)
        })

    return {
        "source": source_iata,
        "destination": dest_iata,
        "path": path_iata,
        "total_distance_km": round(total_distance, 2),
        "legs": legs,
        # Intermediate airports only; never negative when source == destination
        "num_stops": max(len(path) - 2, 0)
    }

def analyze_hubs(G, airports_df, country=None, top_n=10):
    """Analyze airport hubs using centrality measures.

    Args:
        G: NetworkX graph whose nodes are airport IDs and whose edges carry a
            'distance' attribute.
        airports_df: DataFrame with 'airport_id' and 'country' columns; 'iata',
            'name' and 'city' are reported when present.
        country: Optional case-insensitive substring to filter airports by country.
            It is matched literally, so names containing regex characters such as
            "Congo (Kinshasa)" work as written.
        top_n: Maximum number of hubs to return.

    Returns:
        A dict with 'country' (the filter, or "Global"), 'top_hubs' (list of dicts
        sorted by degree centrality, highest first) and 'total_airports_analyzed'.
    """
    # Filter airports by country if specified
    airports_to_analyze = airports_df
    if country:
        airports_to_analyze = airports_df[
            airports_df['country'].str.contains(country, case=False, na=False, regex=False)
        ]

    # Nothing to rank: skip the expensive whole-graph centrality computation
    if len(airports_to_analyze) == 0:
        return {
            "country": country or "Global",
            "top_hubs": [],
            "total_airports_analyzed": 0
        }

    # Calculate centrality measures (over the whole graph, not just the filtered airports)
    degree_centrality = nx.degree_centrality(G)
    betweenness_centrality = nx.betweenness_centrality(G, weight='distance')

    # One dict of column values per airport_id (first row wins), built once so the
    # loop below does not have to filter the frame for every airport.
    first_rows = airports_to_analyze.drop_duplicates(subset='airport_id', keep='first')
    info_by_id = first_rows.set_index('airport_id').to_dict('index')

    ranked = []
    for airport_id in airports_to_analyze['airport_id']:
        if airport_id not in G.nodes:
            continue
        airport_info = info_by_id.get(airport_id, {})
        degree = degree_centrality.get(airport_id, 0)
        hub = {
            "airport": airport_info.get('iata', ''),
            "name": airport_info.get('name', ''),
            "city": airport_info.get('city', ''),
            "country": airport_info.get('country', ''),
            "degree_centrality": round(degree, 3),
            "betweenness": round(betweenness_centrality.get(airport_id, 0), 3)
        }
        ranked.append((degree, hub))

    # Sort on the unrounded degree centrality so airports that only tie after
    # rounding to three decimals are still ranked correctly.
    ranked.sort(key=lambda item: item[0], reverse=True)
    top_hubs = [hub for _, hub in ranked[:top_n]]

    return {
        "country": country or "Global",
        "top_hubs": top_hubs,
        "total_airports_analyzed": len(airports_to_analyze)
    }

# Confirmation message so the cell output shows the definitions above were executed
print("Flight analysis functions defined!")


Flight analysis functions defined!


In [15]:
# Example: shortest route from Ho Chi Minh City (SGN) to London (LHR)
print("=== SHORTEST ROUTE ANALYSIS ===")
route_result = find_shortest_path(G, airports_cleaned, 'SGN', 'LHR')

# A result without an 'error' key describes a found route
if 'error' not in route_result:
    print("Shortest Route Found:")
    print(f"From: {route_result['source']}")
    print(f"To: {route_result['destination']}")
    print(f"Route: {' → '.join(route_result['path'])}")
    print(f"Total Distance: {route_result['total_distance_km']} km")
    print(f"Number of Stops: {route_result['num_stops']}")
    print("\nRoute Legs:")
    # Legs are numbered from 1 for display
    for i, leg in enumerate(route_result['legs'], 1):
        print(f"  {i}. {leg['from']} → {leg['to']} ({leg['distance_km']} km)")
else:
    print(f"Error: {route_result['error']}")


=== SHORTEST ROUTE ANALYSIS ===
Shortest Route Found:
From: SGN
To: LHR
Route: SGN → DME → LHR
Total Distance: 10229.11 km
Number of Stops: 1

Route Legs:
  1. SGN → DME (7684.06 km)
  2. DME → LHR (2545.05 km)


In [16]:
# Hub Analysis
print("=== HUB ANALYSIS ===")

# Analyze global hubs
global_hubs = analyze_hubs(G, airports_cleaned, top_n=15)
print(f"\nTop {len(global_hubs['top_hubs'])} Global Hubs:")
# Print every hub that was returned so the list length matches the "Top N" heading
# (previously the heading said 15 while only the first 10 were listed).
for i, hub in enumerate(global_hubs['top_hubs'], 1):
    print(f"{i:2d}. {hub['airport']} - {hub['name']} ({hub['city']}, {hub['country']})")
    print(f"     Degree Centrality: {hub['degree_centrality']}, Betweenness: {hub['betweenness']}")

# Analyze hubs in Vietnam.
# Note: analyze_hubs recomputes centrality over the whole graph on every call.
vietnam_hubs = analyze_hubs(G, airports_cleaned, country='Vietnam', top_n=10)
print(f"\nTop Hubs in Vietnam:")
for i, hub in enumerate(vietnam_hubs['top_hubs'], 1):
    print(f"{i}. {hub['airport']} - {hub['name']} ({hub['city']})")
    print(f"   Degree Centrality: {hub['degree_centrality']}, Betweenness: {hub['betweenness']}")


=== HUB ANALYSIS ===

Top 15 Global Hubs:
 1. FRA - Frankfurt am Main Airport (Frankfurt, Germany)
     Degree Centrality: 0.032, Betweenness: 0.003
 2. AMS - Amsterdam Airport Schiphol (Amsterdam, Netherlands)
     Degree Centrality: 0.032, Betweenness: 0.004
 3. CDG - Charles de Gaulle International Airport (Paris, France)
     Degree Centrality: 0.031, Betweenness: 0.003
 4. ISL - Atatürk International Airport (Istanbul, Turkey)
     Degree Centrality: 0.03, Betweenness: 0.007
 5. ATL - Hartsfield Jackson Atlanta International Airport (Atlanta, United States)
     Degree Centrality: 0.028, Betweenness: 0.004
 6. PEK - Beijing Capital International Airport (Beijing, China)
     Degree Centrality: 0.027, Betweenness: 0.01
 7. ORD - Chicago O'Hare International Airport (Chicago, United States)
     Degree Centrality: 0.027, Betweenness: 0.006
 8. MUC - Munich Airport (Munich, Germany)
     Degree Centrality: 0.025, Betweenness: 0.001
 9. DXB - Dubai International Airport (Dubai, United

In [17]:
# Export routes with distances for Streamlit app
print("=== EXPORTING ROUTES WITH DISTANCES ===")

# Ensure export directory exists (data folder is at same level as notebook folder)
import os
os.makedirs('../data/cleaned', exist_ok=True)

# Export routes with distances. The path is defined once so the confirmation message
# always names the file that was actually written (it previously reported
# "routes_graph_ready.csv" while writing "routes_graph.csv").
routes_graph_path = '../data/cleaned/routes_graph.csv'
routes_with_distance.to_csv(routes_graph_path, index=False, encoding='utf-8')
print(f"- {os.path.basename(routes_graph_path)} (with distances)")

print(f"Routes with distances: {routes_with_distance['distance_km'].notna().sum()} rows")

=== EXPORTING ROUTES WITH DISTANCES ===
- routes_graph_ready.csv (with distances)
Routes with distances: 66315 rows


# Network Visualization Export for Gephi


In [18]:
# Install networkx if not already installed
# %pip install -q networkx

import networkx as nx
import os
from pathlib import Path


In [19]:
# Create comprehensive network graph for Gephi visualization
print("=== CREATING COMPREHENSIVE NETWORK GRAPH ===")

# Directed graph, because a route has a source and a destination
G_gephi = nx.DiGraph()

# Add nodes (airports) with comprehensive attributes
print("Adding airport nodes...")
for _, airport in airports_cleaned.iterrows():
    raw_timezone = airport.get('timezone')
    node_attributes = {
        'label': airport.get('iata', ''),
        'name': airport.get('name', ''),
        'city': airport.get('city', ''),
        'country': airport.get('country', ''),
        'latitude': float(airport.get('latitude', 0)),
        'longitude': float(airport.get('longitude', 0)),
        'altitude': float(airport.get('altitude', 0)),
        # A missing timezone is stored as 0
        'timezone': float(raw_timezone) if pd.notna(raw_timezone) else 0,
        'type': airport.get('type', 'airport'),
        'source': airport.get('source', 'Unknown'),
    }
    G_gephi.add_node(airport['airport_id'], **node_attributes)

print(f"Added {G_gephi.number_of_nodes()} airport nodes")

# Add edges (routes) with comprehensive attributes.
# edge_count counts add_edge calls; adding the same (source, destination) pair again
# updates the existing edge, so the graph can end up with fewer edges than this count.
print("Adding route edges...")
edge_count = 0
for _, route in routes_with_distance.iterrows():
    if pd.isna(route.get('distance_km')):
        continue
    route_km = float(route['distance_km'])
    raw_airline_id = route.get('airline_id')
    raw_stops = route.get('stops')
    G_gephi.add_edge(
        route['source_airport_id'],
        route['destination_airport_id'],
        weight=route_km,
        distance_km=route_km,
        airline_id=int(raw_airline_id) if pd.notna(raw_airline_id) else 0,
        stops=int(raw_stops) if pd.notna(raw_stops) else 0,
        codeshare=route.get('codeshare', 'N'),
        equipment=route.get('equipment', 'Unknown')
    )
    edge_count += 1

print(f"Added {edge_count} route edges")
print(f"Total graph: {G_gephi.number_of_nodes()} nodes, {G_gephi.number_of_edges()} edges")


=== CREATING COMPREHENSIVE NETWORK GRAPH ===
Adding airport nodes...
Added 7698 airport nodes
Adding route edges...
Added 66315 route edges
Total graph: 7698 nodes, 36588 edges


In [20]:
# Calculate network metrics for visualization
print("=== CALCULATING NETWORK METRICS ===")

# Centrality measures; each call returns a dict keyed by node (airport ID).
# The 'weight' edge attribute holds the route distance in km.
print("Calculating centrality measures...")
degree_centrality = nx.degree_centrality(G_gephi)
betweenness_centrality = nx.betweenness_centrality(G_gephi, weight='weight')
closeness_centrality = nx.closeness_centrality(G_gephi, distance='weight')
pagerank = nx.pagerank(G_gephi, weight='weight')

# Store each measure on its node so it is exported together with the graph
node_metrics = (
    ('degree_centrality', degree_centrality),
    ('betweenness_centrality', betweenness_centrality),
    ('closeness_centrality', closeness_centrality),
    ('pagerank', pagerank),
)
for node in G_gephi.nodes():
    node_data = G_gephi.nodes[node]
    for metric_name, metric_values in node_metrics:
        node_data[metric_name] = metric_values.get(node, 0)

# Basic whole-graph statistics
print("Calculating network statistics...")
stats = {
    'total_nodes': G_gephi.number_of_nodes(),
    'total_edges': G_gephi.number_of_edges(),
    'density': nx.density(G_gephi),
    'is_strongly_connected': nx.is_strongly_connected(G_gephi),
    'is_weakly_connected': nx.is_weakly_connected(G_gephi),
    'number_of_strongly_connected_components': nx.number_strongly_connected_components(G_gephi),
    'number_of_weakly_connected_components': nx.number_weakly_connected_components(G_gephi)
}

print("Network Statistics:")
for key, value in stats.items():
    print(f"  {key}: {value}")

# Ten nodes with the highest degree centrality
top_hubs = sorted(degree_centrality.items(), key=lambda x: x[1], reverse=True)[:10]
print(f"\nTop 10 Hubs by Degree Centrality:")
for i, (node, centrality) in enumerate(top_hubs, 1):
    airport_info = airports_cleaned[airports_cleaned['airport_id'] == node]
    if airport_info.empty:
        # No matching airport record: nothing to print for this rank
        continue
    first_match = airport_info.iloc[0]
    iata = first_match['iata']
    name = first_match['name']
    city = first_match['city']
    country = first_match['country']
    print(f"  {i:2d}. {iata} - {name} ({city}, {country}) - Centrality: {centrality:.4f}")


=== CALCULATING NETWORK METRICS ===
Calculating centrality measures...
Calculating network statistics...
Network Statistics:
  total_nodes: 7698
  total_edges: 36588
  density: 0.0006175032918150637
  is_strongly_connected: False
  is_weakly_connected: False
  number_of_strongly_connected_components: 4607
  number_of_weakly_connected_components: 4568

Top 10 Hubs by Degree Centrality:
   1. FRA - Frankfurt am Main Airport (Frankfurt, Germany) - Centrality: 0.0620
   2. CDG - Charles de Gaulle International Airport (Paris, France) - Centrality: 0.0611
   3. AMS - Amsterdam Airport Schiphol (Amsterdam, Netherlands) - Centrality: 0.0602
   4. ISL - Atatürk International Airport (Istanbul, Turkey) - Centrality: 0.0586
   5. ATL - Hartsfield Jackson Atlanta International Airport (Atlanta, United States) - Centrality: 0.0563
   6. ORD - Chicago O'Hare International Airport (Chicago, United States) - Centrality: 0.0531
   7. PEK - Beijing Capital International Airport (Beijing, China) - Centr

In [21]:
# Export multiple network views to Gephi format (.gexf)
print("=== EXPORTING MULTIPLE NETWORK VIEWS TO GEPHI ===")

# Ensure export directory exists (exist_ok=True makes re-running the cell safe)
os.makedirs('../data/gephi', exist_ok=True)

# Clean the graph for Gephi export - remove pandas NA values.
# The helpers defined below drop every node/edge attribute whose value is missing.
print("Cleaning graph attributes for Gephi compatibility...")

def clean_value(value):
    """Convert missing scalar values to None for Gephi compatibility.

    Args:
        value: Any attribute value.

    Returns:
        ``None`` when ``value`` is a missing scalar (``None``, NaN, ``pd.NA``, NaT);
        otherwise ``value`` unchanged.
    """
    # pd.isna() on a list/array returns an element-wise array whose truth value is
    # ambiguous and raises ValueError, so containers are passed through untouched.
    if not pd.api.types.is_scalar(value):
        return value
    # pd.isna() already covers pd.NA, so no separate identity check is needed
    if pd.isna(value):
        return None
    return value

def create_clean_graph(G):
    """Return a new ``nx.DiGraph`` holding G's nodes and edges with cleaned attributes.

    Every node and edge attribute is passed through ``clean_value``; attributes
    whose cleaned value is None are left out of the copy. The result is always
    a fresh ``nx.DiGraph``, and only node and edge data are copied.
    """
    def _without_missing(attrs):
        # Clean each value once, then keep only the entries that survived
        cleaned_pairs = ((key, clean_value(value)) for key, value in attrs.items())
        return {key: cleaned for key, cleaned in cleaned_pairs if cleaned is not None}

    G_clean = nx.DiGraph()

    # Nodes first, so nodes without any edge are preserved as well
    for node, node_attrs in G.nodes(data=True):
        G_clean.add_node(node, **_without_missing(node_attrs))

    for source, target, edge_attrs in G.edges(data=True):
        G_clean.add_edge(source, target, **_without_missing(edge_attrs))

    return G_clean

# 1. FULL NETWORK (original)
print("1. Creating full network...")
# Cleaned copy of the whole of G_gephi, written out as-is
full_network_path = '../data/gephi/flight_network_full.gexf'
G_full = create_clean_graph(G_gephi)
nx.write_gexf(G_full, full_network_path)
print(f"   Full network: {G_full.number_of_nodes()} nodes, {G_full.number_of_edges()} edges")

# 2. MAJOR HUBS NETWORK (top 20% by degree centrality)
print("2. Creating major hubs network...")
# Size of the top fifth of all scored nodes, truncated to an integer
top_20_percent = int(len(degree_centrality) * 0.2)
# (node, score) pairs from highest to lowest score; ties keep mapping order
hubs_ranked = sorted(degree_centrality.items(), key=lambda pair: pair[1], reverse=True)
top_hubs_nodes = [node for node, _score in hubs_ranked[:top_20_percent]]
# Restrict G_gephi to those nodes, then clean and export the result
G_major_hubs = G_gephi.subgraph(top_hubs_nodes)
G_major_hubs_clean = create_clean_graph(G_major_hubs)
major_hubs_path = '../data/gephi/flight_network_major_hubs.gexf'
nx.write_gexf(G_major_hubs_clean, major_hubs_path)
print(f"   Major hubs: {G_major_hubs_clean.number_of_nodes()} nodes, {G_major_hubs_clean.number_of_edges()} edges")

# 3. LONG DISTANCE NETWORK (routes > 5000km)
print("3. Creating long-distance network...")
# Walk the edges together with their 'distance_km' attribute; an edge without
# that attribute is treated as distance 0 and therefore never selected.
long_distance_edges = []
for source, target, distance_km in G_gephi.edges(data='distance_km', default=0):
    if distance_km > 5000:
        long_distance_edges.append((source, target))
# Subgraph induced by the selected edges, then cleaned and exported
G_long_distance = G_gephi.edge_subgraph(long_distance_edges)
G_long_distance_clean = create_clean_graph(G_long_distance)
long_distance_path = '../data/gephi/flight_network_long_distance.gexf'
nx.write_gexf(G_long_distance_clean, long_distance_path)
print(f"   Long distance: {G_long_distance_clean.number_of_nodes()} nodes, {G_long_distance_clean.number_of_edges()} edges")

# 4. INTERNATIONAL NETWORK (cross-border routes)
print("4. Creating international network...")
# Keep an edge only when both endpoints carry a usable country and the two
# countries differ. "Usable" excludes an absent attribute, None, NaN/pd.NA
# and the empty string: NaN compares unequal to everything (itself included),
# so without this check a route between two airports with a NaN country
# would be counted as cross-border, and pd.NA would raise inside the
# boolean expression.
international_edges = []
for u, v in G_gephi.edges():
    u_country = G_gephi.nodes[u].get('country')
    v_country = G_gephi.nodes[v].get('country')
    if pd.isna(u_country) or pd.isna(v_country):
        continue
    if u_country == '' or v_country == '':
        continue
    if u_country != v_country:
        international_edges.append((u, v))

# Subgraph induced by the cross-border edges, then cleaned and exported
G_international = G_gephi.edge_subgraph(international_edges)
G_international_clean = create_clean_graph(G_international)
nx.write_gexf(G_international_clean, '../data/gephi/flight_network_international.gexf')
print(f"   International: {G_international_clean.number_of_nodes()} nodes, {G_international_clean.number_of_edges()} edges")

# 5. COUNTRY-LEVEL NETWORK
print("5. Creating country-level network...")

# Group airport ids by country. A missing country value (None/NaN/pd.NA) is
# filed under 'Unknown' so that it becomes one well-defined node instead of
# one NaN key per airport.
country_airports = {}
for airport_id, country in zip(airports_cleaned['airport_id'], airports_cleaned['country']):
    if pd.isna(country):
        country = 'Unknown'
    country_airports.setdefault(country, []).append(airport_id)

# Direct airport_id -> country lookup. This replaces scanning every country's
# airport list once per route. If an id is listed under several countries the
# last country in country_airports wins, as it did with the full scan.
airport_to_country = {
    airport_id: country
    for country, airport_ids in country_airports.items()
    for airport_id in airport_ids
}

# Create country-to-country network: one node per country
G_country = nx.DiGraph()
for country, airport_ids in country_airports.items():
    G_country.add_node(country, airport_count=len(airport_ids), label=country)

# Aggregate routes into per-(source country, destination country) statistics
country_routes = {}
route_endpoints_and_distance = zip(
    routes_with_distance['source_airport_id'],
    routes_with_distance['destination_airport_id'],
    routes_with_distance['distance_km'],
)
for source_airport_id, dest_airport_id, distance_km in route_endpoints_and_distance:
    # Routes without a distance are not aggregated
    if pd.isna(distance_km):
        continue

    source_country = airport_to_country.get(source_airport_id)
    dest_country = airport_to_country.get(dest_airport_id)

    # Skip routes with an unresolved or empty endpoint country, and routes
    # that stay inside one country
    if not source_country or not dest_country or source_country == dest_country:
        continue

    stats = country_routes.setdefault(
        (source_country, dest_country),
        {'route_count': 0, 'total_distance': 0, 'avg_distance': 0},
    )
    stats['route_count'] += 1
    stats['total_distance'] += distance_km

# Add edges to country network. The average is also written back into
# country_routes so that the dict no longer carries a stale 0 placeholder.
for (source_country, dest_country), data in country_routes.items():
    data['avg_distance'] = data['total_distance'] / data['route_count']
    G_country.add_edge(source_country, dest_country,
                       route_count=data['route_count'],
                       total_distance=data['total_distance'],
                       avg_distance=data['avg_distance'],
                       weight=data['avg_distance'])

G_country_clean = create_clean_graph(G_country)
nx.write_gexf(G_country_clean, '../data/gephi/flight_network_country_level.gexf')
print(f"   Country level: {G_country_clean.number_of_nodes()} countries, {G_country_clean.number_of_edges()} connections")

# Recap: one line per .gexf file name used above, with a short description
print("\n=== NETWORK VIEWS===")
exported_view_lines = (
    "- flight_network_full.gexf (complete network)",
    "- flight_network_major_hubs.gexf (top 20% hubs)",
    "- flight_network_long_distance.gexf (routes > 5000km)",
    "- flight_network_international.gexf (cross-border routes)",
    "- flight_network_country_level.gexf (country aggregation)",
)
for view_line in exported_view_lines:
    print(view_line)


=== EXPORTING MULTIPLE NETWORK VIEWS TO GEPHI ===
Cleaning graph attributes for Gephi compatibility...
1. Creating full network...
   Full network: 7698 nodes, 36588 edges
2. Creating major hubs network...
   Major hubs: 1539 nodes, 31899 edges
3. Creating long-distance network...
   Long distance: 308 nodes, 2550 edges
4. Creating international network...
   International: 1169 nodes, 19461 edges
5. Creating country-level network...
   Country level: 237 countries, 4553 connections

=== NETWORK VIEWS===
- flight_network_full.gexf (complete network)
- flight_network_major_hubs.gexf (top 20% hubs)
- flight_network_long_distance.gexf (routes > 5000km)
- flight_network_international.gexf (cross-border routes)
- flight_network_country_level.gexf (country aggregation)


In [22]:
# Show network summary statistics
print("=== NETWORK SUMMARY ===")
# (label, graph, noun used for nodes, noun used for edges) for each exported view
summary_rows = (
    ("Full network", G_full, "airports", "routes"),
    ("Major hubs", G_major_hubs_clean, "airports", "routes"),
    ("Long distance", G_long_distance_clean, "airports", "routes"),
    ("International", G_international_clean, "airports", "routes"),
    ("Country level", G_country_clean, "countries", "connections"),
)
for view_label, view_graph, node_noun, edge_noun in summary_rows:
    print(f"{view_label}: {view_graph.number_of_nodes()} {node_noun}, {view_graph.number_of_edges()} {edge_noun}")

# Show top hubs: at most the first ten entries of top_hubs, ranked from 1
print(f"\nTop 10 Hubs by Degree Centrality:")
for i, (node, centrality) in enumerate(top_hubs[:10], 1):
    # Rows of airports_cleaned whose airport_id equals this node
    airport_info = airports_cleaned.loc[airports_cleaned['airport_id'] == node]
    if airport_info.empty:
        # No matching airport row: nothing is printed for this rank
        continue
    first_match = airport_info.iloc[0]
    iata, name, city, country = (
        first_match[field] for field in ('iata', 'name', 'city', 'country')
    )
    print(f"  {i:2d}. {iata} - {name} ({city}, {country}) - Centrality: {centrality:.4f}")

# Suggested use for each exported file, printed one per line
print(f"\n=== GEPHI ANALYSIS ===")
gephi_recommendations = (
    "1. flight_network_major_hubs.gexf - Best for hub analysis (manageable size)",
    "2. flight_network_international.gexf - Best for global connectivity",
    "3. flight_network_long_distance.gexf - Best for long-haul routes",
    "4. flight_network_country_level.gexf - Best for geopolitical analysis",
    "5. flight_network_full.gexf - Complete overview (may be too dense)",
)
for recommendation in gephi_recommendations:
    print(recommendation)

=== NETWORK SUMMARY ===
Full network: 7698 airports, 36588 routes
Major hubs: 1539 airports, 31899 routes
Long distance: 308 airports, 2550 routes
International: 1169 airports, 19461 routes
Country level: 237 countries, 4553 connections

Top 10 Hubs by Degree Centrality:
   1. FRA - Frankfurt am Main Airport (Frankfurt, Germany) - Centrality: 0.0620
   2. CDG - Charles de Gaulle International Airport (Paris, France) - Centrality: 0.0611
   3. AMS - Amsterdam Airport Schiphol (Amsterdam, Netherlands) - Centrality: 0.0602
   4. ISL - Atatürk International Airport (Istanbul, Turkey) - Centrality: 0.0586
   5. ATL - Hartsfield Jackson Atlanta International Airport (Atlanta, United States) - Centrality: 0.0563
   6. ORD - Chicago O'Hare International Airport (Chicago, United States) - Centrality: 0.0531
   7. PEK - Beijing Capital International Airport (Beijing, China) - Centrality: 0.0530
   8. MUC - Munich Airport (Munich, Germany) - Centrality: 0.0494
   9. DME - Domodedovo International