# 1. Scale up the BTS on-time data to include international flights. The BTS T100 data source is linked in the cell that loads it; the source link for the On-Time data is missing here (TODO: add it).

In [9]:
import geopandas as gpd
import pandas as pd

In [10]:
# BTS T100 data for 2023: monthly flight counts (both domestic and international) into each airport
# Data source: https://www.transtats.bts.gov/DL_SelectFields.aspx?gnoyr_VQ=FMG&QO_fu146_anzr=Nv4%20Pn44vr45
T100_2023 = pd.read_csv('T_T100_all_2023.csv')

# Total the performed departures per destination airport and month;
# a departure bound for DEST is counted here as an arrival at DEST
monthly_arrivals_2023_all = (
    T100_2023
    .groupby(['DEST', 'MONTH'])['DEPARTURES_PERFORMED']
    .sum()
    .reset_index()
    .rename(columns={'DEPARTURES_PERFORMED': 'total_arrivals'})
)

In [5]:
# Domestic arrivals: keep only the T100 rows whose origin country is the United States,
# then total the performed departures per destination airport and month
monthly_arrivals_2023_domestic = (
    T100_2023
    .loc[T100_2023['ORIGIN_COUNTRY_NAME'] == 'United States']
    .groupby(['DEST', 'MONTH'])['DEPARTURES_PERFORMED']
    .sum()
    .reset_index()
    .rename(columns={'DEPARTURES_PERFORMED': 'domestic_arrivals'})
)

In [None]:
# International arrivals: every T100 row whose origin country is not the United States,
# totalled per destination airport and month
monthly_arrivals_2023_international = (
    T100_2023
    .loc[T100_2023['ORIGIN_COUNTRY_NAME'] != 'United States']
    .groupby(['DEST', 'MONTH'])['DEPARTURES_PERFORMED']
    .sum()
    .reset_index()
    .rename(columns={'DEPARTURES_PERFORMED': 'international_arrivals'})
)

In [20]:
# Combine the three monthly tables so each (DEST, MONTH) row carries
# total, domestic, and international arrival counts
monthly_arrivals_2023 = monthly_arrivals_2023_all.merge(
    monthly_arrivals_2023_domestic, how='outer', on=['DEST', 'MONTH']
)
monthly_arrivals_2023 = monthly_arrivals_2023.merge(
    monthly_arrivals_2023_international, how='outer', on=['DEST', 'MONTH']
)
# The outer merges leave NaN where an airport-month is absent from one table; record those as 0
monthly_arrivals_2023 = monthly_arrivals_2023.fillna(0)

In [11]:
# BTS on-time data: one row per flight with detailed arrival info, but domestic flights only
all_arrivals_ontime_2023 = pd.read_csv('BTS_on_time_flight_data.csv')

# Count the recorded flights per destination airport and month, using the same
# 'DEST' key name as the T100 tables so the two can be merged
monthly_arrivals_ontime_2023 = (
    all_arrivals_ontime_2023
    .groupby(['Destination Airport', 'MONTH'])
    .size()
    .reset_index(name='Count')
    .rename(columns={'Destination Airport': 'DEST'})
)

In [22]:
# Attach the T100 monthly totals to the on-time monthly counts.
# A left merge keeps exactly the airport-months present in the on-time data;
# any of them without a T100 row ends up with NaN in the T100 columns.
monthly_arrivals = pd.merge(
    monthly_arrivals_ontime_2023,
    monthly_arrivals_2023,
    how='left',
    on=['DEST', 'MONTH'],
)

# Gap between the two sources:
#   domestic      -> T100 domestic arrivals minus the flights already in the on-time data
#   international -> the full T100 international count (the on-time data has none)
monthly_arrivals['domestic_diff'] = monthly_arrivals['domestic_arrivals'].sub(monthly_arrivals['Count'])
monthly_arrivals['international_diff'] = monthly_arrivals['international_arrivals']

In [30]:
# Number of days in each month, keyed by month number 1-12 (February has 28: 2023 is not a leap year)
days_in_month = dict(zip(
    range(1, 13),
    (31, 28, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31),
))

In [32]:
# Record how many days each row's month has (looked up by month number in days_in_month)
monthly_arrivals = monthly_arrivals.assign(
    days=monthly_arrivals['MONTH'].map(days_in_month)
)

In [42]:
# Empirical distribution of arrival hours for every destination airport.
# '24:00' is rewritten to '00:00' first, so those flights are counted in hour 0.
# NOTE(review): if the source also uses '00:00' as a placeholder (e.g. for flights that
# never arrived), hour 0 would be over-represented -- confirm against the data dictionary.
all_arrivals_ontime_2023['Actual Arrival Time'] = (
    all_arrivals_ontime_2023['Actual Arrival Time'].replace({'24:00': '00:00'})
)
# The hour is the text before the ':' in 'HH:MM' (still a string at this point)
all_arrivals_ontime_2023['hour'] = (
    all_arrivals_ontime_2023['Actual Arrival Time'].map(lambda hhmm: hhmm.partition(':')[0])
)

# Flights per (airport, hour)
arrival_distribution = (
    all_arrivals_ontime_2023
    .groupby(['Destination Airport', 'hour'])
    .size()
    .reset_index(name='Count')
)

# Flights per airport, used as the denominator of the hourly shares
total_arrival_per_airport = (
    arrival_distribution
    .groupby('Destination Airport')['Count']
    .sum()
    .reset_index()
    .rename(columns={'Count': 'Total_count'})
)

arrival_distribution = arrival_distribution.merge(
    total_arrival_per_airport, how='left', on='Destination Airport'
)
# Share of the airport's flights that arrive in each hour; sums to 1 within an airport
arrival_distribution['arrival_probability'] = (
    arrival_distribution['Count'].div(arrival_distribution['Total_count'])
)
# Convert the hour strings to integers now that grouping is done
arrival_distribution['hour'] = arrival_distribution['hour'].map(int)
arrival_distribution

Unnamed: 0,Destination Airport,hour,Count,Total_count,arrival_probability
0,ABE,0,153,4408,0.034710
1,ABE,1,28,4408,0.006352
2,ABE,2,10,4408,0.002269
3,ABE,3,1,4408,0.000227
4,ABE,8,3,4408,0.000681
...,...,...,...,...,...
6568,YUM,19,16,1740,0.009195
6569,YUM,20,51,1740,0.029310
6570,YUM,21,261,1740,0.150000
6571,YUM,22,114,1740,0.065517


In [45]:
# insert flights
import random
from datetime import datetime, timedelta
import numpy as np
from datetime import date

def random_time(df):
    """Draw a random clock time as an 'HH:MM' string.

    The hour is sampled from ``df['hour']`` using ``df['arrival_probability']`` as the
    weights (numpy requires these to sum to 1); the minute is drawn uniformly from 0-59.

    Note that the hour comes from numpy's global RNG and the minute from the standard
    library ``random`` module, so both must be seeded for reproducible output.
    """
    sampled_hour = np.random.choice(df['hour'], p=df['arrival_probability'])
    sampled_minute = random.randint(0, 59)
    # Zero-pad both parts to two digits
    return "{:02d}:{:02d}".format(sampled_hour, sampled_minute)

In [66]:
def insert_arrival(BTS_on_time_df, monthly_arrivals_df, arrival_distribution, airport, year=2023):
    """Pad one airport's on-time records with synthetic arrivals.

    For every row of ``monthly_arrivals_df`` this adds ``domestic_diff`` placeholder
    domestic flights (carrier code 'DDD') and ``international_diff`` placeholder
    international flights (carrier code 'III'). Each synthetic flight gets a uniformly
    random day within the row's month and an arrival time drawn by ``random_time`` from
    ``arrival_distribution``.

    Parameters
    ----------
    BTS_on_time_df : DataFrame
        Existing flight records for the airport; must contain the seven output columns.
    monthly_arrivals_df : DataFrame
        One row per month with columns 'MONTH', 'domestic_diff', 'international_diff'
        and 'days' (number of days in that month).
    arrival_distribution : DataFrame
        Hourly distribution passed unchanged to ``random_time``.
    airport : str
        Value written to 'Destination Airport' on every synthetic row.
    year : int, default 2023
        Calendar year used when formatting the synthetic dates.

    Returns
    -------
    DataFrame
        The original rows, then the synthetic domestic rows, then the synthetic
        international rows, restricted to the seven output columns. Synthetic rows
        carry a fresh 0..n-1 index within their group.

    Notes
    -----
    A diff that is NaN (no matching T100 row after the left merge) or not positive adds
    no flights; fractional diffs are truncated toward zero.
    """
    output_columns = ['Carrier Code', 'Date (MM/DD/YYYY)', 'Flight Number', 'Tail Number',
                      'Origin Airport', 'Actual Arrival Time', 'Destination Airport']

    def _flights_to_add(diff):
        # int(NaN) raises ValueError, and a negative gap means there is nothing to add
        if pd.isna(diff) or diff <= 0:
            return 0
        return int(diff)

    def _synthetic_flight(carrier_code, month, days):
        # Keep the draw order (day first, then arrival time) so seeded runs are stable
        day = random.randint(1, days)
        f_date = date(year, month, day).strftime("%m/%d/%Y")
        arrival_time = random_time(arrival_distribution)
        return {'Carrier Code': carrier_code, 'Date (MM/DD/YYYY)': f_date, 'Flight Number': '0000',
                'Tail Number': 'OOOO', 'Origin Airport': 'NNN', 'Actual Arrival Time': arrival_time,
                'Destination Airport': airport}

    # Collect plain dict records and build each DataFrame once, instead of creating
    # one single-row DataFrame per synthetic flight
    domestic_records = []
    international_records = []
    monthly_rows = zip(
        monthly_arrivals_df['MONTH'],
        monthly_arrivals_df['days'],
        monthly_arrivals_df['domestic_diff'],
        monthly_arrivals_df['international_diff'],
    )
    for month, days, domestic_diff, international_diff in monthly_rows:
        month, days = int(month), int(days)
        for _ in range(_flights_to_add(domestic_diff)):
            domestic_records.append(_synthetic_flight('DDD', month, days))
        for _ in range(_flights_to_add(international_diff)):
            international_records.append(_synthetic_flight('III', month, days))

    frames = [BTS_on_time_df]
    if domestic_records:
        frames.append(pd.DataFrame(domestic_records))
    if international_records:
        frames.append(pd.DataFrame(international_records))

    all_flights = pd.concat(frames) if len(frames) > 1 else BTS_on_time_df
    return all_flights[output_columns]

In [52]:
# Split each table into a list of per-airport DataFrames (one list entry per airport code)
BTS_on_time_dfs = [
    airport_rows
    for _code, airport_rows in all_arrivals_ontime_2023.groupby('Destination Airport')
]
monthly_arrivals_dfs = [
    airport_rows
    for _code, airport_rows in monthly_arrivals.groupby('DEST')
]
arrival_distribution_dfs = [
    airport_rows
    for _code, airport_rows in arrival_distribution.groupby('Destination Airport')
]

In [None]:
# Loop through each airport, scale up its flight arrivals, and write one CSV per airport
for BTS_on_time_df in BTS_on_time_dfs:
    airport = BTS_on_time_df['Destination Airport'].iloc[0]
    # Every per-airport frame holds a single airport code, so compare its first value
    # exactly rather than using a case-insensitive regex substring search
    monthly_arrivals_df = next(df for df in monthly_arrivals_dfs if df['DEST'].iloc[0] == airport)
    # Use a separate name here: rebinding `arrival_distribution` would replace the
    # notebook-level table with a single airport's slice and break re-runs of earlier cells
    airport_arrival_distribution = next(
        df for df in arrival_distribution_dfs if df['Destination Airport'].iloc[0] == airport
    )
    all_flights = insert_arrival(BTS_on_time_df, monthly_arrivals_df, airport_arrival_distribution, airport)
    # NOTE(review): the leading '/' makes this a path under the filesystem root, and the
    # directory must already exist -- confirm a relative output directory was not intended
    all_flights.to_csv(f'/all_flight_arrival_data/{airport}_all_flight_arrival_data.csv')
