In [1]:
import pandas as pd 
import glob
import os 
# Locate the CSV exports written by the Flight_ETL notebook and load the one
# that was modified most recently.
data= "flight_data/raw/"
csv_pattern = os.path.join(data, "flight_arrivals_*.csv")
csv_files = glob.glob(csv_pattern)

# max() on an empty list raises "ValueError: max() arg is an empty sequence",
# which says nothing about the missing export, so fail with the search pattern.
if not csv_files:
    raise FileNotFoundError(
        f"No files match {csv_pattern!r}; run the Flight_ETL notebook first."
    )

# "Latest" is decided by filesystem modification time, not by the file name.
latest_file = max(csv_files, key=os.path.getmtime)

flight_df = pd.read_csv(latest_file)
flight_df.head()

Unnamed: 0,flight_date,weekday,arrival_iataCode,arrival_terminal,arrival_scheduledTime,departure_iataCode,airline_name,flight_iataNumber,aircraft_modelText
0,2025-08-13,3,mci,,19:46,iad,sa avianca,av2238,embraer e175lr
1,2025-08-13,3,mci,,19:46,iad,copa airlines,cm2606,embraer e175lr
2,2025-08-13,3,mci,,19:46,iad,lufthansa,lh9386,embraer e175lr
3,2025-08-13,3,mci,,19:46,iad,brussels airlines,sn8963,embraer e175lr
4,2025-08-13,3,mci,,19:46,iad,united express,ua6126,embraer e175lr


In [2]:
# Inspect column dtypes and non-null counts of the freshly loaded frame.
flight_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1711 entries, 0 to 1710
Data columns (total 9 columns):
 #   Column                 Non-Null Count  Dtype 
---  ------                 --------------  ----- 
 0   flight_date            1711 non-null   object
 1   weekday                1711 non-null   int64 
 2   arrival_iataCode       1711 non-null   object
 3   arrival_terminal       24 non-null     object
 4   arrival_scheduledTime  1707 non-null   object
 5   departure_iataCode     1711 non-null   object
 6   airline_name           1711 non-null   object
 7   flight_iataNumber      1711 non-null   object
 8   aircraft_modelText     1690 non-null   object
dtypes: int64(1), object(8)
memory usage: 120.4+ KB


In [3]:
# Quick look at the distinct airline names (sorted, nulls removed) to see what we are working with.
sorted(flight_df['airline_name'].dropna().unique())

['aer lingus',
 'aeromexico',
 'air canada',
 'air france',
 'air new zealand',
 'air tahiti nui',
 'alaska airlines',
 'alaska airlines (uncf livery)',
 'alaska horizon',
 'allegiant air',
 'alphasky',
 'amazon air',
 'american airlines',
 'american eagle',
 'ana',
 'british airways',
 'brussels airlines',
 'china eastern airlines',
 'copa airlines',
 'delta air lines',
 'delta connection',
 'dhl',
 'emirates',
 'fedex',
 'finnair',
 'frontier (blanco the polar bear livery)',
 'frontier (luis the axolotl livery)',
 'frontier (luna and lilly the wolves livery)',
 'hawaiian airlines',
 'iberia',
 'jet linx aviation',
 'kenya airways',
 'klm',
 'korean air',
 'latam airlines',
 'lufthansa',
 'malaysia airlines',
 'qantas',
 'qatar airways',
 'royal air maroc',
 'royal jordanian',
 'sa avianca',
 'sas',
 'southwest airlines',
 'southwest airlines (illinois one livery)',
 'southwest airlines (maryland one livery)',
 'southwest airlines (new mexico one livery)',
 'southwest airlines (triple

In [4]:
# Airlines are grouped as domestic, international, or cargo. Cargo carriers are
# removed later because they are not needed to predict demand for Uber.
# All names are lowercase because classify_airline lowercases before lookup.
domestic_airlines = [
    'alaska airlines', 'alaska horizon', 'allegiant air', 'american airlines',
    'american eagle',
    'delta air lines', 'delta connection', 'frontier (flint the wood stork livery)',
    'hawaiian airlines', 'jet linx aviation', 'southwest airlines',
    'spirit airlines', 'united airlines', 'united express',
    # Livery-specific spellings that appear in the airline_name listing printed
    # earlier in this notebook. That listing is truncated, so more may exist.
    'alaska airlines (uncf livery)',
    'frontier (blanco the polar bear livery)',
    'frontier (luis the axolotl livery)',
    'frontier (luna and lilly the wolves livery)',
    'southwest airlines (illinois one livery)',
    'southwest airlines (maryland one livery)',
    'southwest airlines (new mexico one livery)',
]

international_airlines = [
    'aer lingus',
    'aeromexico', 'air canada', 'air france', 'air new zealand', 'air tahiti nui',
    'ana', 'british airways', 'brussels airlines', 'china eastern airlines',
    'copa airlines', 'emirates', 'finnair', 'iberia', 'kenya airways', 'klm',
    'korean air', 'latam airlines', 'lufthansa', 'malaysia airlines', 'qantas',
    'qatar airways', 'royal air maroc', 'royal jordanian',
    'sa avianca', 'sas', 'turkish airlines', 'virgin atlantic',
    'volaris', 'westjet'
]

cargo_airlines = ['amazon air', 'dhl', 'fedex', 'ups', 'alphasky']

In [5]:
# Classify an airline name as Domestic, International, Cargo, or Unknown using
# the three name lists defined above.
def _strip_livery(name):
    """Return `name` without a trailing parenthetical such as '(illinois one livery)'."""
    return name.split('(', 1)[0].strip()


def classify_airline(airline):
    """Map an airline name to 'Domestic', 'International', 'Cargo' or 'Unknown'.

    Args:
        airline: Airline name; matching is case-insensitive. Non-string values
            (for example NaN from a missing cell) are classified as 'Unknown'
            instead of raising AttributeError on `.lower()`.

    Returns:
        str: The category label.
    """
    if not isinstance(airline, str):
        return 'Unknown'

    name = airline.strip().lower()
    categories = (
        ('Domestic', domestic_airlines),
        ('International', international_airlines),
        ('Cargo', cargo_airlines),
    )

    # Exact matches win, so every name that was classified before keeps its label.
    for label, known_names in categories:
        if name in known_names:
            return label

    # Fall back to the carrier name without its livery suffix, so that e.g.
    # 'southwest airlines (illinois one livery)' matches 'southwest airlines'
    # and any 'frontier (...)' variant matches the listed frontier entry.
    base = _strip_livery(name)
    if base:
        for label, known_names in categories:
            if any(base == _strip_livery(known) for known in known_names):
                return label

    return 'Unknown'

# Label every row with its airline category in a new airline_type column, then preview the result.
flight_df['airline_type'] = flight_df['airline_name'].apply(classify_airline)
flight_df.head()

Unnamed: 0,flight_date,weekday,arrival_iataCode,arrival_terminal,arrival_scheduledTime,departure_iataCode,airline_name,flight_iataNumber,aircraft_modelText,airline_type
0,2025-08-13,3,mci,,19:46,iad,sa avianca,av2238,embraer e175lr,International
1,2025-08-13,3,mci,,19:46,iad,copa airlines,cm2606,embraer e175lr,International
2,2025-08-13,3,mci,,19:46,iad,lufthansa,lh9386,embraer e175lr,International
3,2025-08-13,3,mci,,19:46,iad,brussels airlines,sn8963,embraer e175lr,International
4,2025-08-13,3,mci,,19:46,iad,united express,ua6126,embraer e175lr,Domestic


In [6]:
# Build the working frame: keep every non-cargo row, then discard rows that
# have no scheduled arrival time.
not_cargo = flight_df['airline_type'].ne('Cargo')
flight_df_clean = flight_df.loc[not_cargo].dropna(subset=['arrival_scheduledTime'])
flight_df_clean.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1629 entries, 0 to 1710
Data columns (total 10 columns):
 #   Column                 Non-Null Count  Dtype 
---  ------                 --------------  ----- 
 0   flight_date            1629 non-null   object
 1   weekday                1629 non-null   int64 
 2   arrival_iataCode       1629 non-null   object
 3   arrival_terminal       24 non-null     object
 4   arrival_scheduledTime  1629 non-null   object
 5   departure_iataCode     1629 non-null   object
 6   airline_name           1629 non-null   object
 7   flight_iataNumber      1629 non-null   object
 8   aircraft_modelText     1609 non-null   object
 9   airline_type           1629 non-null   object
dtypes: int64(1), object(9)
memory usage: 140.0+ KB


In [7]:
# Turn the scheduled landing time ('HH:MM') into a timestamp and into minutes
# since midnight, giving the clustering algorithm a numeric feature.
arrival_clock = pd.to_datetime(
    flight_df_clean['arrival_scheduledTime'],
    format='%H:%M',
    errors='coerce',  # values that do not parse become NaT
)

flight_df_clean = flight_df_clean.assign(
    arrival_time=arrival_clock,
    arrival_minutes=arrival_clock.dt.hour * 60 + arrival_clock.dt.minute,
)

In [8]:
# Define a helper that assigns each flight a cluster label based on arrival-time density (MeanShift)
from sklearn.cluster import MeanShift

def run_mean_shift(flight_df_clean, bandwidth, label):
    """Cluster each day's flights by arrival time with MeanShift.

    Flights are grouped by `flight_date`, and the `arrival_minutes` values of
    each group are clustered independently.

    Args:
        flight_df_clean (pd.DataFrame): Must contain `flight_date` and
            `arrival_minutes` columns. The input frame is not modified.
        bandwidth: Passed to `MeanShift(bandwidth=...)`; same unit as
            `arrival_minutes`.
        label (str): Suffix of the output column, which is named
            `cluster_<label>`.

    Returns:
        pd.DataFrame: The rows concatenated in `flight_date` group order with a
        fresh 0..n-1 index and the extra `cluster_<label>` column. A value of
        -1 means the row was not clustered: its date has fewer than two usable
        arrival times, or its own `arrival_minutes` is missing. Rows whose
        `flight_date` is missing are not returned (groupby drops NA keys by
        default).
    """
    cluster_col = f'cluster_{label}'
    clustered = []

    for _, group in flight_df_clean.groupby('flight_date'):
        group = group.copy()
        # MeanShift rejects NaN input, so only rows with a real value are
        # clustered; the others keep the -1 sentinel.
        has_minutes = group['arrival_minutes'].notna().to_numpy()
        group[cluster_col] = -1

        if has_minutes.sum() >= 2:
            X = group.loc[has_minutes, ['arrival_minutes']].values
            ms = MeanShift(bandwidth=bandwidth)
            group.loc[has_minutes, cluster_col] = ms.fit_predict(X)

        clustered.append(group)

    # pd.concat([]) raises "No objects to concatenate"; return an empty frame
    # that still carries the cluster column instead.
    if not clustered:
        empty = flight_df_clean.copy()
        empty[cluster_col] = -1
        return empty.reset_index(drop=True)

    return pd.concat(clustered).reset_index(drop=True)

In [9]:
# Cluster each day's arrivals twice: once with a bandwidth of 90 minutes and once with 180 minutes.
# NOTE(review): the bandwidth is handed straight to MeanShift, not applied as a hard window,
# so a cluster can span more than that many minutes — confirm this matches the intent.
df = run_mean_shift(flight_df_clean, bandwidth=90, label='90min')
df = run_mean_shift(df, bandwidth=180, label='180min')

[WinError 2] The system cannot find the file specified
  File "c:\Users\brave\miniconda3\envs\dsa-core\Lib\site-packages\joblib\externals\loky\backend\context.py", line 257, in _count_physical_cores
    cpu_info = subprocess.run(
               ^^^^^^^^^^^^^^^
  File "c:\Users\brave\miniconda3\envs\dsa-core\Lib\subprocess.py", line 548, in run
    with Popen(*popenargs, **kwargs) as process:
         ^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\brave\miniconda3\envs\dsa-core\Lib\subprocess.py", line 1026, in __init__
    self._execute_child(args, executable, preexec_fn, close_fds,
  File "c:\Users\brave\miniconda3\envs\dsa-core\Lib\subprocess.py", line 1538, in _execute_child
    hp, ht, pid, tid = _winapi.CreateProcess(executable, args,
                       ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^


In [10]:
# Preview the frame with both cluster label columns attached.
df.head()

Unnamed: 0,flight_date,weekday,arrival_iataCode,arrival_terminal,arrival_scheduledTime,departure_iataCode,airline_name,flight_iataNumber,aircraft_modelText,airline_type,arrival_time,arrival_minutes,cluster_90min,cluster_180min
0,2025-08-13,3,mci,,19:46,iad,sa avianca,av2238,embraer e175lr,International,1900-01-01 19:46:00,1186,2,0
1,2025-08-13,3,mci,,19:46,iad,copa airlines,cm2606,embraer e175lr,International,1900-01-01 19:46:00,1186,2,0
2,2025-08-13,3,mci,,19:46,iad,lufthansa,lh9386,embraer e175lr,International,1900-01-01 19:46:00,1186,2,0
3,2025-08-13,3,mci,,19:46,iad,brussels airlines,sn8963,embraer e175lr,International,1900-01-01 19:46:00,1186,2,0
4,2025-08-13,3,mci,,19:46,iad,united express,ua6126,embraer e175lr,Domestic,1900-01-01 19:46:00,1186,2,0


In [None]:
# components/flight_timelines.py
import pandas as pd
import plotly.express as px

def _cluster_timeline(frame, title):
    """Build one timeline figure (one bar per row of `frame`) with the given title."""
    fig = px.timeline(
        frame,
        x_start="start_time",
        x_end="end_time",
        color="rank",
        y="flight_date",
        hover_data=["tooltip"],
        title=title
    )
    fig.update_yaxes(autorange="reversed")
    return fig


def generate_flight_timelines(df: pd.DataFrame, source_90min: str = "flight_90min"):
    """
    Generate two timeline plots for flights: one for 90-minute clusters, one for others.

    The input frame is left untouched: timestamp parsing and the tooltip column
    are applied to an internal copy.

    Args:
        df (pd.DataFrame): Flight data with start_time, end_time, flight_count, source, rank, flight_date
        source_90min (str): Label used to identify 90-minute clusters; rows with
            any other `source` value go to the second figure

    Returns:
        tuple: (timeline_90min_fig, timeline_180min_fig)
    """
    # Copy first: assigning the columns below on the caller's frame would
    # silently change its dtypes and add a `tooltip` column.
    df = df.copy()

    # Parse timestamps
    df['start_time'] = pd.to_datetime(df['start_time'])
    df['end_time'] = pd.to_datetime(df['end_time'])

    # Tooltip
    df["tooltip"] = "Flights: " + df["flight_count"].astype(str)

    # Split by source
    is_90min = df['source'] == source_90min
    flight_90 = df[is_90min]
    flight_180 = df[~is_90min]

    fig_90 = _cluster_timeline(flight_90, "📅 Flight Timeline (90-Minute Clusters)")
    fig_180 = _cluster_timeline(flight_180, "📅 Flight Timeline (3-Hour Clusters)")

    return fig_90, fig_180

In [12]:
# Apply the summarization function to the clustered frame, once per cluster column.
# NOTE(review): summarize_clusters is not defined in any cell shown in this notebook
# (the cell between In [10] and In [12] holds generate_flight_timelines instead).
# Restore its definition above this cell, otherwise Restart & Run All raises NameError here.
# top_n=3 presumably keeps the three busiest clusters per date — confirm against the definition.
top_90 = summarize_clusters(df, cluster_col='cluster_90min', top_n=3, source='flight_90min')
top_180 = summarize_clusters(df, cluster_col='cluster_180min', top_n=3, source='flight_180min')

In [13]:
# Stack the two summarized cluster frames (90-minute and 180-minute) into one frame
# with a fresh index; the `source` column tells the two apart.
all_top_clusters = pd.concat([top_90, top_180], ignore_index=True)
all_top_clusters.head()

Unnamed: 0,flight_date,cluster_id,arrival_iataCode,start_min,end_min,flight_count,start_time,end_time,duration_min,midpoint_min,midpoint_time,rank,source
0,2025-08-13,0,mci,915,1012,30,15:15,16:52,97,963,16:03,3.0,flight_90min
1,2025-08-13,1,mci,1365,1439,47,22:45,23:59,74,1402,23:22,1.0,flight_90min
2,2025-08-13,5,mci,781,892,31,13:01,14:52,111,836,13:56,2.0,flight_90min
3,2025-08-14,1,mci,1325,1439,49,22:05,23:59,114,1382,23:02,2.0,flight_90min
4,2025-08-14,2,mci,795,995,48,13:15,16:35,200,895,14:55,3.0,flight_90min


In [14]:
# Check column dtypes and non-null counts of the combined summary before exporting it.
all_top_clusters.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 42 entries, 0 to 41
Data columns (total 13 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   flight_date       42 non-null     object 
 1   cluster_id        42 non-null     int64  
 2   arrival_iataCode  42 non-null     object 
 3   start_min         42 non-null     int32  
 4   end_min           42 non-null     int32  
 5   flight_count      42 non-null     int64  
 6   start_time        42 non-null     object 
 7   end_time          42 non-null     object 
 8   duration_min      42 non-null     int32  
 9   midpoint_min      42 non-null     int32  
 10  midpoint_time     42 non-null     object 
 11  rank              42 non-null     float64
 12  source            42 non-null     object 
dtypes: float64(1), int32(4), int64(2), object(6)
memory usage: 3.7+ KB


In [15]:
# Write the combined cluster summary to a date-stamped CSV so it can later be
# ingested into a dashboard.
from datetime import datetime, timedelta 
timestamp = datetime.now().strftime('%Y%m%d')
raw_path = f"flight_data/modeled/Top_clusters{timestamp}.csv"
# to_csv does not create missing folders, so make sure the target directory
# exists (uses the `os` module imported in the first cell).
os.makedirs(os.path.dirname(raw_path), exist_ok=True)
all_top_clusters.to_csv(raw_path, index=False)