<h1>Summarize Trajectories</h1>

In [1]:
from datetime import datetime, timedelta
from datetime import timedelta
import pyarrow.parquet as pq
import pandas as pd
import numpy as np

In [2]:
def calculate_stat(
    dataframe: pd.DataFrame, 
    group_by_column: str, 
    target_column: str,
    stat_type: str
) -> pd.Series | None:
    group = dataframe.groupby(group_by_column)[target_column]
    if stat_type == 'count':
        return group.size()
    elif stat_type == 'mean':
        return group.mean()
    elif stat_type == 'std':
        return group.std()
    elif stat_type == 'min':
        return group.min()
    elif stat_type == '25percentile':
        return group.quantile(0.25)
    elif stat_type == 'median':
        return group.median()
    elif stat_type == '75percentile':
        return group.quantile(0.75)
    elif stat_type == 'max':
        return group.max()
    return None

In [3]:
def summarize_trajectories(
    trajectory_df: pd.DataFrame,
    columns: list = ['latitude', 'longitude', 'altitude', 'groundspeed', 'track', 'vertical_rate', 'track_unwrapped', 
                     'u_component_of_wind', 'v_component_of_wind', 'temperature', 'specific_humidity'],
    stat_types: list = ['count', 'mean', 'std', 'min', '25percentile', 'median', '75percentile', 'max']
) -> pd.DataFrame:
    """Collapse trajectory rows into one summary row per `flight_id`.

    Args:
        trajectory_df: Frame with a 'flight_id' column plus every column
            named in `columns`.
        columns: Columns to summarise.
        stat_types: Statistic names understood by `calculate_stat`.

    Returns:
        A frame with a 'flight_id' column (cast to int64) followed by one
        column per (column, statistic) pair, named '<column>_<stat_type>'.
    """
    # Keys are generated column-major so the output keeps each source
    # column's statistics next to each other.
    per_flight_stats = {
        f"{column}_{stat_type}": calculate_stat(
            dataframe=trajectory_df,
            group_by_column='flight_id',
            target_column=column,
            stat_type=stat_type,
        )
        for column in columns
        for stat_type in stat_types
    }

    return (
        pd.DataFrame(per_flight_stats)
        .reset_index()
        # Covers the case where the index carried no name and reset_index
        # therefore produced a column called 'index'.
        .rename(columns={'index': 'flight_id'})
        .assign(flight_id=lambda frame: frame['flight_id'].astype('int64'))
    )

In [4]:
# Summarise every flight found in the daily trajectory files between
# start_date and end_date (inclusive). For each day, rows belonging to that
# day's flights are also pulled from the previous and next day's files before
# the per-flight statistics are computed.
TRAJECTORY_DIR = '../../data/trajectory_files'
# Neighbouring days are only loaded when they fall in this year; presumably
# files exist for 2022 only (TODO confirm against the data directory).
DATA_YEAR = 2022


def _read_trajectory_file(day: datetime) -> pd.DataFrame:
    """Load the parquet trajectory file named '<YYYY-MM-DD>.parquet' for `day`."""
    day_str = day.strftime('%Y-%m-%d')
    return pq.ParquetDataset(f'{TRAJECTORY_DIR}/{day_str}.parquet').read().to_pandas()


flight_count = 0
start_date = datetime(2022, 12, 29)
end_date = datetime(2022, 12, 31)

# Daily summaries are collected in a list and concatenated once after the
# loop; concatenating onto a growing frame re-copies all earlier rows on
# every iteration.
daily_summaries = []

current_date = start_date
while current_date <= end_date:
    print(current_date.day)

    today = current_date.strftime('%Y-%m-%d')
    print(today)
    today_trajectory_df = _read_trajectory_file(current_date)
    # NOTE(review): len() of the day's frame is its row count, so this
    # accumulates trajectory rows rather than distinct flights. Use
    # today_trajectory_df['flight_id'].nunique() if a flight tally is intended.
    flight_count += len(today_trajectory_df)
    today_flight_ids = today_trajectory_df['flight_id']

    dfs = [today_trajectory_df]
    # Previous day first, then next day, keeping only today's flights.
    for neighbour_date in (current_date - timedelta(days=1), current_date + timedelta(days=1)):
        if neighbour_date.year != DATA_YEAR:
            continue
        neighbour_trajectory_df = _read_trajectory_file(neighbour_date)
        dfs.append(neighbour_trajectory_df[neighbour_trajectory_df['flight_id'].isin(today_flight_ids)])

    trajectory_df = pd.concat(dfs, axis=0, ignore_index=True)
    sum_trajectory_df = summarize_trajectories(trajectory_df)
    daily_summaries.append(sum_trajectory_df)

    current_date += timedelta(days=1)

# pd.concat rejects an empty list, so fall back to an empty frame when the
# date range produced no summaries.
if daily_summaries:
    summarized_trajectory_df = pd.concat(daily_summaries, ignore_index=True)
else:
    summarized_trajectory_df = pd.DataFrame({})

print(f"{len(summarized_trajectory_df) = }")
print(f"{flight_count = }")
num_unique_flight_ids = len(summarized_trajectory_df['flight_id'].unique())
print(f"{num_unique_flight_ids = }")

29
2022-12-29
30
2022-12-30
31
2022-12-31
len(summarized_trajectory_df) = 1629
flight_count = 6333637
num_unique_flight_ids = 1629


In [5]:
# Render the per-flight summary frame in the notebook output.
display(summarized_trajectory_df)

Unnamed: 0,flight_id,latitude_count,latitude_mean,latitude_std,latitude_min,latitude_25percentile,latitude_median,latitude_75percentile,latitude_max,longitude_count,...,temperature_75percentile,temperature_max,specific_humidity_count,specific_humidity_mean,specific_humidity_std,specific_humidity_min,specific_humidity_25percentile,specific_humidity_median,specific_humidity_75percentile,specific_humidity_max
0,258009563,2184,27.581898,1.206430,25.802276,26.482196,27.532310,28.629410,29.742085,2184,...,262.276641,292.919485,2184,0.000826,0.002296,0.000007,0.000007,0.000016,0.000049,0.010095
1,258009607,1195,33.763410,0.231193,33.431024,33.531018,33.775589,34.038483,34.038483,1195,...,270.654946,285.894446,1195,0.001432,0.002499,0.000025,0.000025,0.000188,0.000659,0.007710
2,258009746,1061,42.234848,0.094817,41.969284,42.220692,42.267710,42.303864,42.314960,1061,...,278.047206,282.850549,1061,0.001546,0.001423,0.000025,0.000062,0.001752,0.002779,0.004357
3,258009778,1215,19.575715,0.450352,18.780197,19.168722,19.556036,20.094818,20.117131,1215,...,269.919760,286.014343,1215,0.000308,0.000197,0.000068,0.000168,0.000341,0.000375,0.001168
4,258009780,1624,26.887714,0.867309,25.775555,26.075752,26.747716,27.636978,28.559509,1624,...,273.725776,292.260020,1624,0.000956,0.002432,0.000007,0.000014,0.000039,0.000077,0.010055
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1624,258074448,7558,49.223089,1.291969,46.215013,48.471525,50.040115,50.048544,50.166181,7558,...,281.082665,289.378697,7558,0.003192,0.002883,0.000094,0.000180,0.003627,0.006256,0.007964
1625,258074468,4253,50.458568,1.738630,48.045685,48.711331,50.487808,52.026781,53.340066,4253,...,253.362757,288.535544,4253,0.001002,0.001745,0.000042,0.000047,0.000084,0.001021,0.006175
1626,258074471,11267,52.059969,2.705614,46.191605,50.166023,52.475451,54.261009,55.773880,11267,...,232.848971,282.304036,11267,0.000785,0.001668,0.000025,0.000041,0.000043,0.000212,0.006643
1627,258074485,5389,47.123686,0.396687,46.241409,46.797242,47.347303,47.448718,47.462723,5389,...,288.622599,288.884297,5389,0.004383,0.002391,0.000795,0.001400,0.005033,0.006630,0.007395


In [6]:
# Select the flights whose summary row has a missing value in any column.
has_missing_value = summarized_trajectory_df.isna().any(axis=1)
rows_with_nan = summarized_trajectory_df.loc[has_missing_value]
display(rows_with_nan)

Unnamed: 0,flight_id,latitude_count,latitude_mean,latitude_std,latitude_min,latitude_25percentile,latitude_median,latitude_75percentile,latitude_max,longitude_count,...,temperature_75percentile,temperature_max,specific_humidity_count,specific_humidity_mean,specific_humidity_std,specific_humidity_min,specific_humidity_25percentile,specific_humidity_median,specific_humidity_75percentile,specific_humidity_max
90,258017340,767,50.040946,0.002196372,50.033915,50.039995,50.042301,50.04237,50.04237,767,...,284.558079,284.565883,767,0.006746,1.607227e-05,0.006719,0.006731,0.006744,0.006764,0.006769
319,258021351,82,55.625723,0.0,55.625723,55.625723,55.625723,55.625723,55.625723,82,...,212.651574,212.653051,82,9e-06,8.791139e-09,9e-06,9e-06,9e-06,9e-06,9e-06
407,258027604,250,50.90292,0.0,50.90292,50.90292,50.90292,50.90292,50.90292,250,...,273.057644,273.058667,250,0.003506,4.412785e-07,0.003506,0.003506,0.003506,0.003507,0.003507
414,258027782,401,47.4543,2.396784e-07,47.4543,47.4543,47.4543,47.4543,47.454301,401,...,281.670189,281.684823,401,0.004918,1.551734e-06,0.004916,0.004917,0.004918,0.00492,0.004921
538,258029999,41,50.903709,0.0,50.903709,50.903709,50.903709,50.903709,50.903709,41,...,215.918794,215.921045,41,6e-06,4.137512e-10,6e-06,6e-06,6e-06,6e-06,6e-06
539,258030002,478,47.455767,0.001306516,47.4543,47.455155,47.4552,47.455685,47.458331,478,...,281.316677,281.340614,478,0.004951,2.413306e-06,0.004947,0.00495,0.004952,0.004953,0.004956
636,258034468,54,50.049172,0.0,50.049172,50.049172,50.049172,50.049172,50.049172,54,...,210.070997,210.071934,54,2e-05,4.970187e-09,2e-05,2e-05,2e-05,2e-05,2e-05
784,258037701,166,47.454967,0.0,47.454967,47.454967,47.454967,47.454967,47.454967,166,...,255.492845,279.705773,166,0.000613,0.001234511,3.3e-05,3.3e-05,6.8e-05,0.00022,0.004475
1549,258071059,652,50.045792,0.0,50.045792,50.045792,50.045792,50.045792,50.045792,652,...,,,652,,,,,,,
1611,258073815,143,47.45411,5.019029e-06,47.454105,47.454105,47.454108,47.454117,47.454117,143,...,287.749333,287.756651,143,0.006292,4.122408e-06,0.006285,0.006288,0.006292,0.006295,0.006299


In [7]:
# Write the per-flight summary to disk without the positional index column.
# A single path variable feeds both the write and the confirmation message so
# the two cannot drift apart.
output_csv_path = "summarized_trajectory.csv"
summarized_trajectory_df.to_csv(output_csv_path, index=False)
print(f"{output_csv_path} is saved!")

summarized_trajectory.csv is saved!
