<h1>Summarize Trajectories</h1>

In [1]:
from datetime import datetime, timedelta
from datetime import timedelta
import pyarrow.parquet as pq
import pandas as pd
import numpy as np

In [2]:
def calculate_stat(
    dataframe: pd.DataFrame, 
    group_by_column: str, 
    target_column: str,
    stat_type: str
) -> pd.Series | None:
    group = dataframe.groupby(group_by_column)[target_column]
    if stat_type == 'count':
        return group.size()
    elif stat_type == 'mean':
        return group.mean()
    elif stat_type == 'std':
        return group.std()
    elif stat_type == 'min':
        return group.min()
    elif stat_type == '25percentile':
        return group.quantile(0.25)
    elif stat_type == 'median':
        return group.median()
    elif stat_type == '75percentile':
        return group.quantile(0.75)
    elif stat_type == 'max':
        return group.max()
    return None

In [3]:
def summarize_trajectories(
    trajectory_df: pd.DataFrame,
    columns: list = ['latitude', 'longitude', 'altitude', 'groundspeed', 'track', 'vertical_rate', 'track_unwrapped', 
                     'u_component_of_wind', 'v_component_of_wind', 'temperature', 'specific_humidity'],
    stat_types: list = ['count', 'mean', 'std', 'min', '25percentile', 'median', '75percentile', 'max']
) -> pd.DataFrame:
    """Collapse trajectory rows into one row of summary statistics per flight.

    Every requested statistic is computed for every requested column, grouped
    by ``flight_id``, via ``calculate_stat``.

    Args:
        trajectory_df: Trajectory rows; must contain a ``flight_id`` column and
            every column named in ``columns``.
        columns: Columns to summarise. The default list is never mutated here.
        stat_types: Statistic names understood by ``calculate_stat``. The
            default list is never mutated here.

    Returns:
        A DataFrame with an int64 ``flight_id`` column followed by one
        ``<column>_<stat_type>`` column per combination, in the order
        columns-major, stat_types-minor.
    """
    # One Series (indexed by flight_id) per column/statistic combination.
    per_flight_stats = {
        f"{column}_{stat_type}": calculate_stat(
            dataframe=trajectory_df,
            group_by_column='flight_id',
            target_column=column,
            stat_type=stat_type,
        )
        for column in columns
        for stat_type in stat_types
    }

    # Move the group key out of the index; if the index lost its name the
    # resulting column is called 'index', so normalise it to 'flight_id'.
    summary_df = pd.DataFrame(per_flight_stats).reset_index()
    summary_df = summary_df.rename(columns={'index': 'flight_id'})
    summary_df['flight_id'] = summary_df['flight_id'].astype('int64')
    return summary_df

In [4]:
# Summarise every flight of every day in [start_date, end_date].
# For each day, the rows of that day's flights are combined with the rows of
# the same flight_ids found in the previous and the next day's file
# (presumably because a flight crossing midnight is split across daily files
# -- TODO confirm) and then reduced to one summary row per flight.
flight_count = 0
start_date = datetime(2022, 1, 1)
end_date = datetime(2022, 12, 31)
one_day = timedelta(days=1)

# Daily frames keyed by date, so that each parquet file is read once instead of
# up to three times (as "tomorrow", "today" and "yesterday"). Entries are
# evicted once the loop has moved past them, so at most three days are held.
loaded_days = {}


def _load_day(day: datetime) -> pd.DataFrame:
    """Return the trajectory rows stored for `day`, reading the file on first use."""
    if day not in loaded_days:
        day_label = day.strftime('%Y-%m-%d')
        loaded_days[day] = pq.ParquetDataset(f'../../data/trajectory_files/{day_label}.parquet').read().to_pandas()
    return loaded_days[day]


# Per-day summaries are collected here and concatenated once after the loop;
# growing a DataFrame with concat inside the loop copies all earlier rows on
# every iteration.
daily_summaries = []

current_date = start_date
while current_date <= end_date:
    print(current_date.strftime('%Y-%m-%d'))
    today_trajectory_df = _load_day(current_date)
    today_flight_ids = today_trajectory_df['flight_id']
    # NOTE: a flight present in two daily files is counted (and summarised)
    # once per day, which is why the unique-id count is printed separately below.
    flight_count += len(today_flight_ids.unique())
    dfs = [today_trajectory_df]

    yesterday = current_date - one_day
    tomorrow = current_date + one_day
    for neighbour_day in (yesterday, tomorrow):
        # Neighbouring days outside 2022 are skipped (presumably no files
        # exist for them -- TODO confirm).
        if neighbour_day.year == 2022:
            neighbour_df = _load_day(neighbour_day)
            # Keep only the rows that belong to flights seen today.
            dfs.append(neighbour_df[neighbour_df['flight_id'].isin(today_flight_ids)])

    trajectory_df = pd.concat(dfs, axis=0, ignore_index=True)
    daily_summaries.append(summarize_trajectories(trajectory_df))

    # Yesterday's frame is not needed by any later iteration.
    loaded_days.pop(yesterday, None)
    current_date += one_day

loaded_days.clear()

# pd.concat rejects an empty list, so fall back to an empty frame when the
# date range contained no days.
summarized_trajectory_df = (
    pd.concat(daily_summaries, ignore_index=True) if daily_summaries else pd.DataFrame({})
)

print(f"{flight_count = }")
print(f"{len(summarized_trajectory_df) = }")
num_unique_flight_ids = len(summarized_trajectory_df['flight_id'].unique())
print(f"{num_unique_flight_ids = }")

2022-01-01
2022-01-02
2022-01-03
2022-01-04
2022-01-05
2022-01-06
2022-01-07
2022-01-08
2022-01-09
2022-01-10
2022-01-11
2022-01-12
2022-01-13
2022-01-14
2022-01-15
2022-01-16
2022-01-17
2022-01-18
2022-01-19
2022-01-20
2022-01-21
2022-01-22
2022-01-23
2022-01-24
2022-01-25
2022-01-26
2022-01-27
2022-01-28
2022-01-29
2022-01-30
2022-01-31
2022-02-01
2022-02-02
2022-02-03
2022-02-04
2022-02-05
2022-02-06
2022-02-07
2022-02-08
2022-02-09
2022-02-10
2022-02-11
2022-02-12
2022-02-13
2022-02-14
2022-02-15
2022-02-16
2022-02-17
2022-02-18
2022-02-19
2022-02-20
2022-02-21
2022-02-22
2022-02-23
2022-02-24
2022-02-25
2022-02-26
2022-02-27
2022-02-28
2022-03-01
2022-03-02
2022-03-03
2022-03-04
2022-03-05
2022-03-06
2022-03-07
2022-03-08
2022-03-09
2022-03-10
2022-03-11
2022-03-12
2022-03-13
2022-03-14
2022-03-15
2022-03-16
2022-03-17
2022-03-18
2022-03-19
2022-03-20
2022-03-21
2022-03-22
2022-03-23
2022-03-24
2022-03-25
2022-03-26
2022-03-27
2022-03-28
2022-03-29
2022-03-30
2022-03-31
2022-04-01

In [5]:
# Render the combined summary table built from all processed days
# (the notebook view elides the middle rows and columns of large frames).
display(summarized_trajectory_df)

Unnamed: 0,flight_id,latitude_count,latitude_mean,latitude_std,latitude_min,latitude_25percentile,latitude_median,latitude_75percentile,latitude_max,longitude_count,...,temperature_75percentile,temperature_max,specific_humidity_count,specific_humidity_mean,specific_humidity_std,specific_humidity_min,specific_humidity_25percentile,specific_humidity_median,specific_humidity_75percentile,specific_humidity_max
0,248750643,12035,45.596246,2.260098,41.182530,43.844193,45.885005,47.238280,49.016190,12035,...,219.712078,287.131383,12035,0.000442,0.001200,0.000007,0.000014,0.000020,0.000037,0.006870
1,248750690,13221,51.409931,5.713993,41.182111,46.618528,51.802734,56.632324,59.664150,13221,...,223.945757,281.052105,13221,0.000275,0.000834,0.000005,0.000016,0.000023,0.000030,0.005494
2,248750693,4157,42.233283,0.463609,41.182530,41.870945,42.362164,42.680466,42.696602,4157,...,276.694464,283.748128,4157,0.001663,0.001763,0.000025,0.000338,0.000515,0.003216,0.005527
3,248750694,12414,49.952683,5.000696,41.180603,45.722211,50.157103,54.347557,57.641167,12414,...,218.413675,281.157377,12414,0.000389,0.001066,0.000008,0.000013,0.000020,0.000026,0.005519
4,248750710,7465,43.338510,0.930033,41.179877,42.743134,43.645615,43.919724,44.626648,7465,...,239.224293,287.651388,7465,0.000572,0.001286,0.000006,0.000008,0.000011,0.000279,0.005560
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
527158,258074448,7558,49.223089,1.291969,46.215013,48.471525,50.040115,50.048544,50.166181,7558,...,281.082665,289.378697,7558,0.003192,0.002883,0.000094,0.000180,0.003627,0.006256,0.007964
527159,258074468,4253,50.458568,1.738630,48.045685,48.711331,50.487808,52.026781,53.340066,4253,...,253.362757,288.535544,4253,0.001002,0.001745,0.000042,0.000047,0.000084,0.001021,0.006175
527160,258074471,11267,52.059969,2.705614,46.191605,50.166023,52.475451,54.261009,55.773880,11267,...,232.848971,282.304036,11267,0.000785,0.001668,0.000025,0.000041,0.000043,0.000212,0.006643
527161,258074485,5389,47.123686,0.396687,46.241409,46.797242,47.347303,47.448718,47.462723,5389,...,288.622599,288.884297,5389,0.004383,0.002391,0.000795,0.001400,0.005033,0.006630,0.007395


In [6]:
# Flag every summary row in which at least one statistic is missing,
# then show only those rows for inspection.
has_missing_value = summarized_trajectory_df.isna().any(axis=1)
rows_with_nan = summarized_trajectory_df.loc[has_missing_value]
display(rows_with_nan)

Unnamed: 0,flight_id,latitude_count,latitude_mean,latitude_std,latitude_min,latitude_25percentile,latitude_median,latitude_75percentile,latitude_max,longitude_count,...,temperature_75percentile,temperature_max,specific_humidity_count,specific_humidity_mean,specific_humidity_std,specific_humidity_min,specific_humidity_25percentile,specific_humidity_median,specific_humidity_75percentile,specific_humidity_max
813,248766614,251,59.652118,0.000419,59.651344,59.651717,59.652374,59.652489,59.652489,251,...,276.838646,276.851431,251,0.004566,3.654817e-06,0.004560,0.004563,0.004566,0.004569,0.004573
2126,248787674,339,47.454234,0.000000,47.454234,47.454234,47.454234,47.454234,47.454234,339,...,228.069010,228.093259,339,0.000106,4.297146e-08,0.000106,0.000106,0.000106,0.000106,0.000106
2546,248794722,95,40.120342,0.000000,40.120342,40.120342,40.120342,40.120342,40.120342,95,...,277.421813,277.423265,95,0.003228,1.098788e-07,0.003228,0.003228,0.003228,0.003228,0.003228
2689,248797527,181,59.651527,0.000119,59.651184,59.651528,59.651585,59.651595,59.651595,181,...,277.611044,277.612875,181,0.004937,1.179041e-06,0.004935,0.004936,0.004937,0.004938,0.004939
3045,248804053,175,59.652279,0.000119,59.651905,59.652278,59.652334,59.652352,59.652352,175,...,213.301502,276.546987,175,0.000321,1.163885e-03,0.000006,0.000006,0.000006,0.000006,0.004598
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
526073,258030002,478,47.455767,0.001307,47.454300,47.455155,47.455200,47.455685,47.458331,478,...,281.316677,281.340614,478,0.004951,2.413306e-06,0.004947,0.004950,0.004952,0.004953,0.004956
526170,258034468,54,50.049172,0.000000,50.049172,50.049172,50.049172,50.049172,50.049172,54,...,210.070997,210.071934,54,0.000020,4.970187e-09,0.000020,0.000020,0.000020,0.000020,0.000020
526318,258037701,166,47.454967,0.000000,47.454967,47.454967,47.454967,47.454967,47.454967,166,...,255.492845,279.705773,166,0.000613,1.234511e-03,0.000033,0.000033,0.000068,0.000220,0.004475
527083,258071059,652,50.045792,0.000000,50.045792,50.045792,50.045792,50.045792,50.045792,652,...,,,652,,,,,,,


In [7]:
# Write the summary table to CSV without the row index.
# The file name is defined once so the confirmation message always names the
# file that was actually written.
output_path = "summarized_trajectory.csv"
summarized_trajectory_df.to_csv(output_path, index=False)
print(f"{output_path} is saved!")

summarized_trajectory.csv is saved!
