<h1>Summarize Trajectories</h1>

In [1]:
from datetime import datetime, timedelta
from datetime import timedelta
import pyarrow.parquet as pq
import pandas as pd
import numpy as np

In [2]:
def calculate_stat(
    dataframe: pd.DataFrame, 
    group_by_column: str, 
    target_column: str,
    stat_type: str
) -> pd.Series | None:
    group = dataframe.groupby(group_by_column)[target_column]
    if stat_type == 'count':
        return group.size()
    elif stat_type == 'mean':
        return group.mean()
    elif stat_type == 'std':
        return group.std()
    elif stat_type == 'min':
        return group.min()
    elif stat_type == '25percentile':
        return group.quantile(0.25)
    elif stat_type == 'median':
        return group.median()
    elif stat_type == '75percentile':
        return group.quantile(0.75)
    elif stat_type == 'max':
        return group.max()
    return None

In [3]:
def summarize_trajectories(
    trajectory_df: pd.DataFrame,
    columns: list = ['latitude', 'longitude', 'altitude', 'groundspeed', 'track', 'vertical_rate', 
                     'u_component_of_wind', 'v_component_of_wind', 'temperature', 'specific_humidity'],
    stat_types: list = ['count', 'mean', 'std', 'min', '25percentile', 'median', '75percentile', 'max']
) -> pd.DataFrame:
    """Build a one-row-per-flight table of summary statistics.

    For every (column, stat) pair the rows of ``trajectory_df`` are grouped by
    'flight_id' and reduced with ``calculate_stat``; the result is stored in a
    column named ``f"{column}_{stat_type}"``.

    Args:
        trajectory_df: Trajectory samples; must contain 'flight_id' and every
            name listed in ``columns``.
        columns: Columns to summarize. The default list is only read, never
            mutated.
        stat_types: Statistic names understood by ``calculate_stat``.

    Returns:
        A DataFrame with an int64 'flight_id' column followed by one column
        per (column, stat) pair, ordered column-major.
    """
    per_stat_series = {
        f"{column}_{stat_type}": calculate_stat(
            dataframe=trajectory_df,
            group_by_column='flight_id',
            target_column=column,
            stat_type=stat_type,
        )
        for column in columns
        for stat_type in stat_types
    }

    # reset_index() exposes the group key as a column; the rename only matters
    # when the index carries no name (e.g. nothing was computed), in which case
    # the column would otherwise be called 'index'.
    summary_df = (
        pd.DataFrame(per_stat_series)
        .reset_index()
        .rename(columns={'index': 'flight_id'})
    )
    summary_df['flight_id'] = summary_df['flight_id'].astype('int64')
    return summary_df

In [4]:
# Build the per-flight summary table for every day of the dataset year.
#
# For each day D the rows of D's file are combined with the rows of the D-1
# and D+1 files that belong to the same flight ids, so that trajectories
# stored across adjacent daily files are summarized over all their samples.
#
# NOTE(review): a flight_id that appears in more than one daily file is
# summarized once for each of those days, so the result can contain several
# rows per flight_id. The three counts printed at the end expose this; decide
# whether a de-duplication step is wanted before the table is used downstream.

DATASET_YEAR = 2022                              # only files of this year are read as neighbors
TRAJECTORY_DIR = '../PRC_data/trajectory_files'  # one '<YYYY-MM-DD>.parquet' file per day


def _load_day(day: datetime, cache: dict) -> pd.DataFrame:
    """Return the trajectory table of ``day``, reading its parquet file at most once."""
    label = day.strftime('%Y-%m-%d')
    if label not in cache:
        cache[label] = pq.ParquetDataset(f'{TRAJECTORY_DIR}/{label}.parquet').read().to_pandas()
    return cache[label]


flight_count = 0
start_date = datetime(DATASET_YEAR, 1, 1)
end_date = datetime(DATASET_YEAR, 12, 31)
one_day = timedelta(days=1)

# Daily summaries are collected here and concatenated once after the loop;
# growing a DataFrame with pd.concat inside the loop re-copies all earlier
# rows on every iteration.
daily_summaries = []

# Sliding cache of already-loaded daily files (at most three days at a time),
# so each file is read from disk once instead of up to three times.
day_cache = {}

current_date = start_date
while current_date <= end_date:
    today = current_date.strftime('%Y-%m-%d')
    print(today)

    today_trajectory_df = _load_day(current_date, day_cache)
    flight_count += len(today_trajectory_df['flight_id'].unique())
    dfs = [today_trajectory_df]

    # Order matters for bit-identical float results: today, yesterday, tomorrow.
    yesterday = current_date - one_day
    tomorrow = current_date + one_day
    for neighbor_date in (yesterday, tomorrow):
        if neighbor_date.year == DATASET_YEAR:
            neighbor_df = _load_day(neighbor_date, day_cache)
            # Keep only the neighbor-day samples of flights present today.
            dfs.append(neighbor_df[neighbor_df['flight_id'].isin(today_trajectory_df['flight_id'])])

    trajectory_df = pd.concat(dfs, axis=0, ignore_index=True)
    daily_summaries.append(summarize_trajectories(trajectory_df))

    # Yesterday's file is never needed again once today has been processed.
    day_cache.pop(yesterday.strftime('%Y-%m-%d'), None)
    current_date += one_day

day_cache.clear()  # release the last loaded days

# pd.concat rejects an empty list, so fall back to an empty frame in that case.
if daily_summaries:
    summarized_trajectory_df = pd.concat(daily_summaries, ignore_index=True)
else:
    summarized_trajectory_df = pd.DataFrame({})

print(f"{flight_count = }")
print(f"{len(summarized_trajectory_df) = }")
num_unique_flight_ids = len(summarized_trajectory_df['flight_id'].unique())
print(f"{num_unique_flight_ids = }")

2022-01-01
2022-01-02
2022-01-03
2022-01-04
2022-01-05
2022-01-06
2022-01-07
2022-01-08
2022-01-09
2022-01-10
2022-01-11
2022-01-12
2022-01-13
2022-01-14
2022-01-15
2022-01-16
2022-01-17
2022-01-18
2022-01-19
2022-01-20
2022-01-21
2022-01-22
2022-01-23
2022-01-24
2022-01-25
2022-01-26
2022-01-27
2022-01-28
2022-01-29
2022-01-30
2022-01-31
2022-02-01
2022-02-02
2022-02-03
2022-02-04
2022-02-05
2022-02-06
2022-02-07
2022-02-08
2022-02-09
2022-02-10
2022-02-11
2022-02-12
2022-02-13
2022-02-14
2022-02-15
2022-02-16
2022-02-17
2022-02-18
2022-02-19
2022-02-20
2022-02-21
2022-02-22
2022-02-23
2022-02-24
2022-02-25
2022-02-26
2022-02-27
2022-02-28
2022-03-01
2022-03-02
2022-03-03
2022-03-04
2022-03-05
2022-03-06
2022-03-07
2022-03-08
2022-03-09
2022-03-10
2022-03-11
2022-03-12
2022-03-13
2022-03-14
2022-03-15
2022-03-16
2022-03-17
2022-03-18
2022-03-19
2022-03-20
2022-03-21
2022-03-22
2022-03-23
2022-03-24
2022-03-25
2022-03-26
2022-03-27
2022-03-28
2022-03-29
2022-03-30
2022-03-31
2022-04-01

In [5]:
# Render the accumulated per-flight summary table for a visual sanity check.
display(summarized_trajectory_df)

Unnamed: 0,flight_id,latitude_count,latitude_mean,latitude_std,latitude_min,latitude_25percentile,latitude_median,latitude_75percentile,latitude_max,longitude_count,...,temperature_75percentile,temperature_max,specific_humidity_count,specific_humidity_mean,specific_humidity_std,specific_humidity_min,specific_humidity_25percentile,specific_humidity_median,specific_humidity_75percentile,specific_humidity_max
0,248750643,11897,45.557291,2.243508,41.182526,43.813849,45.851440,47.226687,49.016327,11897,...,219.807752,287.429532,11897,0.000508,0.001359,0.000006,0.000014,0.000020,0.000039,0.007216
1,248750690,12906,51.337336,5.699563,41.182111,46.464821,51.788471,56.509117,59.666189,12906,...,224.325133,281.127092,12906,0.000304,0.000878,0.000005,0.000016,0.000023,0.000030,0.005494
2,248750693,3635,42.164756,0.460882,41.182530,41.861533,42.260447,42.584450,42.696831,3635,...,268.323309,283.739072,3635,0.001482,0.001644,0.000006,0.000339,0.000378,0.002362,0.005527
3,248750694,12413,49.952583,5.000874,41.180603,45.721653,50.156498,54.347855,57.641167,12413,...,218.413790,281.157377,12413,0.000390,0.001066,0.000003,0.000013,0.000020,0.000026,0.005519
4,248750710,7816,43.353786,0.918070,41.179871,42.793210,43.653488,43.882604,44.630264,7816,...,235.266630,287.651388,7816,0.000558,0.001281,0.000006,0.000008,0.000010,0.000192,0.005560
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
534112,258074448,4350,48.622138,1.429781,46.214676,47.201658,48.990051,50.048538,50.166275,4350,...,283.473741,289.544959,4350,0.003312,0.003246,0.000016,0.000168,0.002238,0.006575,0.007986
534113,258074468,4864,51.221165,1.907152,48.045319,49.457360,51.434117,53.038719,53.750198,4864,...,265.578487,288.535544,4864,0.001669,0.002543,0.000010,0.000048,0.000169,0.002414,0.008585
534114,258074471,10994,48.924019,4.449353,41.180466,45.049842,49.094398,52.838858,55.774283,10994,...,223.355169,282.302024,10994,0.000611,0.001518,0.000003,0.000031,0.000042,0.000074,0.006643
534115,258074485,2418,47.003726,0.433726,46.241409,46.580163,47.118530,47.448704,47.462893,2418,...,288.561217,288.884297,2418,0.004061,0.002374,0.000031,0.000985,0.004625,0.006626,0.007395


In [6]:
# Select every summary row in which at least one statistic is missing.
# The captured output includes a flight with a single sample whose std
# columns are NaN; other causes are not visible in the truncated view.
has_missing_value = summarized_trajectory_df.isna().any(axis=1)
rows_with_nan = summarized_trajectory_df.loc[has_missing_value]
display(rows_with_nan)

Unnamed: 0,flight_id,latitude_count,latitude_mean,latitude_std,latitude_min,latitude_25percentile,latitude_median,latitude_75percentile,latitude_max,longitude_count,...,temperature_75percentile,temperature_max,specific_humidity_count,specific_humidity_mean,specific_humidity_std,specific_humidity_min,specific_humidity_25percentile,specific_humidity_median,specific_humidity_75percentile,specific_humidity_max
822,248766614,51,59.652301,0.000352,59.651344,59.652374,59.652480,59.652489,59.652489,51,...,276.846517,276.851431,51,0.004569,3.599425e-06,0.004560,0.004566,0.004570,0.004572,0.004573
2142,248787674,339,47.454234,0.000000,47.454234,47.454234,47.454234,47.454234,47.454234,339,...,228.069010,228.093259,339,0.000106,4.297146e-08,0.000106,0.000106,0.000106,0.000106,0.000106
2565,248794722,95,40.120342,0.000000,40.120342,40.120342,40.120342,40.120342,40.120342,95,...,277.421813,277.423265,95,0.003228,1.098788e-07,0.003228,0.003228,0.003228,0.003228,0.003228
2707,248797527,71,59.651520,0.000138,59.651184,59.651502,59.651585,59.651595,59.651596,71,...,277.610968,277.612875,71,0.004937,1.248260e-06,0.004935,0.004936,0.004937,0.004938,0.004939
3830,248818169,52,59.654789,0.000004,59.654784,59.654784,59.654789,59.654789,59.654796,52,...,273.313061,273.315197,52,0.003381,8.699041e-07,0.003379,0.003381,0.003381,0.003382,0.003383
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
520552,257801163,47,53.365436,0.001184,53.363075,53.364292,53.365821,53.366421,53.367359,47,...,286.098290,286.106048,47,0.007988,1.744494e-04,0.006819,0.008008,0.008012,0.008020,0.008026
525611,257892137,403,41.398367,0.023933,41.386429,41.386429,41.386429,41.390533,41.469406,403,...,279.286628,279.637837,403,0.002067,2.158582e-03,0.000036,0.000036,0.000036,0.004414,0.004556
529149,257971243,15,63.994772,0.000011,63.994766,63.994766,63.994766,63.994773,63.994803,15,...,271.578576,271.581487,15,0.002938,2.709342e-07,0.002938,0.002938,0.002938,0.002939,0.002939
531771,258020719,1,44.825649,,44.825649,44.825649,44.825649,44.825649,44.825649,1,...,286.233253,286.233253,1,0.005651,,0.005651,0.005651,0.005651,0.005651,0.005651


In [7]:
# Persist the per-flight summary as CSV (row index omitted). The path is bound
# once so the confirmation message cannot drift from the file actually written.
# NOTE(review): to_csv does not create missing directories; './data' is
# presumably expected to exist already.
output_csv_path = "./data/summarized_trajectory.csv"
summarized_trajectory_df.to_csv(output_csv_path, index=False)
print(f"{output_csv_path} is saved!")

./data/summarized_trajectory.csv is saved!
