<h1>Summarize Trajectories</h1>

In [1]:
from datetime import datetime, timedelta
from datetime import timedelta
import pyarrow.parquet as pq
import pandas as pd
import numpy as np

In [2]:
def calculate_stat(
    dataframe: pd.DataFrame, 
    group_by_column: str, 
    target_column: str,
    stat_type: str
) -> pd.Series | None:
    group = dataframe.groupby(group_by_column)[target_column]
    if stat_type == 'count':
        return group.size()
    elif stat_type == 'mean':
        return group.mean()
    elif stat_type == 'std':
        return group.std()
    elif stat_type == 'min':
        return group.min()
    elif stat_type == '25percentile':
        return group.quantile(0.25)
    elif stat_type == 'median':
        return group.median()
    elif stat_type == '75percentile':
        return group.quantile(0.75)
    elif stat_type == 'max':
        return group.max()
    return None

In [3]:
def summarize_trajectories(
    trajectory_df: pd.DataFrame,
    columns: list = ['latitude', 'longitude', 'altitude', 'groundspeed', 'track', 'vertical_rate', 
                     'u_component_of_wind', 'v_component_of_wind', 'temperature', 'specific_humidity'],
    stat_types: list = ['count', 'mean', 'std', 'min', '25percentile', 'median', '75percentile', 'max']
) -> pd.DataFrame:
    """Build a table of per-flight summary statistics.

    For every (column, stat_type) pair, ``calculate_stat`` is called with the
    frame grouped by 'flight_id'; its result is stored under the name
    ``"{column}_{stat_type}"``.

    Args:
        trajectory_df: Trajectory samples; must contain 'flight_id' and every
            name listed in ``columns``.
        columns: Names of the columns to summarise.
        stat_types: Statistic names forwarded to ``calculate_stat``.

    Returns:
        A frame with an int64 'flight_id' column followed by one column per
        (column, stat_type) pair, in the order columns-outer, stats-inner.
    """
    per_flight_stats = {
        f"{column}_{stat_type}": calculate_stat(
            dataframe=trajectory_df,
            group_by_column='flight_id',
            target_column=column,
            stat_type=stat_type,
        )
        for column in columns
        for stat_type in stat_types
    }

    # reset_index() moves the group keys into a regular column. If the index
    # carries no name the new column is called 'index', which the rename
    # normalises to 'flight_id'.
    summary_df = (
        pd.DataFrame(per_flight_stats)
        .reset_index()
        .rename(columns={'index': 'flight_id'})
    )
    summary_df['flight_id'] = summary_df['flight_id'].astype('int64')
    return summary_df

In [4]:
# Summarise every daily trajectory file between start_date and end_date.
#
# For each day, the flights present in that day's file are summarised using
# their samples from that file plus any samples with the same flight_id in the
# previous and next day's files (when those days are inside the date range).
#
# NOTE: a flight_id that occurs in more than one daily file is summarised once
# for each day it occurs in, so the combined table can hold several rows for
# the same flight_id. The diagnostics printed at the end expose this.
daily_summaries = []  # one summary frame per day, concatenated once at the end

flight_count = 0
start_date = datetime(2022, 1, 1)
end_date = datetime(2022, 12, 31)

current_date = start_date
while current_date <= end_date:
    today = current_date.strftime('%Y-%m-%d')
    print(today)
    today_trajectory_df = pq.ParquetDataset(f'../PRC_data/trajectory_files/{today}.parquet').read().to_pandas()
    flight_count += len(today_trajectory_df['flight_id'].unique())
    dfs = [today_trajectory_df]

    # Previous day first, then next day. Neighbours outside
    # [start_date, end_date] are skipped; for the range configured above this
    # is the same set of days as the former `year == 2022` check.
    for day_offset in (-1, 1):
        neighbour_day = current_date + timedelta(days=day_offset)
        if not (start_date <= neighbour_day <= end_date):
            continue
        neighbour_date = neighbour_day.strftime('%Y-%m-%d')
        neighbour_trajectory_df = pq.ParquetDataset(f'../PRC_data/trajectory_files/{neighbour_date}.parquet').read().to_pandas()
        # Keep only the samples that belong to flights seen in today's file.
        is_todays_flight = neighbour_trajectory_df['flight_id'].isin(today_trajectory_df['flight_id'])
        dfs.append(neighbour_trajectory_df[is_todays_flight])

    trajectory_df = pd.concat(dfs, axis=0, ignore_index=True)
    sum_trajectory_df = summarize_trajectories(trajectory_df)
    daily_summaries.append(sum_trajectory_df)

    current_date += timedelta(days=1)

# A single concat avoids re-copying the growing result on every iteration and
# avoids concatenating onto an empty placeholder frame.
if daily_summaries:
    summarized_trajectory_df = pd.concat(daily_summaries, ignore_index=True)
else:
    summarized_trajectory_df = pd.DataFrame({})

print(f"{flight_count = }")
print(f"{len(summarized_trajectory_df) = }")
num_unique_flight_ids = len(summarized_trajectory_df['flight_id'].unique())
print(f"{num_unique_flight_ids = }")

2022-01-01
2022-01-02
2022-01-03
2022-01-04
2022-01-05
2022-01-06
2022-01-07
2022-01-08
2022-01-09
2022-01-10
2022-01-11
2022-01-12
2022-01-13
2022-01-14
2022-01-15
2022-01-16
2022-01-17
2022-01-18
2022-01-19
2022-01-20
2022-01-21
2022-01-22
2022-01-23
2022-01-24
2022-01-25
2022-01-26
2022-01-27
2022-01-28
2022-01-29
2022-01-30
2022-01-31
2022-02-01
2022-02-02
2022-02-03
2022-02-04
2022-02-05
2022-02-06
2022-02-07
2022-02-08
2022-02-09
2022-02-10
2022-02-11
2022-02-12
2022-02-13
2022-02-14
2022-02-15
2022-02-16
2022-02-17
2022-02-18
2022-02-19
2022-02-20
2022-02-21
2022-02-22
2022-02-23
2022-02-24
2022-02-25
2022-02-26
2022-02-27
2022-02-28
2022-03-01
2022-03-02
2022-03-03
2022-03-04
2022-03-05
2022-03-06
2022-03-07
2022-03-08
2022-03-09
2022-03-10
2022-03-11
2022-03-12
2022-03-13
2022-03-14
2022-03-15
2022-03-16
2022-03-17
2022-03-18
2022-03-19
2022-03-20
2022-03-21
2022-03-22
2022-03-23
2022-03-24
2022-03-25
2022-03-26
2022-03-27
2022-03-28
2022-03-29
2022-03-30
2022-03-31
2022-04-01

In [5]:
# Render the combined per-flight summary table as the cell's rich output.
display(summarized_trajectory_df)

Unnamed: 0,flight_id,latitude_count,latitude_mean,latitude_std,latitude_min,latitude_25percentile,latitude_median,latitude_75percentile,latitude_max,longitude_count,...,temperature_75percentile,temperature_max,specific_humidity_count,specific_humidity_mean,specific_humidity_std,specific_humidity_min,specific_humidity_25percentile,specific_humidity_median,specific_humidity_75percentile,specific_humidity_max
0,248750611,13271,42.242653,6.606323,29.302139,36.716937,42.405441,48.180670,52.470612,13271,...,213.422879,284.986510,13271,0.000142,0.000696,-2.844416e-08,0.000025,0.000028,0.000029,0.008131
1,248750618,2903,40.353482,0.473968,39.555038,39.932671,40.285748,40.908052,40.998184,2903,...,274.292577,280.171640,2903,0.001773,0.002111,2.668795e-05,0.000032,0.000193,0.004154,0.005552
2,248750628,3536,38.204628,2.110900,34.954651,36.285004,38.301727,40.195255,41.255245,3536,...,238.415371,280.879933,3536,0.000727,0.001452,4.413493e-05,0.000052,0.000065,0.000411,0.005262
3,248750632,4881,36.297793,3.185204,31.462463,33.512677,36.075439,39.276901,41.256269,4881,...,225.414225,280.904259,4881,0.000455,0.001163,6.783489e-06,0.000020,0.000048,0.000084,0.005257
4,248750636,891,39.973789,0.197517,39.605666,39.788626,39.977692,40.156100,40.156100,891,...,259.442577,276.882732,891,0.001077,0.001332,2.064300e-05,0.000058,0.000322,0.001916,0.004461
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
979675,258074471,10994,48.924019,4.449353,41.180466,45.049842,49.094398,52.838858,55.774283,10994,...,223.355169,282.302024,10994,0.000611,0.001518,3.065331e-06,0.000031,0.000042,0.000074,0.006643
979676,258074475,3371,40.920038,0.445340,40.248273,40.500275,40.837830,41.374466,41.596938,3371,...,272.328340,281.587474,3371,0.001327,0.002055,1.629996e-05,0.000031,0.000057,0.002804,0.005551
979677,258074485,2418,47.003726,0.433726,46.241409,46.580163,47.118530,47.448704,47.462893,2418,...,288.561217,288.884297,2418,0.004061,0.002374,3.126592e-05,0.000985,0.004625,0.006626,0.007395
979678,258074488,4273,48.092001,0.666757,47.401816,47.448509,47.856354,48.879151,49.064438,4273,...,279.127652,288.761667,4273,0.002412,0.002576,1.895130e-05,0.000099,0.000751,0.004469,0.007492


In [6]:
# Inspect the summary rows in which at least one statistic is missing.
has_missing_value = summarized_trajectory_df.isna().any(axis=1)  # True where any column in the row is NaN
rows_with_nan = summarized_trajectory_df.loc[has_missing_value]
display(rows_with_nan)

Unnamed: 0,flight_id,latitude_count,latitude_mean,latitude_std,latitude_min,latitude_25percentile,latitude_median,latitude_75percentile,latitude_max,longitude_count,...,temperature_75percentile,temperature_max,specific_humidity_count,specific_humidity_mean,specific_humidity_std,specific_humidity_min,specific_humidity_25percentile,specific_humidity_median,specific_humidity_75percentile,specific_humidity_max
2337,248771793,1021,59.651374,0.000000,59.651374,59.651374,59.651374,59.651374,59.651374,1021,...,273.713813,274.416775,1021,0.001659,1.678566e-03,0.000082,0.000367,0.000592,0.003851,0.004626
45214,249175771,18,40.121091,0.000871,40.120182,40.120182,40.121376,40.121849,40.122059,18,...,266.543000,266.543286,18,0.001010,9.252112e-04,0.000005,0.000005,0.001814,0.001814,0.001815
51379,249242347,1146,40.127953,0.003660,40.117159,40.129303,40.129303,40.129303,40.129303,1146,...,256.416343,267.296896,1146,0.000378,5.281882e-04,0.000011,0.000051,0.000105,0.000442,0.001991
59968,249326472,31,45.625500,0.000000,45.625500,45.625500,45.625500,45.625500,45.625500,31,...,282.038968,282.060280,31,0.004665,4.153567e-06,0.004660,0.004660,0.004668,0.004669,0.004670
72817,249452195,2811,61.363069,1.160289,59.662512,60.281570,61.349146,62.404839,63.220220,2811,...,253.671857,273.135124,2811,0.000461,7.919698e-04,0.000010,0.000010,0.000016,0.000600,0.003172
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
948348,257758908,800,39.573441,0.000000,39.573441,39.573441,39.573441,39.573441,39.573441,800,...,249.655232,249.658309,800,0.000334,1.211739e-06,0.000332,0.000333,0.000334,0.000335,0.000336
961421,257879807,280,53.423662,0.000000,53.423662,53.423662,53.423662,53.423662,53.423662,280,...,266.497343,266.498976,280,0.001196,5.555397e-06,0.001187,0.001192,0.001196,0.001201,0.001206
969592,257971243,15,63.994772,0.000011,63.994766,63.994766,63.994766,63.994773,63.994803,15,...,271.578576,271.581487,15,0.002938,2.709342e-07,0.002938,0.002938,0.002938,0.002939,0.002939
970044,257975953,447,53.362735,0.002919,53.356185,53.361797,53.362850,53.364544,53.367253,447,...,280.983094,281.031865,447,0.005685,3.801653e-04,0.000024,0.005699,0.005709,0.005721,0.005734


In [7]:
# Write the summary table to CSV without the row index. The path is defined
# once so the file that is written and the confirmation message cannot drift
# apart.
summary_csv_path = "./data/summarized_trajectory.csv"
summarized_trajectory_df.to_csv(summary_csv_path, index=False)
print(f"{summary_csv_path} is saved!")

./data/summarized_trajectory.csv is saved!
