<h1>Summarize Trajectories</h1>

In [1]:
from datetime import datetime, timedelta
from datetime import timedelta
import pyarrow.parquet as pq
import pandas as pd
import numpy as np

In [2]:
def calculate_stat(
    dataframe: pd.DataFrame, 
    group_by_column: str, 
    target_column: str,
    stat_type: str
) -> pd.Series | None:
    group = dataframe.groupby(group_by_column)[target_column]
    if stat_type == 'count':
        return group.size()
    elif stat_type == 'mean':
        return group.mean()
    elif stat_type == 'std':
        return group.std()
    elif stat_type == 'min':
        return group.min()
    elif stat_type == '25percentile':
        return group.quantile(0.25)
    elif stat_type == 'median':
        return group.median()
    elif stat_type == '75percentile':
        return group.quantile(0.75)
    elif stat_type == 'max':
        return group.max()
    return None

In [3]:
def summarize_trajectories(
    trajectory_df: pd.DataFrame,
    columns: list = ['latitude', 'longitude', 'altitude', 'groundspeed', 'track', 'vertical_rate', 'track_unwrapped', 
                     'u_component_of_wind', 'v_component_of_wind', 'temperature', 'specific_humidity'],
    stat_types: list = ['count', 'mean', 'std', 'min', '25percentile', 'median', '75percentile', 'max']
) -> pd.DataFrame:
    """Build a one-row-per-flight table of summary statistics.

    Every requested column is aggregated per 'flight_id' with every requested
    statistic via `calculate_stat`; each result becomes an output column named
    '<column>_<stat_type>'.

    Args:
        trajectory_df: Trajectory rows; must contain 'flight_id' and every
            name listed in `columns`.
        columns: Names of the columns to summarize.
        stat_types: Statistic names understood by `calculate_stat`.

    Returns:
        A DataFrame with a 'flight_id' column (cast to int64) followed by one
        column per (column, stat_type) pair.
    """
    # Same iteration order as a nested for-loop: columns outer, stats inner.
    per_flight_stats = {
        f"{column}_{stat_type}": calculate_stat(
            dataframe=trajectory_df,
            group_by_column='flight_id',
            target_column=column,
            stat_type=stat_type,
        )
        for column in columns
        for stat_type in stat_types
    }

    # Move the group keys out of the index into an ordinary column. If the
    # index carried no name, reset_index labels that column 'index', which the
    # rename turns into 'flight_id'.
    summary_df = (
        pd.DataFrame(per_flight_stats)
        .reset_index()
        .rename(columns={'index': 'flight_id'})
    )
    summary_df['flight_id'] = summary_df['flight_id'].astype('int64')
    return summary_df

In [None]:
# Build the per-flight summary for every day in [start_date, end_date].
#
# For each day, the rows of that day's file are combined with the rows of the
# previous and next day's files that belong to the same flight_ids, and the
# combined rows are summarized per flight.
#
# NOTE(review): any flight_id that appears in two daily files is in the
# "today" set on both days and is therefore summarized once per file. The
# three counts printed at the end make this visible (row count vs. unique
# flight_ids) — confirm whether duplicates should be dropped before export.

DATA_YEAR = 2022  # neighbouring days that fall outside this year are never read
ONE_DAY = timedelta(days=1)


def _read_day(day: datetime) -> pd.DataFrame:
    """Read the trajectory parquet file named after `day` (YYYY-MM-DD)."""
    day_str = day.strftime('%Y-%m-%d')
    return pq.ParquetDataset(f'../../data/trajectory_files/{day_str}.parquet').read().to_pandas()


flight_count = 0
start_date = datetime(DATA_YEAR, 1, 1)
end_date = datetime(DATA_YEAR, 12, 31)

# Daily summaries are collected here and concatenated once after the loop;
# growing a DataFrame with pd.concat on every iteration re-copies all earlier
# rows each time.
daily_summaries = []

# Frames carried between iterations so each file is read from disk only once:
# the frame read as "tomorrow" becomes the next iteration's "today", and
# "today" becomes the next iteration's "yesterday".
previous_day_df = None
prefetched_df = None

current_date = start_date
while current_date <= end_date:
    today = current_date.strftime('%Y-%m-%d')
    print(today)
    today_trajectory_df = prefetched_df if prefetched_df is not None else _read_day(current_date)
    today_flight_ids = today_trajectory_df['flight_id']
    flight_count += len(today_flight_ids.unique())
    dfs = [today_trajectory_df]

    yesterday = current_date - ONE_DAY
    if yesterday.year == DATA_YEAR:
        # Only the very first iteration has no carried-over frame to reuse.
        yesterday_trajectory_df = previous_day_df if previous_day_df is not None else _read_day(yesterday)
        dfs.append(yesterday_trajectory_df[yesterday_trajectory_df['flight_id'].isin(today_flight_ids)])

    tomorrow = current_date + ONE_DAY
    tomorrow_trajectory_df = None
    if tomorrow.year == DATA_YEAR:
        tomorrow_trajectory_df = _read_day(tomorrow)
        dfs.append(tomorrow_trajectory_df[tomorrow_trajectory_df['flight_id'].isin(today_flight_ids)])

    trajectory_df = pd.concat(dfs, axis=0, ignore_index=True)
    daily_summaries.append(summarize_trajectories(trajectory_df))

    # Slide the window forward by one day.
    previous_day_df = today_trajectory_df
    prefetched_df = tomorrow_trajectory_df
    current_date = tomorrow

# pd.concat raises on an empty list, so fall back to an empty frame when the
# date range contained no days.
if daily_summaries:
    summarized_trajectory_df = pd.concat(daily_summaries, ignore_index=True)
else:
    summarized_trajectory_df = pd.DataFrame({})

print(f"{flight_count = }")
print(f"{len(summarized_trajectory_df) = }")
num_unique_flight_ids = len(summarized_trajectory_df['flight_id'].unique())
print(f"{num_unique_flight_ids = }")

2022-01-01
2022-01-02
2022-01-03
2022-01-04
2022-01-05
2022-01-06
2022-01-07
2022-01-08
2022-01-09
2022-01-10
2022-01-11
2022-01-12
2022-01-13
2022-01-14
2022-01-15
2022-01-16
2022-01-17
2022-01-18
2022-01-19
2022-01-20
2022-01-21
2022-01-22
2022-01-23
2022-01-24
2022-01-25
2022-01-26
2022-01-27
2022-01-28
2022-01-29
2022-01-30
2022-01-31
2022-02-01
2022-02-02
2022-02-03
2022-02-04
2022-02-05
2022-02-06
2022-02-07
2022-02-08
2022-02-09
2022-02-10
2022-02-11
2022-02-12
2022-02-13
2022-02-14
2022-02-15
2022-02-16
2022-02-17
2022-02-18
2022-02-19
2022-02-20
2022-02-21
2022-02-22
2022-02-23
2022-02-24
2022-02-25
2022-02-26
2022-02-27
2022-02-28
2022-03-01
2022-03-02
2022-03-03
2022-03-04
2022-03-05
2022-03-06
2022-03-07
2022-03-08
2022-03-09
2022-03-10
2022-03-11
2022-03-12
2022-03-13
2022-03-14
2022-03-15
2022-03-16
2022-03-17
2022-03-18
2022-03-19
2022-03-20
2022-03-21
2022-03-22
2022-03-23
2022-03-24
2022-03-25
2022-03-26
2022-03-27
2022-03-28
2022-03-29
2022-03-30
2022-03-31
2022-04-01

In [None]:
# Render the accumulated per-flight summary table for visual inspection.
display(summarized_trajectory_df)

In [None]:
# Pull out every summary row that has at least one missing statistic
# (e.g. a sample standard deviation computed from a single observation is NaN).
has_missing_value = summarized_trajectory_df.isna().any(axis=1)
rows_with_nan = summarized_trajectory_df.loc[has_missing_value]
display(rows_with_nan)

In [None]:
# Export the summary table without the positional index. The file name is
# held in one variable so the write call and the confirmation message cannot
# drift apart.
output_csv_path = "summarized_trajectory.csv"
summarized_trajectory_df.to_csv(output_csv_path, index=False)
print(f"{output_csv_path} is saved!")