<h1>Read & Summarize Trajactory Files.ipynb</h1>

In [1]:
import pyarrow.parquet as pq
import pandas as pd
import os

In [2]:
# Load both flight lists up front; each is expected to expose a 'flight_id' column.
challenge_set = pd.read_csv('../challenge_set.csv')
submission_set = pd.read_csv('../submission_set.csv')

# One flat list: the distinct ids of the challenge set followed by the distinct
# ids of the submission set (an id present in both sets appears twice).
flight_ids = [
    *challenge_set['flight_id'].astype(int).unique(),
    *submission_set['flight_id'].astype(int).unique(),
]

print(f"{len(flight_ids) = } flights")

len(flight_ids) = 474972 flights


In [3]:
# Sorted so the row order of the combined frame does not depend on the
# arbitrary order in which the operating system lists the directory.
file_names = sorted(os.listdir("../data/trajectory_files"))

# Per-file slices are collected here and concatenated exactly once below.
# Calling pd.concat inside the loop would re-copy every previously loaded row
# on each iteration, which is quadratic in the number of files.
filtered_frames = []

for trajectory_file_name in file_names:
    # Match on the extension only, so names that merely contain ".parquet"
    # (temporary or checksum side files, for example) are skipped.
    if trajectory_file_name.endswith(".parquet"):
        current_trajectory_df = pq.ParquetDataset(f'../data/trajectory_files/{trajectory_file_name}').read().to_pandas()
        # Keep only the rows whose flight_id appears in the challenge/submission lists.
        filtered_trajactory_df = current_trajectory_df[current_trajectory_df['flight_id'].astype(int).isin(flight_ids)]
        filtered_frames.append(filtered_trajactory_df)

# pd.concat raises on an empty list, so fall back to an empty frame when no
# parquet file was found.
if filtered_frames:
    trajectory_df = pd.concat(filtered_frames, ignore_index=True)
else:
    trajectory_df = pd.DataFrame({})

# Display the filtered DataFrame
print(f"{len(trajectory_df) = }")
display(trajectory_df)

MemoryError: Unable to allocate 18.5 GiB for an array with shape (11, 225892937) and data type float64

In [None]:
def calculate_stat(
    dataframe: pd.DataFrame, 
    group_by_column: str, 
    target_column: str,
    stat_type: str,
    value_type: str = 'float64'
) -> pd.Series | None:
    result = None
    if stat_type == 'median':
        result = df.groupby(group_by_column).apply(lambda group: group[target_column].median(), meta=('value', value_type)).compute()
    elif stat_type == 'mean':
        result = df.groupby(group_by_column).apply(lambda group: group[target_column].mean(), meta=('value', value_type)).compute()
    return result

In [None]:
# Per-flight statistics, one Series per trajectory column. Every call groups
# trajectory_df by 'flight_id'; the position columns use the median and all
# remaining columns use the mean.
latitude_median = calculate_stat(trajectory_df, 'flight_id', 'latitude', 'median')
longitude_median = calculate_stat(trajectory_df, 'flight_id', 'longitude', 'median')
# altitude, groundspeed and vertical_rate request 'int64' as value type;
# the other columns rely on the 'float64' default.
altitude_mean = calculate_stat(trajectory_df, 'flight_id', 'altitude', 'mean', 'int64')
groundspeed_mean = calculate_stat(trajectory_df, 'flight_id', 'groundspeed', 'mean', 'int64')
track_mean = calculate_stat(trajectory_df, 'flight_id', 'track', 'mean')
vertical_rate_mean = calculate_stat(trajectory_df, 'flight_id', 'vertical_rate', 'mean', 'int64')
track_unwrapped_mean = calculate_stat(trajectory_df, 'flight_id', 'track_unwrapped', 'mean')
u_component_of_wind_mean = calculate_stat(trajectory_df, 'flight_id', 'u_component_of_wind', 'mean')
v_component_of_wind_mean = calculate_stat(trajectory_df, 'flight_id', 'v_component_of_wind', 'mean')
temperature_mean = calculate_stat(trajectory_df, 'flight_id', 'temperature', 'mean')
specific_humidity_mean = calculate_stat(trajectory_df, 'flight_id', 'specific_humidity', 'mean')

# Assemble the summary table; the keyword names become the column labels.
# Note that the last two labels carry no "_mean" suffix.
trajactory_summary_df = pd.DataFrame(dict(
    latitude_median=latitude_median,
    longitude_median=longitude_median,
    altitude_mean=altitude_mean,
    groundspeed_mean=groundspeed_mean,
    track_mean=track_mean,
    vertical_rate_mean=vertical_rate_mean,
    track_unwrapped_mean=track_unwrapped_mean,
    u_component_of_wind_mean=u_component_of_wind_mean,
    v_component_of_wind_mean=v_component_of_wind_mean,
    temperature=temperature_mean,
    specific_humidity=specific_humidity_mean,
))

display(trajactory_summary_df)

In [None]:
def save_dataframe(path: str, dataframe: pd.DataFrame) -> None:
    """Write ``dataframe`` to ``path`` as CSV and print a confirmation.

    The frame's index is not written to the file.

    Args:
        path: Destination path handed to ``DataFrame.to_csv``.
        dataframe: Frame to serialise.
    """
    dataframe.to_csv(path_or_buf=path, index=False)
    confirmation = f"{path} is saved!"
    print(confirmation)

In [None]:
# The summary frame is indexed by flight_id (the group key of every statistic),
# and save_dataframe writes the CSV with index=False. Promote the index to a
# regular column first; otherwise the saved rows cannot be tied back to a flight.
save_dataframe(
    path="../data/trajactory_summary.csv",
    dataframe=trajactory_summary_df.rename_axis("flight_id").reset_index(),
)