In [None]:
from typing import Optional

import matplotlib.pyplot as plt
import pandas as pd
from matplotlib.dates import DateFormatter


def load_parquet(file_path: str) -> Optional[pd.DataFrame]:
    """
    Load a Parquet file into a pandas DataFrame.

    Loading is best-effort: any exception raised while reading the file is
    reported on stdout and signalled by returning ``None`` instead of being
    re-raised, so callers must check the result before using it.

    Parameters:
    file_path (str): The path to the Parquet file.

    Returns:
    Optional[pd.DataFrame]: The loaded DataFrame, or None if loading failed.
    """
    try:
        return pd.read_parquet(file_path)
    except Exception as e:
        # Deliberately broad: code that scans whole directories relies on a
        # failed load yielding None rather than an exception.
        print(f"An error occurred while loading the Parquet file: {e}")
        return None

In [None]:
# Load the trajectory of one training flight.
flight_path = "data/flights_train/prc770847190.parquet"
df = load_parquet(flight_path)
# load_parquet signals a failed read by returning None (it prints the error
# itself), so stop here with a clear message instead of failing later on the
# first attribute access.
if df is None or df.empty:
    raise RuntimeError(f"No trajectory data could be loaded from {flight_path}")

print(df.columns)

# look for this flight_id in flightlist_train.parquet
flightlist_df = load_parquet("data/flightlist_train.parquet")
if flightlist_df is None:
    raise RuntimeError("Could not load data/flightlist_train.parquet")
reference_flight_id = df["flight_id"].values[0]
matching_flights = flightlist_df[flightlist_df["flight_id"] == reference_flight_id]
if matching_flights.empty:
    # Without this guard the lookups below fail with an opaque
    # "index 0 is out of bounds" error.
    raise LookupError(f"flight_id {reference_flight_id!r} not found in flightlist_train.parquet")
origin_icao = matching_flights["origin_icao"].values[0]
destination_icao = matching_flights["destination_icao"].values[0]
print(f"Origin ICAO: {origin_icao}, Destination ICAO: {destination_icao}")

# look for the origin and destination in apt.parquet
apt_df = load_parquet("data/apt.parquet")
if apt_df is None:
    raise RuntimeError("Could not load data/apt.parquet")
origin_info = apt_df[apt_df["icao"] == origin_icao]
destination_info = apt_df[apt_df["icao"] == destination_icao]
print(f"Origin Info:\n{origin_info}, Destination Info:\n{destination_info}")

# plot all "latitude", "longitude" pairs, with the two airports highlighted
fig, ax = plt.subplots()
ax.scatter(df["longitude"], df["latitude"], alpha=0.5, s=10)
ax.scatter(origin_info["longitude"], origin_info["latitude"], color="green", label="Origin", s=50)
ax.scatter(destination_info["longitude"], destination_info["latitude"], color="red", label="Destination", s=50)
ax.legend()
ax.set_title("Flight Paths")
ax.set_xlabel("Longitude")
ax.set_ylabel("Latitude")
ax.axis("equal")
plt.show()

In [None]:
# Among all the files in data/flights_train, collect the flight_ids of the
# flights that have the same origin and destination as the flight loaded
# above (origin_icao / destination_icao).
import os


# os.listdir returns entries in arbitrary order; sort them so the resulting
# list is reproducible from run to run.
all_files = sorted(os.listdir("data/flights_train"))

# Build the flight_id -> (origin, destination) lookup once instead of
# re-filtering the whole flight list for every file. keep="first" matches
# taking the first matching row of the flight list.
flightlist_unique = flightlist_df.drop_duplicates(subset="flight_id", keep="first")
route_by_flight_id = dict(
    zip(
        flightlist_unique["flight_id"],
        zip(flightlist_unique["origin_icao"], flightlist_unique["destination_icao"]),
    )
)

list_of_similar_flights = []
# Walk over every directory entry rather than a hard-coded file count.
for file_name in all_files:
    file_path = os.path.join("data/flights_train", file_name)
    temp_df = load_parquet(file_path)
    # load_parquet returns None when the file cannot be read; skip such
    # entries (and frames without a usable flight_id) explicitly.
    if temp_df is None or temp_df.empty or "flight_id" not in temp_df.columns:
        continue
    temp_flight_id = temp_df["flight_id"].values[0]
    route = route_by_flight_id.get(temp_flight_id)
    if route is None:
        # flight is not listed in the flight list
        continue
    temp_origin_icao, temp_destination_icao = route
    # check for flights that have the same origin and destination as the current df
    if temp_origin_icao == origin_icao and temp_destination_icao == destination_icao:
        list_of_similar_flights.append(temp_flight_id)

print(f"Similar flights with same origin and destination: {list_of_similar_flights}")

In [None]:
# How many flights share the reference flight's origin and destination.
similar_flight_count = len(list_of_similar_flights)
print(similar_flight_count)

In [None]:
# Iterate over all files in data/flights_train and create a dictionary with
# (origin_icao, destination_icao) as key and list of flight_ids as value.
# The flight list is already in memory (flightlist_df); index it once by
# flight_id so each file needs a single dictionary lookup instead of a scan
# of the whole flight list.
from tqdm import trange


# keep="first" matches taking the first matching row of the flight list.
flightlist_unique = flightlist_df.drop_duplicates(subset="flight_id", keep="first")
route_by_flight_id = dict(
    zip(
        flightlist_unique["flight_id"],
        zip(flightlist_unique["origin_icao"], flightlist_unique["destination_icao"]),
    )
)

flight_dict = {}
# Cover every directory entry rather than a hard-coded file count.
for i in trange(len(all_files)):
    file_path = os.path.join("data/flights_train", all_files[i])
    temp_df = load_parquet(file_path)
    # load_parquet returns None when the file cannot be read; skip such
    # entries (and frames without a usable flight_id) explicitly.
    if temp_df is None or temp_df.empty or "flight_id" not in temp_df.columns:
        continue
    temp_flight_id = temp_df["flight_id"].values[0]
    key = route_by_flight_id.get(temp_flight_id)
    if key is None:
        # flight is not listed in the flight list
        continue
    temp_origin_icao, temp_destination_icao = key
    flight_dict.setdefault(key, []).append(temp_flight_id)

print(f"Flight dictionary keys: {list(flight_dict.keys())}")

In [None]:
# print sum of values in flight_dict
total_flights = sum(len(v) for v in flight_dict.values())
print(f"Total number of flights in flight_dict: {total_flights}")

# print number of unique origin-destination pairs
print(f"Number of unique origin-destination pairs: {len(flight_dict)}")

# print number of icao codes in apt.parquet and compare with number of unique
# origin and destination icao codes in flightlist_train.parquet.
# Missing values are dropped first so they are not counted as ICAO codes.
apt_df = load_parquet("data/apt.parquet")
unique_icao_codes_in_apt = set(apt_df["icao"].dropna().unique())
print(f"Number of ICAO codes in apt.parquet: {len(unique_icao_codes_in_apt)}")
unique_origin_icao_codes = set(flightlist_df["origin_icao"].dropna().unique())
unique_destination_icao_codes = set(flightlist_df["destination_icao"].dropna().unique())
unique_icao_codes_in_flightlist = unique_origin_icao_codes | unique_destination_icao_codes
print(f"Number of unique ICAO codes in flightlist_train.parquet: {len(unique_icao_codes_in_flightlist)}")

# the actual comparison: airports used by flights but absent from apt.parquet
icao_codes_missing_from_apt = unique_icao_codes_in_flightlist - unique_icao_codes_in_apt
print(f"Number of flight-list ICAO codes missing from apt.parquet: {len(icao_codes_missing_from_apt)}")

In [None]:
# Plot every similar flight on an OpenStreetMap basemap, one figure per flight.
# Imports are hoisted out of the loop so they are not re-executed per flight.
import matplotlib.pyplot as plt
import contextily as ctx

for flight_id in list_of_similar_flights:
    flight_df = load_parquet(f"data/flights_train/{flight_id}.parquet")
    if flight_df is None:
        # load_parquet already printed the reason; move on to the next flight
        continue
    # NOTE: `df` is rebound on purpose; code further down the notebook reads
    # `df`, which after this loop is the last flight that loaded successfully.
    df = flight_df
    fig, ax = plt.subplots(figsize=(10, 10))

    ax.scatter(df["longitude"], df["latitude"], alpha=0.5, s=10, label="Flight Path")
    ax.scatter(origin_info["longitude"], origin_info["latitude"], color="green", label="Origin", s=50, zorder=5)
    ax.scatter(destination_info["longitude"], destination_info["latitude"], color="red", label="Destination", s=50, zorder=5)

    # Fix the aspect ratio and resolve the resulting axis limits BEFORE the
    # tiles are fetched: with adjustable='datalim' the limits are otherwise
    # only widened at draw time, after the basemap has been requested for the
    # narrower extent, leaving part of the axes without map tiles.
    ax.set_aspect('equal', adjustable='datalim')
    ax.apply_aspect()

    ctx.add_basemap(ax, crs="EPSG:4326", source=ctx.providers.OpenStreetMap.Mapnik)

    ax.legend()
    ax.set_title("Flight Paths")
    ax.set_xlabel("Longitude")
    ax.set_ylabel("Latitude")

    plt.show()
    # release the figure so a long list of flights does not accumulate open figures
    plt.close(fig)

In [None]:


# plot time series for numeric features (excluding timestamp itself)
# Everything below is plotted against the timestamp column, so fail up front
# with a clear message when it is missing.
if "timestamp" not in df.columns:
    raise KeyError("df has no 'timestamp' column; cannot plot features against time")

# ensure timestamp is datetime for better x-axis formatting; assign() returns
# a new frame, so the previously loaded frame is not modified in place.
# A conversion error is allowed to propagate: the time-based axis limits
# below cannot be computed from non-datetime values anyway.
df = df.assign(timestamp=pd.to_datetime(df["timestamp"]))

x = df["timestamp"]
xlabel = "Time"
x_margin = pd.Timedelta(minutes=5)  # padding on both sides of the time axis
for feature in ["altitude", "groundspeed", "track", "vertical_rate"]:
    if feature not in df.columns:
        print(f"Skipping {feature}: column not found in df")
        continue
    fig, ax = plt.subplots(figsize=(10, 3))
    ax.scatter(x, df[feature], alpha=0.5, s=10)
    ax.set_title(f"{feature} as a function of time")
    ax.set_xlabel(xlabel)
    ax.set_xlim(x.min() - x_margin, x.max() + x_margin)
    ax.set_ylabel(feature)
    ax.xaxis.set_major_formatter(DateFormatter("%H:%M"))
    fig.tight_layout()
    plt.show()
    # release the figure once it has been displayed
    plt.close(fig)



In [None]:
from ydata_profiling import ProfileReport

# Build an exploratory profile of the current `df` and write it out as an
# HTML report.
profile = ProfileReport(
    df,
    title="Pandas Profiling Report",
    explorative=True,
)
profile.to_file("parquet_data_profile.html")
