In [None]:
import pandas as pd
import numpy as np
import plotly.express as px

In [None]:
# Resolve where the survey CSV is read from and where the parquet result goes.
# If a `snakemake` object is present in the namespace, its first input and
# first output are used; otherwise hard-coded relative paths serve as a
# fallback for running the notebook by hand.
if "snakemake" not in locals():
    input_path, output_path = (
        "../../../../resources/brussels/airport/group_size_surveys_TML.csv",
        "../../../../results/brussels/airport/group_sizes.parquet",
    )
else:
    input_path, output_path = snakemake.input[0], snakemake.output[0]

In [None]:
# Read the survey table from the CSV file selected in the path cell
df = pd.read_csv(filepath_or_buffer = input_path)

In [None]:
# Reshape from wide to long: the "2019", "2022" and "2023" columns are stacked
# into a "year" label column plus a "count" value column, and the identifier
# column "travel_company_total" is renamed to "group_size"
df = (
    df
    .melt(
        id_vars = ["travel_company_total"],
        value_vars = ["2019", "2022", "2023"],
        var_name = "year",
        value_name = "count",
    )
    .rename(columns = { "travel_company_total": "group_size" })
)

In [None]:
# Cap the group size at 6: any larger value is replaced by 6, after which the
# counts are summed again per (group_size, year) so that the capped rows are
# folded into a single bucket
df = (
    df
    .assign(group_size = np.minimum(df["group_size"], 6))
    .groupby(["group_size", "year"], as_index = False)["count"]
    .sum()
)

In [None]:
# Calculate shares
# Each row's weight is its count divided by the summed count of its year, so
# the weights within a year add up to 1 (provided that year's total is not 0).
# The per-year total is broadcast onto the rows with a grouped transform
# instead of a merge: row order and index are left untouched, and the cell can
# be re-run safely (repeating the merge would create total_x / total_y columns
# and then fail when looking up df["total"]).
df["total"] = df.groupby("year")["count"].transform("sum")
df["weight"] = df["count"] / df["total"]

In [None]:
# Grouped bar chart: one cluster of bars per group size, one bar per year,
# bar height given by the share computed in the "weight" column
px.bar(
    data_frame = df,
    x = "group_size",
    y = "weight",
    color = "year",
    barmode = "group",
    title = "Distribution of group size in the interviews by year",
)

In [None]:
# Keep only the rows labelled "2023" and only the group size / weight columns,
# write that table to the parquet output path and show it as the cell result
df_output = df.loc[df["year"] == "2023", ["group_size", "weight"]].copy()
df_output.to_parquet(output_path)
df_output