In [1]:
import polars as pl
import pandas as pd
import geopandas as gpd
import contextily as cx
import matplotlib.pyplot as plt
import seaborn as sns
from IPython.core.pylabtools import figsize

In [2]:
# Target cities, lower-cased and sorted so that every per-city loop below
# walks them in the same alphabetical order.
# NOTE(review): presumably `citycode` is stored lower-case in the CSV — confirm.
cities = sorted(map(str.lower, ["Bern", "Zurich", "Innsbruck", "Hamburg", "Bordeaux"]))

# Lazy scan of the detector metadata: `limit` is forced to a string column and
# only rows whose `citycode` is one of the target cities are kept.
in_target_city = pl.col("citycode").is_in(cities)
q = pl.scan_csv(
    "./data/detectors_public.csv",
    schema_overrides={"limit": pl.Utf8},
).filter(in_target_city)

# Materialise the lazy query into an in-memory polars DataFrame.
detector_df = q.collect()

In [3]:
# Convert the polars detector table to pandas and coerce both coordinate
# columns to numbers; anything that cannot be parsed becomes NaN.
pdf = detector_df.to_pandas().assign(
    lat=lambda frame: pd.to_numeric(frame["lat"], errors="coerce"),
    long=lambda frame: pd.to_numeric(frame["long"], errors="coerce"),
)

# One point per detector (x = "long", y = "lat"), tagged as EPSG:4326.
detector_points = gpd.points_from_xy(x=pdf["long"], y=pdf["lat"])
gdf = gpd.GeoDataFrame(pdf, geometry=detector_points, crs="EPSG:4326")

In [4]:
# Draw one detector-location map per city, colouring the points by their
# `fclass` category, and save each map as a PNG under ./plots/maps/.
for city in cities:
    city_df = gdf[gdf["citycode"] == city]

    # Skip cities without any detector rows: an empty plot has no meaningful
    # extent to put a basemap under. This mirrors the empty-data guard used by
    # the histogram loops in this notebook.
    if city_df.empty:
        continue

    ax = city_df.plot(
        figsize=(12, 7),
        markersize=5,
        column="fclass",
        legend=True,
        categorical=True,
    )
    # Add background tiles, telling contextily which CRS the axes are in.
    cx.add_basemap(ax, crs=gdf.crs)

    # Save and close the figure that owns `ax` explicitly instead of relying
    # on matplotlib's implicit "current figure".
    fig = ax.get_figure()
    fig.savefig(f'./plots/maps/{city}_detector_locations.png', dpi=300)
    plt.close(fig)

In [5]:
# Dtypes enforced while scanning the measurement file.
utd_schema = {
    "day": pl.Date,
    "flow": pl.Float64,
    "interval": pl.Int64,
    "error": pl.Float64,
    "speed": pl.Float64,
}

# Timestamp of a measurement: the calendar day cast to a datetime, plus
# `interval` interpreted as a number of seconds.
timestamp_expr = (
    pl.col("day").cast(pl.Datetime) + pl.duration(seconds=pl.col("interval"))
).alias("timestamp")

# Lazy pipeline: scan, keep only the target cities, add the timestamp column.
q = (
    pl.scan_csv("./data/utd19_u.csv", schema_overrides=utd_schema)
    .filter(pl.col("city").is_in(cities))
    .with_columns(timestamp_expr)
)

# Execute the pipeline and load the result into memory.
utd_df = q.collect()

In [6]:
# Attach detector metadata to every measurement row; only rows whose `detid`
# is present in both tables survive the inner join.
# NOTE(review): the join key is `detid` alone — confirm detector ids are
# unique across cities, otherwise measurement rows would be duplicated.
total_df = utd_df.join(
    other=detector_df,
    how="inner",
    on="detid",
)

In [15]:
# Per-city histogram of flow, restricted to values strictly between the
# city's 5th and 95th percentiles; one PNG is written per city.
for city in cities:
    city_df = total_df.filter(pl.col("city") == city)
    flows = city_df.get_column("flow").drop_nulls()
    if flows.is_empty():
        # No flow measurements for this city, so there is nothing to plot.
        continue

    # Percentile bounds are computed on the non-null values only.
    lower, upper = flows.quantile(0.05), flows.quantile(0.95)
    trimmed = city_df.filter(pl.col("flow").is_between(lower, upper, closed="none"))
    values = trimmed.get_column("flow").drop_nulls().to_numpy()

    fig, ax = plt.subplots()
    ax.hist(values)
    ax.set_title(f"{city.capitalize()} Flow Distribution")
    ax.set_xlabel("Flow [veh/h]")
    fig.tight_layout()
    fig.savefig(f'./plots/histograms/{city}_flow_histogram.png', dpi=300)
    plt.close(fig)

In [16]:
# Per-city histogram of detector occupancy, restricted to values strictly
# between the city's 5th and 95th percentiles; one PNG is written per city.
for city in cities:
    city_df = total_df.filter(pl.col("city") == city)
    occ_series = city_df["occ"].drop_nulls()
    if occ_series.len() == 0:
        # No occupancy measurements for this city, so there is nothing to plot.
        continue

    # Percentile bounds are computed on the non-null values only.
    q5 = occ_series.quantile(0.05)
    q95 = occ_series.quantile(0.95)
    clipped = city_df.filter((q5 < pl.col("occ")) & (pl.col("occ") < q95))

    # `occ` is stored as a fraction (the summary statistics printed by this
    # notebook show values around 0-1), while the axis is labelled in percent,
    # so scale the plotted values by 100 to make the label true.
    data = clipped["occ"].drop_nulls().to_numpy() * 100

    fig, ax = plt.subplots()
    ax.hist(data)
    ax.set_title(f"{city.capitalize()} Occupancy Distribution")
    ax.set_xlabel("Occupancy [%]")
    fig.tight_layout()
    fig.savefig(f'./plots/histograms/{city}_occupancy_histogram.png', dpi=300)
    plt.close(fig)

In [19]:
# Per-city histogram of speed, restricted to values strictly between the
# city's 5th and 95th percentiles; one PNG is written per city.
for city in cities:
    city_df = total_df.filter(pl.col("city") == city)
    speeds = city_df.get_column("speed").drop_nulls()
    if speeds.is_empty():
        # No speed measurements for this city, so there is nothing to plot.
        continue

    # Percentile bounds are computed on the non-null values only.
    lower, upper = speeds.quantile(0.05), speeds.quantile(0.95)
    trimmed = city_df.filter(pl.col("speed").is_between(lower, upper, closed="none"))
    values = trimmed.get_column("speed").drop_nulls().to_numpy()

    fig, ax = plt.subplots()
    ax.hist(values)
    ax.set_title(f"{city.capitalize()} Speed Distribution")
    ax.set_xlabel("Speed [km/h]")
    fig.tight_layout()
    fig.savefig(f'./plots/histograms/{city}_speed_histogram.png', dpi=300)
    plt.close(fig)

In [58]:
# Print one LaTeX summary table (polars `describe()` statistics for flow,
# occupancy and speed) per city; missing statistics are rendered as "--".
summary_columns = ["flow", "occ", "speed"]
for city in cities:
    city_df = total_df.filter(pl.col("city") == city)
    stats = city_df.select(summary_columns).describe().to_pandas()
    latex_table = stats.to_latex(
        index=False,
        na_rep='--',
        float_format="%.2f",
        bold_rows=True,
        column_format="r|lll",
        caption=f"Summary Table for {city.capitalize()}",
        label=f"tab:summary-{city}",
    )
    print(latex_table)

\begin{table}
\caption{Summary Table for Bern}
\label{tab:summary-bern}
\begin{tabular}{r|lll}
\toprule
statistic & flow & occ & speed \\
\midrule
count & 1526032.00 & 1526032.00 & 0.00 \\
null_count & 0.00 & 0.00 & 1526032.00 \\
mean & 151.87 & 0.12 & -- \\
std & 646.93 & 0.18 & -- \\
min & 0.00 & 0.00 & -- \\
25% & 24.00 & 0.01 & -- \\
50% & 84.00 & 0.04 & -- \\
75% & 203.80 & 0.16 & -- \\
max & 59270.80 & 1.25 & -- \\
\bottomrule
\end{tabular}
\end{table}

\begin{table}
\caption{Summary Table for Bordeaux}
\label{tab:summary-bordeaux}
\begin{tabular}{r|lll}
\toprule
statistic & flow & occ & speed \\
\midrule
count & 844416.00 & 695520.00 & 0.00 \\
null_count & 0.00 & 148896.00 & 844416.00 \\
mean & 305.77 & 0.08 & -- \\
std & 322.06 & 0.14 & -- \\
min & 0.00 & 0.00 & -- \\
25% & 60.00 & 0.01 & -- \\
50% & 216.00 & 0.03 & -- \\
75% & 456.00 & 0.08 & -- \\
max & 23016.00 & 1.00 & -- \\
\bottomrule
\end{tabular}
\end{table}

\begin{table}
\caption{Summary Table for Hamburg}
\label{tab:

In [54]:
for city

TypeError: 'GroupBy' object is not subscriptable