In [None]:
import pandas as pd

# Relative path to the alerts parquet file that the rest of the notebook works from.
alerts_path = "../../data/nps/nps_public_data_alerts.parquet"
alerts_df = pd.read_parquet(alerts_path)

Because pandas works differently from SQL, `window` functions in pandas are mostly used for rolling calculations such as rolling sums and averages. This is reflected in the pandas [documentation](https://pandas.pydata.org/docs/user_guide/window.html), and it is what we'll demonstrate here.

pandas supports 4 types of windowing operations:
- Rolling window: Generic fixed or variable sliding window over the values.
- Weighted window: Weighted, non-rectangular window supplied by the scipy.signal library.
- Expanding window: Accumulating window over the values.
- Exponentially Weighted window: Accumulating and exponentially weighted window over the values.

We'll focus on rolling and expanding windows, since those are the most applicable to data transformation.

In [None]:
# Parse the raw `lastIndexedDate` values into datetimes and keep them in a
# dedicated `alert_date` column, then preview the first rows.
parsed_dates = pd.to_datetime(alerts_df["lastIndexedDate"])
alerts_df["alert_date"] = parsed_dates
alerts_df.head()

We can pick up where we left off in the last lesson:

In [None]:
# Count alerts per (alert_date, category) pair, ordered chronologically.
alerts_by_category = (
    alerts_df.groupby(["alert_date", "category"])["description"]
    .count()
    .reset_index()
    .sort_values("alert_date")
)

alerts_by_category["alert_date"] = pd.to_datetime(
    alerts_by_category["alert_date"], format="%Y-%m-%d"
)

# resample() below operates on the index, so move the dates there.
alerts_by_category.set_index("alert_date", inplace=True)

# Column layout of the final frame: the existing columns plus the date column
# that reset_index() restores for each category.
filled_columns = [*alerts_by_category.columns, "alert_date"]

# Collect one gap-filled frame per category and concatenate once at the end.
# Growing a DataFrame with pd.concat inside the loop copies all accumulated
# rows on every iteration, and seeding it with an empty frame relies on
# deprecated concat behaviour.
filled_frames = []

for category in alerts_by_category["category"].unique():
    if not category:
        # Skip blank category labels.
        continue

    counts = alerts_by_category.loc[
        alerts_by_category["category"] == category, "description"
    ]

    # Daily frequency; dates with no entry for this category become 0.
    resampled = counts.resample("1D").asfreq().fillna(0).reset_index()
    resampled["category"] = category

    filled_frames.append(resampled)

if filled_frames:
    alerts_by_category_filled = pd.concat(filled_frames)[filled_columns]
else:
    # pd.concat raises on an empty list; fall back to an empty frame that still
    # has the columns the sort below needs.
    alerts_by_category_filled = pd.DataFrame(columns=filled_columns)

alerts_by_category_filled.sort_values(by="alert_date").tail()

In [None]:
alerts_df["alert_date"] = pd.to_datetime(alerts_df["lastIndexedDate"])

# Daily alert counts as a Series indexed by (day, category): pd.Grouper buckets
# the datetime index into 1-day bins while "category" is grouped as-is.
dated_alerts = alerts_df.set_index("alert_date")
num_alerts = dated_alerts.groupby([pd.Grouper(freq="1D"), "category"])[
    "description"
].count()

# Pivot categories into columns and put the rows on a continuous daily
# frequency; any (day, category) cell without a count becomes 0.
daily_by_category = num_alerts.unstack().resample("1D").asfreq()
num_alerts_unstacked = daily_by_category[
    ["Caution", "Danger", "Information", "Park Closure"]
].fillna(0)

num_alerts_unstacked.tail()

If you're in analytics, you're likely aware that _rolling counts or averages_ can be incredibly valuable for monitoring trends over time. Pandas makes this quite easy. It will be helpful to first set the `alert_date` as our index.

In [None]:
# Rolling 7-row totals per category (one row per day, so a 7-day window).
# With an integer window, pandas leaves the first 6 rows as NaN because a full
# window is not yet available.
seven_day_window = num_alerts_unstacked.rolling(window=7)
rolling_alerts_7 = seven_day_window.sum().reset_index()
rolling_alerts_7

It can also be useful to compute rolling totals over several window sizes to compare trends.

In [None]:
# 14- and 28-row rolling totals per category. After reset_index() each frame
# has an `alert_date` column plus one column per category; there is no
# `num_alerts` column, so the rename the original applied did nothing and has
# been dropped.
rolling_alerts_14 = num_alerts_unstacked.rolling(window=14).sum().reset_index()
rolling_alerts_28 = num_alerts_unstacked.rolling(window=28).sum().reset_index()

Visualization is outside the scope of this course, but whether you're an analyst or an engineer, plotting your results is an important gut-check! You can do this easily with plotly.

In [None]:
import plotly.express as px


categories = ["Information", "Park Closure", "Caution"]

# Window length in days -> frame of rolling totals computed with that window.
rolling_frames = {
    7: rolling_alerts_7,
    14: rolling_alerts_14,
    28: rolling_alerts_28,
}

for category in categories:

    # Build one frame with a "<category>_<window>" column per window size by
    # renaming each frame's category column and joining on the date.
    plot_df = None
    for window, frame in rolling_frames.items():
        window_df = frame[["alert_date", category]].rename(
            columns={category: f"{category}_{window}"}
        )
        if plot_df is None:
            plot_df = window_df
        else:
            plot_df = plot_df.merge(window_df, on="alert_date")

    # Only plot dates after the start of 2022.
    plot_df = plot_df[plot_df["alert_date"] > "2022-01-01"]

    fig = px.line(
        data_frame=plot_df,
        x="alert_date",
        y=[f"{category}_{window}" for window in rolling_frames],
        title=f"Rolling '{category}' alerts",
    )

    fig.show()

Finally, if we were more interested in accumulating windows:

In [None]:
# Expanding (accumulating) window: each row holds the running total per
# category over all rows up to and including it. The frame has no `num_alerts`
# column, so the rename the original applied did nothing and has been dropped.
cumulative_alerts = num_alerts_unstacked.expanding().sum().reset_index()

cumulative_alerts

In [None]:
import plotly.express as px

# Draw the running total of every category on one chart. The y columns are
# listed explicitly, so this cell needs no separate category list.
fig = px.line(
    data_frame=cumulative_alerts,
    x="alert_date",
    y=["Caution", "Danger", "Information", "Park Closure"],
    title="Cumulative Alerts",
)

fig.show()