In [None]:
%load_ext lab_black
import json
from pathlib import Path

import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from matplotlib import ticker as tck
import seaborn as sns
from wikipedia_cleanup.schema import InfoboxRevision
from typing import Iterable, Tuple, Any, Optional
from ipywidgets import (
    interact,
    IntSlider,
    IntRangeSlider,
    FloatRangeSlider,
    Checkbox,
    ToggleButtons,
)

In [None]:
# Page size in centimetres; the plotting cells read a4[0] as the figure width.
# NOTE(review): 29.7 x 42 cm are the dimensions of A3 (A4 is 21 x 29.7 cm) —
# confirm which size is intended before relying on a4[1].
a4 = (29.7, 42)

# Matplotlib expects figure sizes in inches: multiply a length in cm by `cm`.
cm = 1 / 2.54

# Use seaborn's "ticks" theme for every figure in this notebook.
sns.set_theme(style="ticks")

In [None]:
def get_data(
    n_files: Optional[int] = None,
    data_dir: Path = Path("../matched-infoboxes-extracted/"),
) -> Iterable[Tuple[Any, ...]]:
    """Stream one flat record per infobox property change.

    Every ``*.output.json`` file below ``data_dir`` is read as JSON lines; each
    non-blank line is parsed into an ``InfoboxRevision`` and one tuple is
    yielded for every entry of ``revision.changes``.

    Args:
        n_files: Maximum number of files to read, taken in sorted path order.
            ``None`` reads all files.
        data_dir: Directory that is searched recursively for input files.
            Strings are accepted as well.

    Yields:
        ``(key, revisionId, pageID, pageTitle, property name, previousValue,
        currentValue, validFrom)`` for each change.
    """
    files = sorted(
        path for path in Path(data_dir).rglob("*.output.json") if path.is_file()
    )
    for file in files[:n_files]:
        # Explicit UTF-8 so decoding does not depend on the platform locale.
        with file.open(encoding="utf-8") as f:
            for line in f:
                # Tolerate blank/whitespace-only lines instead of crashing in
                # json.loads; malformed JSON still raises.
                if not line.strip():
                    continue
                revision = InfoboxRevision(**json.loads(line))
                for change in revision.changes:
                    yield (
                        revision.key,
                        revision.revisionId,
                        revision.pageID,
                        revision.pageTitle,
                        change.property.name,
                        change.previousValue,
                        change.currentValue,
                        revision.validFrom,
                    )

In [None]:
# One row per property change, loaded from the first five input files.
df = pd.DataFrame(
    get_data(n_files=5),
    columns=[
        "key",
        "revisionId",
        "pageID",
        "pageTitle",
        "property.name",
        "previousValue",
        "currentValue",
        "validFrom",
    ],
)
df = df.astype(
    {
        "key": str,
        "revisionId": int,
        "pageID": int,
        "pageTitle": str,
        "property.name": str,
        # Nullable string dtype: a missing value stays <NA>. A plain
        # astype(str) would turn None into the literal text "None", which
        # makes isna()/notna() checks on these columns meaningless.
        "previousValue": "string",
        "currentValue": "string",
    }
)
df = df.set_index(["revisionId", "key", "property.name"]).sort_index()
df

In [None]:
@interact(
    bins=IntSlider(50, 1, 100, 1, description="Bins", continuous_update=False),
    kde=Checkbox(True, description="KDE"),
)
def plot_hist1(bins, kde) -> None:
    """Plot the distribution of the number of distinct properties per page.

    Args:
        bins: Number of histogram bins.
        kde: Whether to overlay a kernel density estimate.
    """
    # nunique(), not count(): each row of df is one change, so count() would
    # give the number of change records per page instead of the number of
    # different properties the axis labels promise.
    data = df.reset_index().groupby("pageID")[["property.name"]].nunique()
    _, ax = plt.subplots(figsize=(a4[0] * cm, (a4[0] / 2) * cm), dpi=300)
    sns.histplot(
        data=data,
        stat="percent",
        bins=bins,
        kde=kde,
        log_scale=True,
        legend=False,
        ax=ax,
    )
    ax.set_xlim(left=1)
    ax.yaxis.set_major_formatter(tck.PercentFormatter())
    ax.set_title("Number of Properties per Page")
    ax.set_xlabel("Number of Properties")
    ax.set_ylabel("Percentage of Pages")
    sns.despine(ax=ax)
    plt.show()

In [None]:
@interact(
    bins=IntSlider(50, 1, 100, 1, description="Bins", continuous_update=False),
    kde=Checkbox(True, description="KDE"),
)
def plot_hist2(bins, kde) -> None:
    """Plot how prevalent properties are across pages.

    For every property the x value is the percentage of distinct pages on which
    that property appears at least once.

    Args:
        bins: Number of histogram bins.
        kde: Whether to overlay a kernel density estimate.
    """
    changes = df.reset_index()
    # Distinct pages per property, relative to all distinct pages. Counting
    # rows and dividing by the total row count would instead give each
    # property's share of all change records, not a percentage of pages.
    data = changes.groupby("property.name")[["pageID"]].nunique()
    data["pageID"] = 100 * data["pageID"] / changes["pageID"].nunique()
    _, ax = plt.subplots(figsize=(a4[0] * cm, (a4[0] / 2) * cm), dpi=300)
    sns.histplot(
        data=data,
        stat="percent",
        bins=bins,
        log_scale=True,
        kde=kde,
        legend=False,
        ax=ax,
    )
    ax.xaxis.set_major_formatter(tck.PercentFormatter(decimals=4))
    ax.yaxis.set_major_formatter(tck.PercentFormatter())
    ax.set_title("Prevalence of Properties")
    ax.set_xlabel("Percentage of Pages")
    ax.set_ylabel("Percentage of Properties")
    sns.despine(ax=ax)
    plt.show()

In [None]:
@interact(kind=ToggleButtons(options=["Pie", "Bar"], description="Kind"))
def plot_hist3(kind) -> None:
    """Plot the share of creations, deletions and modifications among all changes.

    A change without a previous value is a creation, one without a current value
    is a deletion, and one that has both is a modification.

    NOTE(review): this only works if missing values in ``df`` are real NA
    values. If the value columns were cast with ``astype(str)``, ``None``
    becomes the text "None" and every change is reported as a modification.

    Args:
        kind: "Pie" or "Bar"; lower-cased and passed on as the pandas plot kind.
    """
    n_changes = len(df)
    has_previous = df["previousValue"].notna()
    has_current = df["currentValue"].notna()

    shares = pd.DataFrame(
        {
            "Percentage": [
                (~has_previous).sum() / n_changes,
                (~has_current).sum() / n_changes,
                (has_previous & has_current).sum() / n_changes,
            ]
        },
        index=["Creation", "Deletion", "Modification"],
    )
    # Largest share first.
    shares = shares.sort_values("Percentage", ascending=False)

    fig, ax = plt.subplots(figsize=(a4[0] * cm, (a4[0] / 2) * cm), dpi=100)
    shares.plot(y="Percentage", kind=kind.lower(), ax=ax, legend=False, rot=0)
    # Values are fractions in [0, 1]; xmax=1 renders them as percentages.
    ax.yaxis.set_major_formatter(tck.PercentFormatter(xmax=1))
    sns.despine(ax=ax)
    ax.set_title("Changes by Type")
    ax.set_xlabel("Type of Change")
    ax.set_ylabel("Percentage of Changes")
    plt.show()

# MVP Model

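Predicts the number of days until a property's next change as the mean number of days per change observed so far (days since the property's first change divided by its cumulative number of changes).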

In [None]:
group_keys = ["pageID", "property.name"]

# Number of change records per (page, property, calendar day). size() counts
# rows, so a change is counted even when its currentValue is missing; counting
# non-null values of one column would silently drop such changes.
daily_changes = (
    df.reset_index()
    .groupby(group_keys + [pd.Grouper(key="validFrom", freq="D")], sort=False)
    .size()
    .to_frame("Changes")
    .sort_index()
)

# Running total of changes per (page, property), one row per day with changes.
timeline = (
    daily_changes.groupby(level=group_keys, sort=False)
    .cumsum()
    .rename(columns={"Changes": "Cumulative Changes"})
    .reset_index(level="validFrom")
)

# Rows are sorted by day, so the first row of each group is its earliest change.
timeline["First Change"] = timeline.groupby(level=group_keys, sort=False)[
    "validFrom"
].transform("first")

# Feature E: mean days per change so far (0 on the day of the first change).
timeline["Days"] = (timeline["validFrom"] - timeline["First Change"]).dt.days
timeline["E"] = timeline["Days"] / timeline["Cumulative Changes"]

# Target y: days until the next day with a change of the same property;
# NaT/NaN for the last observed day of each (page, property).
timeline["Next Change"] = timeline.groupby(level=group_keys, sort=False)[
    "validFrom"
].shift(-1)
timeline["y"] = (timeline["Next Change"] - timeline["validFrom"]) / np.timedelta64(
    1, "D"
)

df2 = timeline.set_index("validFrom", append=True)

df2.head()

In [None]:
# Evaluate only rows that have a known next change (y is NaN for the last
# observation of each page/property).
results = df2.loc[df2["y"].notna(), ["E", "y"]].reset_index(drop=True)
results["res"] = results["y"] - results["E"]
# Mean of the absolute residuals: averaging the signed residuals would let
# over- and under-predictions cancel and report the bias, not the error.
print(f'Absolute Error: {results["res"].abs().mean():.1f} days')
print(f'RMSE: {(results["res"] ** 2).mean() ** 0.5:.1f} days')