# Aggregation for Excel visualisation

This notebook combines sentence-level predictions with page counts, aggregates by company/year/label, and computes metrics across multiple probability thresholds.

In [None]:
import numpy as np
import pandas as pd
from pathlib import Path


## Load combined predictions

`combine_csvs.py` output should be in `outputs/final_result.csv`.

In [None]:
# Read the combined sentence-level predictions CSV into `df`.
preds_path = Path("outputs/final_result.csv")
df = pd.read_csv(preds_path)
df.head()  # notebook preview of the first rows

## Reshape to long format (label + probability) and compute word counts

In [None]:
# Maps each per-theme probability column to the short label used downstream.
label_map = {
    "prob_fin_label": "fin",
    "prob_soc_label": "soc",
    "prob_env_label": "env",
    "prob_maori_label": "maori",
}

# Whitespace-delimited token count per sentence. Missing sentences are
# stringified to "nan" here, which is why they are excluded from `long_df`
# below instead of being counted as one-word sentences.
df["word_count"] = df["sentence"].astype(str).str.split().str.len()

# Clean the grouping keys before reshaping so `long_df` does not depend on
# which cell ran first: `year` is coerced to nullable integers (unparseable
# values become <NA>) and rows without company/year/sentence are dropped.
# This keeps the `year` dtype compatible with the page-count table it is
# merged with during aggregation.
long_df = (
    df.assign(year=pd.to_numeric(df["year"], errors="coerce").astype("Int64"))
    .dropna(subset=["company", "year", "sentence"])
    .melt(
        id_vars=["company", "year", "word_count"],
        value_vars=list(label_map.keys()),
        var_name="label",
        value_name="probability",
    )
)
# Replace the probability column names with their short theme labels.
long_df["label"] = long_df["label"].map(label_map)
long_df = long_df.dropna(subset=["label", "probability"])
long_df.head()


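## Basic checks and required columns

Note: this cell validates and cleans `df` *after* the reshape cell above has already read from it, so a missing column surfaces there first as a `KeyError` rather than as the `ValueError` raised below.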

In [None]:
# Columns the rest of the notebook reads from the predictions table.
required_cols = {
    "company",
    "year",
    "sentence",
    "prob_fin_label",
    "prob_soc_label",
    "prob_env_label",
    "prob_maori_label",
}
missing = required_cols - set(df.columns)
if missing:
    raise ValueError(f"Missing required columns: {missing}")

# Unparseable years become missing values, then the column is stored as
# nullable integers.
df["year"] = pd.to_numeric(df["year"], errors="coerce")
df["year"] = df["year"].astype("Int64")

# Keep only rows where every key field is present.
df = df.dropna(subset=["company", "year", "sentence"], how="any")
df.head()


## Load and aggregate page counts

`merge_page_counts.py` output should be in `outputs/page_counts_merged.csv`. Pages are summed per company/year.

In [None]:
# Read the merged page-count table.
pages_path = Path("outputs/page_counts_merged.csv")
pages = pd.read_csv(pages_path)

# Stop early if any column needed for the per-page metrics is absent.
required_pages = {"company", "year", "pages"}
missing_pages = required_pages - set(pages.columns)
if missing_pages:
    raise ValueError(f"Missing page count columns: {missing_pages}")

# Coerce both numeric columns; values that cannot be parsed become missing.
pages["year"] = pd.to_numeric(pages["year"], errors="coerce").astype("Int64")
pages["pages"] = pd.to_numeric(pages["pages"], errors="coerce")

# One row per company/year holding the summed page count.
page_counts = (
    pages.dropna(subset=["company", "year", "pages"])
    .groupby(["company", "year"])["pages"]
    .sum()
    .reset_index()
)
page_counts.head()

## Aggregation helper

In [None]:
# Earliest year present in `df`; summarize_over_threshold uses it as the
# reference year when computing `theme_index_base`.
BASE_YEAR = int(df["year"].min())

def summarize_over_threshold(
    df_long: pd.DataFrame,
    threshold: float,
    page_counts_df: "pd.DataFrame | None" = None,
    base_year: "int | None" = None,
) -> pd.DataFrame:
    """Aggregate long-format predictions per company/year/label at one threshold.

    A row counts as "over" when its probability is strictly greater than
    ``threshold``.

    Args:
        df_long: Table with ``company``, ``year``, ``label``, ``word_count``
            and ``probability`` columns.
        threshold: Probability cut-off.
        page_counts_df: Table with ``company``, ``year`` and ``pages``
            columns, left-joined onto the aggregate. Defaults to the
            notebook-level ``page_counts``.
        base_year: Reference year for ``theme_index_base``. Defaults to the
            notebook-level ``BASE_YEAR``.

    Returns:
        One row per (company, year, label) with sentence/word totals, the
        over-threshold counts and shares, per-page ratios (NaN when the page
        count is missing or zero), the per-1000 rates, ``theme_index_base``
        (over-count relative to the same company/label in the base year,
        times 100; 0.0 when there is no positive base count) and the
        ``threshold`` that was applied.
    """
    # Fall back to the notebook globals so existing two-argument calls still work.
    if page_counts_df is None:
        page_counts_df = page_counts
    if base_year is None:
        base_year = BASE_YEAR

    over_mask = df_long["probability"] > threshold
    enriched = df_long.assign(
        over=over_mask,
        over_word_count=df_long["word_count"].where(over_mask, 0),
        # Probabilities at or below the threshold are masked to NaN so the
        # built-in mean/sum reducers only see the over-threshold rows.
        prob_over=df_long["probability"].where(over_mask),
    )

    agg = (
        enriched
        .groupby(["company", "year", "label"])
        .agg(
            total_sentences=("probability", "size"),
            total_words=("word_count", "sum"),
            over_count=("over", "sum"),
            over_share=("over", "mean"),
            over_word_count=("over_word_count", "sum"),
            mean_prob_over=("prob_over", "mean"),
            sum_prob_over=("prob_over", "sum"),
        )
        .reset_index()
    )
    # Groups without any over-threshold row have an undefined mean; report 0.0.
    agg["mean_prob_over"] = agg["mean_prob_over"].fillna(0.0)

    agg = agg.merge(page_counts_df, on=["company", "year"], how="left")
    # A zero page count would produce infinite ratios; mask it as NaN, which
    # keeps the column (and the ratios derived from it) numeric.
    agg["pages"] = agg["pages"].where(agg["pages"] != 0)

    agg["over_sentence_word_share"] = agg["over_count"] / agg["total_words"]
    agg["over_word_share"] = agg["over_word_count"] / agg["total_words"]
    agg["sentences_per_page"] = agg["total_sentences"] / agg["pages"]
    agg["over_sentences_per_page"] = agg["over_count"] / agg["pages"]
    agg["over_words_per_page"] = agg["over_word_count"] / agg["pages"]

    agg["theme_rate_per_1000_words"] = np.where(
        agg["total_words"] > 0,
        (agg["over_count"] / agg["total_words"]) * 1000,
        0.0,
    )
    non_theme = agg["total_sentences"] - agg["over_count"]
    agg["theme_ratio_per_1000_sentences"] = np.where(
        non_theme > 0,
        (agg["over_count"] / non_theme) * 1000,
        0.0,
    )

    # Over-count of each (label, company) in the base year, attached to every
    # row with a left join so the original row order is kept.
    base_counts = (
        agg.loc[agg["year"] == base_year, ["label", "company", "over_count"]]
        .drop_duplicates(subset=["label", "company"])
        .rename(columns={"over_count": "base_over_count"})
    )
    base_for_row = agg[["label", "company"]].merge(
        base_counts, on=["label", "company"], how="left"
    )["base_over_count"]
    # Missing or non-positive base counts are masked so the index becomes 0.0
    # instead of dividing by zero.
    positive_base = base_for_row.where(base_for_row > 0).to_numpy()
    agg["theme_index_base"] = (agg["over_count"] / positive_base * 100).fillna(0.0)

    agg["avg_theme_score_relevant"] = agg["mean_prob_over"]
    agg["threshold"] = threshold
    return agg


## Build summaries for all thresholds

In [None]:
# Probability cut-offs to evaluate; one summary table is built per value.
thresholds = [0.7, 0.75, 0.8, 0.85, 0.9, 0.95]
summaries = [summarize_over_threshold(long_df, thr) for thr in thresholds]
# Stack the per-threshold tables; each row carries its own `threshold` value.
summary_all = pd.concat(summaries, ignore_index=True)
summary_all.head()


## Save for Excel

In [None]:
# Write the stacked summaries to CSV, creating the output folder if needed.
output_path = Path("outputs/aggregation_thresholds.csv")
output_path.parent.mkdir(parents=True, exist_ok=True)
summary_all.to_csv(output_path, index=False)  # the row index is not written
print(f"Saved: {output_path}")