In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# Bucket labels from lowest to highest; used further down to order the
# categorical x-axis of the plots.
order = ["Low", "Medium", "High"]

# Summary table read from the working directory. The code below reads the
# columns source_dataset, metric_readable, family_history_flag,
# financial_bucket, rate and n from it.
df = pd.read_csv("mashup_summary_public.csv")

def weighted_rate(g):
    """Return the mean of ``g["rate"]`` weighted by ``g["n"]``.

    Computed as ``sum(rate * n) / sum(n)`` over the rows of ``g``.

    Args:
        g: Table-like object (e.g. a DataFrame or one groupby group) that
            exposes ``"rate"`` and ``"n"`` columns.

    Returns:
        The n-weighted average rate.
    """
    weights = g["n"]
    weighted_values = g["rate"] * weights
    return weighted_values.sum() / weights.sum()

# Aggregate to one row per
# dataset x metric x family_history x financial_bucket, carrying the
# n-weighted mean rate and the total n of each group.
#
# Done with vectorised sums instead of groupby.apply: apply runs a Python
# function per group and, in recent pandas, warns when the frame passed to
# it still contains the grouping columns. sum(rate * n) / sum(n) per group
# is the same value weighted_rate() would return for that group.
group_keys = ["source_dataset", "metric_readable", "family_history_flag", "financial_bucket"]
agg = (
    df.assign(_weighted_rate_sum=df["rate"] * df["n"])
      .groupby(group_keys, as_index=False)[["_weighted_rate_sum", "n"]]
      .sum()
)
agg["rate"] = agg["_weighted_rate_sum"] / agg["n"]
# Drop the helper column and keep the layout: keys, then rate, then n.
agg = agg[group_keys + ["rate", "n"]].reset_index(drop=True)

# Keep only the three series that are plotted:
#   - student and professional rows for the "depression_rate" metric
#   - general_proxy rows for the "mental_illness_history_rate_proxy" metric
is_survey_dataset = agg["source_dataset"].isin(["student", "professional"])
is_general_dataset = agg["source_dataset"] == "general_proxy"
is_depression_metric = agg["metric_readable"] == "depression_rate"
is_proxy_metric = agg["metric_readable"] == "mental_illness_history_rate_proxy"

keep = (is_survey_dataset & is_depression_metric) | (is_general_dataset & is_proxy_metric)
# Copy so that the column added afterwards does not write into a view of
# the unfiltered frame.
agg = agg.loc[keep].copy()

# Unify everything onto a single "hardship_bucket" column.
# student/professional: financial_bucket is used as the hardship level as-is.
# general_proxy: financial_bucket represents income, so it is reversed to
# express hardship (Low income -> High hardship, and vice versa).
invert = {"Low": "High", "Medium": "Medium", "High": "Low"}
mask_general = agg["source_dataset"] == "general_proxy"
reversed_bucket = agg["financial_bucket"].map(invert)
# mask() replaces values only where mask_general is True; all other rows
# keep their original financial_bucket.
agg["hardship_bucket"] = agg["financial_bucket"].mask(mask_general, reversed_bucket)

# Draw one figure per family-history flag, with x = hardship_bucket running
# from Low to High and one line per source dataset.
#
# The lines are plotted at the integer category codes (0, 1, 2) and the
# ticks are labelled from `order`. Passing the string categories straight to
# matplotlib would place them in order of first appearance, so a series that
# lacks a bucket could scramble the Low -> Medium -> High axis.
tick_positions = list(range(len(order)))
for fh in [0, 1]:
    sub = agg[agg["family_history_flag"] == fh].copy()
    sub["hardship_bucket"] = pd.Categorical(sub["hardship_bucket"], categories=order, ordered=True)
    # Labels that are not in `order` became NaN in the line above; they have
    # no position on the axis, so leave them out of the plot.
    sub = sub.dropna(subset=["hardship_bucket"])

    plt.figure()
    for src in ["student", "professional", "general_proxy"]:
        s = sub[sub["source_dataset"] == src].sort_values("hardship_bucket")
        if s.empty:
            continue
        label = src if src != "general_proxy" else "general_proxy (proxy; hardship derived from income)"
        plt.plot(s["hardship_bucket"].cat.codes, s["rate"], marker="o", label=label)

    plt.xticks(tick_positions, order)
    plt.xlabel("hardship_bucket (Low=better, High=worse)")
    plt.ylabel("rate")
    plt.title(f"Trend check on unified hardship axis (family_history={fh})\nNOTE: general uses proxy outcome")
    plt.legend()
    plt.tight_layout()
    plt.savefig(f"plot_hardship_trend_family_{fh}.png", dpi=200)
    # Close the figure so it is not kept open after being saved.
    plt.close()

print("Done! Generated: plot_hardship_trend_family_0.png and plot_hardship_trend_family_1.png")

  .apply(lambda g: pd.Series({"rate": weighted_rate(g), "n": g["n"].sum()}))


Done! Generated: plot_hardship_trend_family_0.png and plot_hardship_trend_family_1.png
