In [1]:
import pandas as pd

INPUT = "mashup_summary_public.csv"
OUTPUT = "trend_summary.csv"

df = pd.read_csv(INPUT)

# ---------------------------
# Keep exactly one metric per source dataset:
#   - student/professional: depression_rate
#   - general_proxy: mental_illness_history_rate_proxy
# Every other (dataset, metric) combination is dropped.
# ---------------------------
source_col = df["source_dataset"]
metric_col = df["metric_readable"]

is_depression_row = source_col.isin(["student", "professional"]) & metric_col.eq("depression_rate")
is_proxy_row = source_col.eq("general_proxy") & metric_col.eq("mental_illness_history_rate_proxy")

keep = is_depression_row | is_proxy_row
# .copy() so later column assignments do not write into a view of the raw frame.
df = df.loc[keep].copy()

def weighted_rate(g):
    """Return the mean of ``g["rate"]`` weighted by ``g["n"]``.

    Parameters
    ----------
    g : pandas.DataFrame
        Must contain a ``rate`` column and an ``n`` (weight) column.

    Returns
    -------
    float
        ``sum(rate * n) / sum(n)`` over the rows where both ``rate`` and ``n``
        are present; NaN when those rows carry no weight at all.
    """
    # A row with a missing rate contributes nothing to the numerator, so its
    # weight must be left out of the denominator too; otherwise the mean is
    # biased towards zero.
    usable = g["rate"].notna() & g["n"].notna()
    weights = g.loc[usable, "n"]
    total_weight = weights.sum()
    if total_weight == 0:
        # Empty group or all-zero weights: the rate is undefined.
        return float("nan")
    return (g.loc[usable, "rate"] * weights).sum() / total_weight

# Collapse the filtered rows to one row per
# (source_dataset, metric_readable, family_history_flag, financial_bucket),
# with an n-weighted rate and the summed n.
#
# Only the "rate" and "n" columns are handed to the applied function:
# applying over the grouping columns as well is deprecated in recent pandas
# and emits a DeprecationWarning. The group keys come back as columns via
# reset_index().
group_keys = ["source_dataset", "metric_readable", "family_history_flag", "financial_bucket"]
agg = (
    df.groupby(group_keys)[["rate", "n"]]
      .apply(lambda g: pd.Series({"rate": weighted_rate(g), "n": g["n"].sum()}))
      .reset_index()
      # pd.Series above holds a float rate next to n, which coerces n to float;
      # cast it back so counts are stored as integers.
      .astype({"n": "int64"})
)

# Build "hardship_bucket", the common axis used for the trend comparison.
# Rows from general_proxy get their bucket flipped (Low <-> High, Medium kept);
# all other rows keep financial_bucket unchanged.
# NOTE(review): presumably general_proxy's bucket measures financial standing
# rather than hardship, hence the inversion - confirm against the data source.
bucket_levels = ["Low", "Medium", "High"]
invert = dict(zip(bucket_levels, reversed(bucket_levels)))

mask_general = agg["source_dataset"].eq("general_proxy")
flipped_bucket = agg["financial_bucket"].map(invert)  # NaN for labels outside `invert`
agg["hardship_bucket"] = agg["financial_bucket"].mask(mask_general, flipped_bucket)

order = ["Low", "Medium", "High"]


def _trend_row(src, metric, fh, g):
    """Build one trend record for a (source_dataset, metric, family_history_flag) group.

    Parameters
    ----------
    src, metric, fh
        The group's key values; ``fh`` is converted with ``int()``.
    g : pandas.DataFrame
        The group's rows, with ``hardship_bucket``, ``rate`` and ``n`` columns.

    Returns
    -------
    dict
        Per-bucket rates and counts plus the derived trend fields:
        ``delta_high_minus_low``, ``slope_approx``, ``monotonic`` and
        ``direction``.
    """
    # reindex() guarantees all three bucket labels exist; a bucket that is
    # absent from the group shows up as a row of NaN.
    p = g.set_index("hardship_bucket").reindex(order)
    low, med, high = (p.loc[bucket, "rate"] for bucket in order)

    have_ends = pd.notna(low) and pd.notna(high)
    delta = high - low if have_ends else float("nan")
    # Low and High are two bucket steps apart, so this is the change per step.
    slope = delta / 2 if have_ends else float("nan")

    # Evaluated only when all three rates are present; stays False otherwise.
    monotonic = False
    if have_ends and pd.notna(med):
        monotonic = bool((low <= med <= high) or (low >= med >= high))

    # "flat" means the end-point rates are equal. When an end-point is missing
    # the direction cannot be determined, so it is reported as "unknown"
    # instead of being mislabelled as "flat".
    if not have_ends:
        direction = "unknown"
    elif delta > 0:
        direction = "increasing"
    elif delta < 0:
        direction = "decreasing"
    else:
        direction = "flat"

    # A missing bucket counts as zero observations.
    n_low, n_med, n_high = (
        int(p.loc[bucket, "n"]) if pd.notna(p.loc[bucket, "n"]) else 0
        for bucket in order
    )

    return {
        "source_dataset": src,
        "metric": metric,
        "family_history_flag": int(fh),
        "rate_low": low,
        "rate_medium": med,
        "rate_high": high,
        "n_low": n_low,
        "n_medium": n_med,
        "n_high": n_high,
        "delta_high_minus_low": delta,
        "slope_approx": slope,
        "monotonic": monotonic,
        "direction": direction,
        "axis": "hardship_bucket",
    }


rows = [
    _trend_row(src, metric, fh, g)
    for (src, metric, fh), g in agg.groupby(["source_dataset", "metric_readable", "family_history_flag"])
]

# Columns are listed explicitly so that an empty `rows` still produces a frame
# with the expected header; without this, sort_values() raises KeyError when
# the filter matched nothing.
trend_columns = [
    "source_dataset",
    "metric",
    "family_history_flag",
    "rate_low",
    "rate_medium",
    "rate_high",
    "n_low",
    "n_medium",
    "n_high",
    "delta_high_minus_low",
    "slope_approx",
    "monotonic",
    "direction",
    "axis",
]
trend = (
    pd.DataFrame(rows, columns=trend_columns)
      .sort_values(["family_history_flag", "source_dataset"])
      .reset_index(drop=True)
)
trend.to_csv(OUTPUT, index=False)

print("Saved ->", OUTPUT)
print(trend)


Saved -> trend_summary.csv
  source_dataset                             metric  family_history_flag  \
0  general_proxy  mental_illness_history_rate_proxy                    0   
1   professional                    depression_rate                    0   
2        student                    depression_rate                    0   
3  general_proxy  mental_illness_history_rate_proxy                    1   
4   professional                    depression_rate                    1   
5        student                    depression_rate                    1   

   rate_low  rate_medium  rate_high   n_low  n_medium  n_high  \
0  0.219097     0.309171   0.381652  101375    102034   99106   
1  0.003571          NaN   0.044534     280         0     247   
2  0.357063     0.558303   0.732996    5338      2710    6322   
3  0.220012     0.311748   0.386815   36548     35888   38817   
4  0.007246          NaN   0.064039     276         0     203   
5  0.392872     0.624003   0.782651    4826      2

  .apply(lambda g: pd.Series({"rate": weighted_rate(g), "n": int(g["n"].sum())}))
