<a href="https://colab.research.google.com/github/NINGTANG1124/UPF-HFI/blob/main/notebooks/PSW%E2%80%94HFI.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# connect to googledrive
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# --- 0. 导入库 ---
import pandas as pd
import numpy as np
import statsmodels.api as sm
import statsmodels.formula.api as smf
import matplotlib.pyplot as plt
import seaborn as sns

# 用于倾向评分估计 & 平衡表
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import pairwise_distances


In [None]:
# 读取
survey_df = pd.read_excel( "/content/drive/MyDrive/UPF-HFI/Model/outcome/survey_with_HFI.xlsx")
df_upf    = pd.read_excel("/content/drive/MyDrive/UPF-HFI/Model/outcome/weighted_upf_percent.xlsx")

In [None]:
# ==== 先检查 ====
issues = []

# 检查关键列
for col in ["UserID_clean", "HFI_binary", "HFI_raw_score"]:
    if col not in survey_df.columns:
        issues.append(f"survey 缺少 {col}")

for col in ["UserID_clean", "weighted_upf_percent"]:
    if col not in df_upf.columns:
        issues.append(f"upf 缺少 {col}")

# 检查 ID 是否有缺失/重复
for name, df in [("survey", survey_df), ("upf", df_upf)]:
    n_na = df["UserID_clean"].isna().sum()
    n_dup = df["UserID_clean"].duplicated().sum()
    if n_na:  issues.append(f"{name} 的 UserID_clean 有 {n_na} 个缺失")
    if n_dup: issues.append(f"{name} 的 UserID_clean 有 {n_dup} 个重复")

# 检查 upf_percent 是否数值
bad_upf = pd.to_numeric(df_upf["weighted_upf_percent"], errors="coerce").isna().sum()
if bad_upf:
    issues.append(f"weighted_upf_percent 有 {bad_upf} 个非数值/缺失")

# 打印检查结果
if issues:
    print("检查发现以下问题：")
    for i, msg in enumerate(issues, 1):
        print(f"{i}. {msg}")
else:
    print("没有发现问题，可以直接处理")


没有发现问题，可以直接处理


In [None]:
# ==== 检查关键变量缺失情况 ====
check_cols = ["HFI_binary", "weighted_upf_percent"]
missing_info = {}

for col in check_cols:
    if col not in survey_df.columns and col not in df_upf.columns:
        print(f"数据中找不到列: {col}")
        continue

    if col in survey_df.columns:
        n_miss = survey_df[col].isna().sum()
        missing_info[col] = n_miss
        print(f"{col} 在 survey_df 中缺失 {n_miss} 行")

    if col in df_upf.columns:
        n_miss = df_upf[col].isna().sum()
        missing_info[col] = n_miss
        print(f"{col} 在 upf_df 中缺失 {n_miss} 行")

# ==== 判断是否需要处理 ====
if all(v == 0 for v in missing_info.values()):
    print("两列都没有缺失，可以直接进入合并/建模")
else:
    print("存在缺失，需要处理后再合并")


HFI_binary 在 survey_df 中缺失 18 行
weighted_upf_percent 在 upf_df 中缺失 0 行
存在缺失，需要处理后再合并


In [None]:
# 1. 合并 survey 和 upf
df_model = pd.merge(survey_df, df_upf, on="UserID_clean", how="inner")

# 2. 删掉 HFI_binary 缺失的
df_model = df_model.dropna(subset=["HFI_binary"])

print("合并后样本量：", len(df_model))


合并后样本量： 308


In [None]:
# 查看族裔变量的原始分布
df_model["ethn_participant"].value_counts(dropna=False)


Unnamed: 0_level_0,count
ethn_participant,Unnamed: 1_level_1
1,156
3,76
7,33
2,13
4,10
5,8
6,6
8,4
9,2


In [None]:
# 根据 BiB 文献做的三类
df_model["ethnicity3"] = df_model["ethn_participant"].replace({
    1: "White British",
    3: "Pakistani"
})
df_model["ethnicity3"] = df_model["ethnicity3"].where(
    df_model["ethnicity3"].isin(["White British", "Pakistani"]), "Other"
)

df_model["ethnicity3"].value_counts()


Unnamed: 0_level_0,count
ethnicity3,Unnamed: 1_level_1
White British,156
Other,76
Pakistani,76


In [None]:
# 检查主要分析变量缺失
main_vars = [
    "HFI_binary", "weighted_upf_percent",
    "gender_participant", "age_participant", "ethnicity3"
]

missing_info = {}
for col in main_vars:
    n_miss = df_model[col].isna().sum()
    missing_info[col] = n_miss

df_main = df_model[main_vars].dropna().copy()
print("主分析样本量:", len(df_main))

主分析样本量: 308


# 建模

## 0) 预备：映射族裔（WB / PK / Mixed），构建两套数据集

In [74]:
import numpy as np, pandas as pd
import statsmodels.formula.api as smf
from sklearn.linear_model import LogisticRegression

df = df_model.copy()

# 族裔映射（按你现在用的）
eth_map = {1:"White British", 3:"Pakistani", 7:"Mixed"}
df["ethnicity3"] = df["ethn_participant"].map(eth_map).fillna("Other")

# 主分析样本：只保留 WB/PK（可选加 Mixed，看你决定）
df_core = (
    df.loc[df["ethnicity3"].isin(["White British","Pakistani","Mixed"]),
           ["HFI_binary","weighted_upf_percent","age_participant",
            "gender_participant","ethnicity3"]]
      .dropna()
      .copy()
)

print(df_core["ethnicity3"].value_counts())


ethnicity3
White British    156
Pakistani         76
Mixed             33
Name: count, dtype: int64


# PS（仅人口学）→ 稳定化权重 → 主效应（Core-A）

In [75]:
def fit_ps_and_sw(df_in, x_cols, treat="HFI_binary"):
    X = pd.get_dummies(df_in[x_cols], drop_first=True)
    y = df_in[treat].astype(int).values
    m = LogisticRegression(max_iter=1000).fit(X, y)
    ps = m.predict_proba(X)[:,1]
    p1 = y.mean()
    sw = np.where(y==1, p1/ps, (1-p1)/(1-ps))
    return sw, ps

# 1.1 PS 只含人口学
sw_core, ps_core = fit_ps_and_sw(df_core, ["age_participant","gender_participant","ethnicity3"])

# 1.2 主效应（PSW + HC3）
m_coreA = smf.wls("weighted_upf_percent ~ HFI_binary", data=df_core, weights=sw_core)\
             .fit(cov_type="HC3")
d = m_coreA.params["HFI_binary"]
lo, hi = m_coreA.conf_int().loc["HFI_binary"].tolist()
print(f"[Core-A | PSW(人口学)] Δ = {d:.2f} pp (95%CI {lo:.2f}, {hi:.2f})")


[Core-A | PSW(人口学)] Δ = 1.83 pp (95%CI -1.45, 5.10)


# 2) 协变量平衡（SMD 简表，用于 Table 1/附录）

In [76]:
def _w_mean(x,w): return np.sum(w*x)/np.sum(w)
def _w_var(x,w,mu=None):
    if mu is None: mu=_w_mean(x,w)
    return np.sum(w*(x-mu)**2)/np.sum(w)

def _smd(x,t,w=None):
    x=np.asarray(x,float); t=np.asarray(t,int)
    if w is None: w=np.ones_like(x)
    m1=_w_mean(x[t==1],w[t==1]); m0=_w_mean(x[t==0],w[t==0])
    v1=_w_var(x[t==1],w[t==1],m1); v0=_w_var(x[t==0],w[t==0],m0)
    s=np.sqrt((v1+v0)/2+1e-9); return (m1-m0)/s

def balance_table(df_in, covs, treat="HFI_binary", sw=None):
    t=df_in[treat].astype(int).values; w=sw if sw is not None else None
    rows=[]
    for c in covs:
        col=df_in[c]
        if col.dtype=="O" or str(col.dtype).startswith("category"):
            dummies=pd.get_dummies(col, drop_first=False)
            for k in dummies.columns:
                x=dummies[k].values
                rows.append([f"{c}={k}", _smd(x,t,None), _smd(x,t,w)])
        else:
            x=col.values
            rows.append([c, _smd(x,t,None), _smd(x,t,w)])
    return pd.DataFrame(rows, columns=["Covariate","SMD(unw)","SMD(PSW)"])

bal = balance_table(df_core, ["age_participant","gender_participant","ethnicity3"], sw=sw_core)
bal


Unnamed: 0,Covariate,SMD(unw),SMD(PSW)
0,age_participant,0.168897,-0.01731
1,gender_participant,0.008786,0.001347
2,ethnicity3=Mixed,0.0556,0.003244
3,ethnicity3=Pakistani,0.236478,0.00462
4,ethnicity3=White British,-0.258255,-0.006417


# Core-B（唯一交互 HFI × Ethnicity；仍然是“解释性/附录”，非主结论）

In [80]:
m_coreB = smf.wls("weighted_upf_percent ~ HFI_binary * C(ethnicity3)",
                  data=df_core, weights=sw_core).fit(cov_type="HC3")
print(m_coreB.summary().tables[1])

# 分族群内的 Δpp
rows=[]
for g in ["White British","Pakistani", "Mixed"]:
    sub = df_core[df_core["ethnicity3"]==g].copy()
    swg = sw_core[df_core["ethnicity3"]==g]
    mm = smf.wls("weighted_upf_percent ~ HFI_binary", data=sub, weights=swg).fit(cov_type="HC3")
    lo, hi = mm.conf_int().loc["HFI_binary"].tolist()
    rows.append([g, mm.params["HFI_binary"], lo, hi])
coreB_by_eth = pd.DataFrame(rows, columns=["Ethnicity","Δ (pp)","95% CI lower","95% CI upper"]).round(2)
coreB_by_eth


                                                coef    std err          z      P>|z|      [0.025      0.975]
-------------------------------------------------------------------------------------------------------------
Intercept                                    66.1332      2.944     22.466      0.000      60.364      71.903
C(ethnicity3)[T.Pakistani]                   -1.3195      3.502     -0.377      0.706      -8.184       5.545
C(ethnicity3)[T.White British]                5.7143      3.138      1.821      0.069      -0.436      11.864
HFI_binary                                    8.2725      4.933      1.677      0.094      -1.396      17.941
HFI_binary:C(ethnicity3)[T.Pakistani]        -6.9849      5.687     -1.228      0.219     -18.132       4.162
HFI_binary:C(ethnicity3)[T.White British]    -7.5063      5.408     -1.388      0.165     -18.106       3.094


Unnamed: 0,Ethnicity,Δ (pp),95% CI lower,95% CI upper
0,White British,0.77,-3.58,5.11
1,Pakistani,1.29,-4.26,6.84
2,Mixed,8.27,-1.4,17.94


# 4) 稳健性（这一步才提 SES；一句话，不让 SES 抢主角）

In [81]:
# SES 只做稳健性：在 PSW 的基础上，额外把 income、employment 放入 PS
df_ses = df.loc[df["ethnicity3"].isin(["White British","Pakistani"]),
                ["HFI_binary","weighted_upf_percent","age_participant",
                 "gender_participant","ethnicity3","income","employ"]].dropna().copy()

# 收入可离散为分位（更稳），失败则退回原分类
try:
    df_ses["income_q"] = pd.qcut(df_ses["income"].rank(method="first"), 5, labels=list("Q12345"))
    X_ses = ["age_participant","gender_participant","ethnicity3","income_q","employ"]
except Exception:
    df_ses["income_q"] = df_ses["income"].astype(str)
    X_ses = ["age_participant","gender_participant","ethnicity3","income_q","employ"]

sw_ses, _ = fit_ps_and_sw(df_ses, X_ses)

m_ses = smf.wls("weighted_upf_percent ~ HFI_binary", data=df_ses, weights=sw_ses).fit(cov_type="HC3")
d_s = m_ses.params["HFI_binary"]; lo_s, hi_s = m_ses.conf_int().loc["HFI_binary"].tolist()
atten = (d - d_s)/d*100 if d!=0 else np.nan
print(f"[稳健性 | PSW(人口学+SES)] Δ = {d_s:.2f} pp (95%CI {lo_s:.2f}, {hi_s:.2f}); 衰减 {atten:.1f}%")


[稳健性 | PSW(人口学+SES)] Δ = 1.81 pp (95%CI -2.14, 5.77); 衰减 0.8%
