In [13]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from lifelines import KaplanMeierFitter, CoxPHFitter
from lifelines.statistics import multivariate_logrank_test
import scipy.stats as stats
from statsmodels.stats.multitest import multipletests

In [15]:
# Read in metadata and feature matrix
# Change to your own paths; feature_mat should have patient_id as rows and features as columns
metadata = pd.read_csv("C:/Users/HUIPU/Desktop/Team_2_HNSC/proj4/processed_metadata.txt", sep="\t", header=0)
metadata = metadata[metadata["hpv_status"] != "positive"]
metadata = metadata[metadata["tumor_site"] != "tonsil"]
res_dir = "C:/Users/HUIPU/Desktop/Team_2_HNSC/proj4"
#feature_mat = pd.read_csv("/Users/clairez/Desktop/BNFO285_Projects/project_4/Wenshu/test/sample_data/TCGA.HNSC.expression_feature_matrix.txt",
#                          sep="\t", header=0, index_col=0)

#features = feature_mat.columns.tolist()
# ==== 读入 CNV 状态矩阵并转置 ====
# 原文件：行 = 基因，列 = 样本（patient）
cnv = pd.read_csv("C:/Users/HUIPU/Desktop/Team_2_HNSC/proj4/filtered_cnv_status.tsv", sep="\t", header=0)

feature_mat = (
    cnv.set_index("gene_name")   # 如果第一列不是 gene 名，请改成实际列名
        .T                       # 转置：行→样本，列→基因
        .rename_axis("patient_id")
        .reset_index()           # patient_id 变成普通列，方便后面 merge
)

# 更新一下特征列表
features = feature_mat.columns.difference(["patient_id"]).tolist()

df = pd.merge(metadata, feature_mat, on="patient_id")
df = pd.get_dummies(df, columns=['gender'], drop_first=True)
df

Unnamed: 0,patient_id,age,OS,OS_time,DSS,DSS_time,DFI,DFI_time,tumor_site,tumor_class,...,ZW10,ZWILCH,ZWINT,ZXDC,ZYG11A,ZYG11B,ZYX,ZZEF1,ZZZ3,gender_MALE
0,TCGA-4P-AA8J,66.0,0.0,102.0,0.0,102.0,,,tongue,primary,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,True
1,TCGA-BA-4074,69.0,1.0,462.0,1.0,462.0,,,tongue,primary,...,-1.0,0.0,-1.0,0.0,0.0,0.0,-1.0,0.0,0.0,True
2,TCGA-BA-4075,49.0,1.0,283.0,1.0,283.0,,,tongue,primary,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,True
3,TCGA-BA-4076,39.0,1.0,415.0,1.0,415.0,,,Larynx,primary,...,-1.0,0.0,0.0,1.0,0.0,0.0,-1.0,0.0,0.0,True
4,TCGA-BA-4077,45.0,1.0,1134.0,1.0,1134.0,,,tongue,primary,...,-1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
441,TCGA-UF-A7JT,72.0,1.0,993.0,1.0,993.0,,,mouth,metastasis,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,False
442,TCGA-UF-A7JV,62.0,1.0,90.0,1.0,90.0,,,Hypopharynx,recurrence,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,False
443,TCGA-UP-A6WW,58.0,0.0,518.0,0.0,518.0,,,tongue,primary,...,-1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,True
444,TCGA-WA-A7GZ,58.0,1.0,625.0,0.0,625.0,0.0,625.0,mouth,primary,...,0.0,0.0,0.0,0.0,0.0,0.0,-1.0,0.0,0.0,True


In [16]:
# Univariate Cox Proportional Hazards Models
def univariate_cox(df, data_type, q_thresh = 0.05):
    results = []
    for feature in features:
        if(data_type in ("cna", "mutation")):
            feat = df[feature].dropna().astype(str)
            # one-hot encode this feature only
            dummies = pd.get_dummies(feat, prefix=feature, drop_first=True)
            cols = ['OS_time','OS'] + list(dummies.columns)

            data = pd.concat([df[['OS_time','OS']], dummies], axis=1).loc[:, cols].dropna()
            
            cph = CoxPHFitter()
            cph.fit(data,
                    duration_col='OS_time',
                    event_col='OS',
                    show_progress=False)
            
            for cov in dummies.columns:
                summ = cph.summary.loc[cov]
                results.append({
                    'feature':   feature,
                    'covariate': cov,
                    'coef':      summ['coef'],
                    'exp(coef)': summ['exp(coef)'],
                    'zscore':    summ['z'],
                    'p':     summ['p'],
                    'lower95%':  summ['exp(coef) lower 95%'],
                    'upper95%':  summ['exp(coef) upper 95%']
                })
        else:
            data = df[[feature, 'OS_time', 'OS']].dropna()

            cph = CoxPHFitter()
            cph.fit(data, duration_col='OS_time', event_col='OS', show_progress=False)
            summ = cph.summary.loc[feature]

            results.append({
                'feature':     feature,
                'coef':        summ['coef'],
                'exp(coef)':   summ['exp(coef)'],
                'zscore':      summ['z'],
                'p':           summ['p'],
                'lower 95%':   summ['exp(coef) lower 95%'],
                'upper 95%':   summ['exp(coef) upper 95%']
            })

    res_df = pd.DataFrame(results).set_index('feature')
    res_df["q"] = multipletests(res_df['p'], method='fdr_bh')[1]
    res_df = res_df.sort_values('q', ascending=True)
    res_df.to_csv(f'{res_dir}{data_type}_univariate_cox_results.tsv', sep="\t", index=True, header=True)
    sig_features = res_df[res_df['q'] < q_thresh]
    sig_features.to_csv(f'{res_dir}{data_type}_univariate_cox_significant_features.tsv', sep="\t", index=True, header=True)
    return sig_features

# Multivariate Cox Proportional Hazards Models
def multivariate_cox(df, data_type, q_thresh = 0.05):
    results = []
    for feature in features:
        if(data_type in ("cna", "mutation")):
            feat = df[feature].dropna().astype(str)
            # one-hot encode this feature only
            dummies = pd.get_dummies(feat, prefix=feature, drop_first=True)
            cols = ['OS_time','OS','age', 'tumor_stage'] \
                + [c for c in df if c.startswith('gender_')] \
                + list(dummies.columns)

            data = pd.concat([df[['OS_time','OS','age', 'tumor_stage'] + 
                             [c for c in df if c.startswith('gender_')]],
                            dummies], axis=1).loc[:, cols].dropna()
            
            cph = CoxPHFitter()
            cph.fit(data, duration_col='OS_time', event_col='OS', show_progress=False)
            
            for cov in dummies.columns:
                summ = cph.summary.loc[cov]
                results.append({
                    'feature':   feature,
                    'covariate': cov,
                    'coef':      summ['coef'],
                    'exp(coef)': summ['exp(coef)'],
                    'zscore':    summ['z'],
                    'p':     summ['p'],
                    'lower95%':  summ['exp(coef) lower 95%'],
                    'upper95%':  summ['exp(coef) upper 95%']
                })
        else:
            cols = ['OS_time','OS','age', 'tumor_stage'] \
                + [c for c in df.columns if c.startswith('gender_')] \
                + [feature]
            data = df[cols].dropna()

            cph = CoxPHFitter()
            cph.fit(data, duration_col='OS_time', event_col='OS', show_progress=False)
            summ = cph.summary.loc[feature]

            results.append({
                'feature':     feature,
                'coef':        summ['coef'],
                'exp(coef)':   summ['exp(coef)'],
                'zscore':      summ['z'],
                'p':           summ['p'],
                'lower 95%':   summ['exp(coef) lower 95%'],
                'upper 95%':   summ['exp(coef) upper 95%']
            })

    res_df = pd.DataFrame(results).set_index('feature')
    res_df["q"] = multipletests(res_df['p'], method='fdr_bh')[1]
    res_df = res_df.sort_values('q', ascending=True)
    res_df.to_csv(f'{res_dir}{data_type}_multivariate_cox_results.tsv', sep="\t", index=True, header=True)
    sig_features = res_df[res_df['q'] < q_thresh]
    sig_features.to_csv(f'{res_dir}{data_type}_multivariate_cox_significant_features.tsv', sep="\t", index=True, header=True)
    return sig_features

In [17]:
univariate_results = univariate_cox(df, "expression", q_thresh = 0.05)
multivariate_results = multivariate_cox(df, "expression", q_thresh = 0.05)

In [22]:
import pandas as pd
import numpy as np
from pathlib import Path
from lifelines import CoxPHFitter
from statsmodels.stats.multitest import multipletests
from tqdm import tqdm

# ========= 0. 结果目录 =========
res_dir = Path("C:/Users/HUIPU/Desktop/Team_2_HNSC/proj4/results/")
res_dir.mkdir(parents=True, exist_ok=True)

# ========= 1. 读取 metadata =========
meta = pd.read_csv(
    "C:/Users/HUIPU/Desktop/Team_2_HNSC/proj4/processed_metadata.txt",
    sep="\t"
)
meta = meta[(meta["hpv_status"] != "positive") & (meta["tumor_site"] != "tonsil")]

# ========= 2. 读取 & 转置 CNV =========
cnv = pd.read_csv(
    "C:/Users/HUIPU/Desktop/Team_2_HNSC/proj4/filtered_cnv_status.tsv",
    sep="\t"
)
feature_mat = (
    cnv.set_index("gene_name")          # 如果首列不是 gene_name，请改这里
       .T
       .rename_axis("patient_id")
       .reset_index()
)

# ========= 3. 合并 & one‑hot gender =========
df = pd.merge(meta, feature_mat, on="patient_id", how="inner")
df = pd.get_dummies(df, columns=["gender"], drop_first=True)

# ========= 4. 特征列表 =========
features = df.columns.difference(
    ["patient_id", "OS_time", "OS", "age", "tumor_stage"] +
    [c for c in df.columns if c.startswith("gender_")]
).tolist()

# ---------- 公共工具 ----------
def _drop_rare_dummies(df, min_count=3):
    """删除出现次数 < min_count 的 dummy 列"""
    keep = [c for c in df.columns if df[c].sum() >= min_count]
    return df[keep]

def _clean_design_matrix(tmp, event_col="OS", min_var=1e-8):
    """去掉零方差 + 完全分离的列"""
    # 1) 零/近零方差
    tmp = tmp.loc[:, tmp.var() > min_var]

    # 2) 纯事件/纯非事件列
    bad = []
    for c in tmp.columns.difference([event_col]):
        if tmp.groupby(event_col)[c].var().min() < min_var:
            bad.append(c)
    return tmp.drop(columns=bad, errors="ignore")

# ---------- 单变量 Cox ----------
def univariate_cox(df, features, data_type="cna", q_thresh=0.05):
    results, skipped = [], []

    for feat in tqdm(features, desc="Univariate"):
        try:
            if data_type in ("cna", "mutation"):
                dummies = pd.get_dummies(df[feat].astype(str), prefix=feat, drop_first=True)
                dummies = _drop_rare_dummies(dummies)
                if dummies.empty:
                    skipped.append((feat, "all rare"))
                    continue
                tmp = pd.concat([df[["OS_time", "OS"]], dummies], axis=1).dropna()
            else:
                tmp = df[["OS_time", "OS", feat]].dropna()

            if tmp.shape[0] < 10:
                skipped.append((feat, "n<10"))
                continue

            tmp = _clean_design_matrix(tmp)

            if tmp.shape[1] <= 2:       # 只有 OS_time、OS，没有协变量
                skipped.append((feat, "after_clean_none"))
                continue

            cph = CoxPHFitter(penalizer=1.0)   # 较强 L2 正则
            cph.fit(tmp, "OS_time", "OS", show_progress=False)

            covs = tmp.columns.difference(["OS_time", "OS"])
            for cov in covs:
                s = cph.summary.loc[cov]
                results.append({
                    "feature": feat,
                    "covariate": cov,
                    "coef": s["coef"],
                    "exp(coef)": s["exp(coef)"],
                    "z": s["z"],
                    "p": s["p"],
                    "lower95%": s["exp(coef) lower 95%"],
                    "upper95%": s["exp(coef) upper 95%"]
                })
        except Exception as e:
            skipped.append((feat, str(e).split("\n")[0]))
            continue

    full = pd.DataFrame(results)
    full["q"] = multipletests(full["p"], method="fdr_bh")[1] if not full.empty else np.nan
    full = full.sort_values("q")
    full.to_csv(res_dir / "cna_univariate_cox_results.tsv", sep="\t", index=False)

    sig = full[full["q"] < q_thresh]
    sig.to_csv(res_dir / "cna_univariate_cox_significant.tsv", sep="\t", index=False)

    pd.DataFrame(skipped, columns=["feature", "reason"]) \
      .to_csv(res_dir / "cna_univariate_skipped.tsv", sep="\t", index=False)

    print(f"[Univariate] 成功 {len(results)}，跳过 {len(skipped)}，显著 {sig.shape[0]}")
    return sig

# ---------- 多变量 Cox ----------
def multivariate_cox(df, sig_df, data_type="cna", q_thresh=0.05):
    base_cols = ["OS_time", "OS", "age", "tumor_stage"] + \
                [c for c in df.columns if c.startswith("gender_")]
    results, skipped = [], []

    for feat in sig_df["feature"].unique():
        try:
            if data_type in ("cna", "mutation"):
                dummies = pd.get_dummies(df[feat].astype(str), prefix=feat, drop_first=True)
                dummies = _drop_rare_dummies(dummies)
                if dummies.empty:
                    skipped.append((feat, "all rare"))
                    continue
                tmp = pd.concat([df[base_cols], dummies], axis=1).dropna()
            else:
                tmp = df[base_cols + [feat]].dropna()

            if tmp.shape[0] < 10:
                skipped.append((feat, "n<10"))
                continue

            tmp = _clean_design_matrix(tmp)

            if tmp.shape[1] <= len(base_cols):  # 协变量全被删除
                skipped.append((feat, "after_clean_none"))
                continue

            cph = CoxPHFitter(penalizer=1.0)
            cph.fit(tmp, "OS_time", "OS", show_progress=False)

            covs = tmp.columns.difference(base_cols)
            for cov in covs:
                s = cph.summary.loc[cov]
                results.append({
                    "feature": feat,
                    "covariate": cov,
                    "coef": s["coef"],
                    "exp(coef)": s["exp(coef)"],
                    "z": s["z"],
                    "p": s["p"],
                    "lower95%": s["exp(coef) lower 95%"],
                    "upper95%": s["exp(coef) upper 95%"]
                })
        except Exception as e:
            skipped.append((feat, str(e).split("\n")[0]))
            continue

    full = pd.DataFrame(results)
    full["q"] = multipletests(full["p"], method="fdr_bh")[1] if not full.empty else np.nan
    full = full.sort_values("q")
    full.to_csv(res_dir / "cna_multivariate_cox_results.tsv", sep="\t", index=False)

    sig = full[full["q"] < q_thresh]
    sig.to_csv(res_dir / "cna_multivariate_cox_significant.tsv", sep="\t", index=False)

    pd.DataFrame(skipped, columns=["feature", "reason"]) \
      .to_csv(res_dir / "cna_multivariate_skipped.tsv", sep="\t", index=False)

    print(f"[Multivariate] 成功 {len(results)}，跳过 {len(skipped)}，显著 {sig.shape[0]}")
    return sig

# ========= 5. 跑模型 =========
uni_sig  = univariate_cox(df, features, data_type="cna", q_thresh=0.05)
multi_sig = multivariate_cox(df, uni_sig, data_type="cna", q_thresh=0.05)


Univariate: 100%|██████████| 17788/17788 [53:11<00:00,  5.57it/s]  


[Univariate] 成功 35887，跳过 1，显著 0
[Multivariate] 成功 0，跳过 0，显著 0
