In [1]:
import pandas as pd
import numpy as np
import os
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.metrics import (
    roc_auc_score, accuracy_score, f1_score, precision_score, recall_score
)
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
import joblib

# === 1. 读取表达矩阵（仅保留 AUC > 0.6 的基因）===
data_path = "merged_train.csv"  # 请按你的路径修改
X = pd.read_csv(data_path)

# === 2. 读取 metadata 并整合样本标签 ===
metadata_dir = "../metadata"
metadata_files = [
    "E-MTAB-316_metadata.csv",
    "GSE5900_metadata.csv",
    "GSE6477_metadata.csv",
    "GSE13591_metadata.csv"
]

metadata_all = pd.concat([
    pd.read_csv(os.path.join(metadata_dir, file))
    for file in metadata_files
], ignore_index=True)

# 仅保留 SampleID 和 label 两列，并排除 label 缺失
metadata_all = metadata_all[["SampleID", "label"]]
metadata_all = metadata_all[metadata_all["label"].notna()]

# === 3. 构建 y ===
label_df = metadata_all.set_index("SampleID")
y = X["SampleID"].map(label_df["label"])

# === 4. 丢弃无标签的样本 ===
valid_idx = y.notna()
y = y[valid_idx].astype(int)
X_model = X.loc[valid_idx].drop(columns=["SampleID", "Dataset"], errors="ignore")

# === 5. 构建 Pipeline：标准化 + 随机森林 ===
pipeline = Pipeline([
    ("scaler", StandardScaler()),
    ("clf", RandomForestClassifier(random_state=42))
])

# === 6. 设置调参网格 ===
param_grid = {
    "clf__n_estimators": [100, 200, 500],
    "clf__max_depth": [None, 10, 20, 30],
    "clf__min_samples_split": [2, 5, 10]
}

# === 7. 进行网格搜索（5-Fold）===
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
grid = GridSearchCV(
    pipeline,
    param_grid,
    cv=cv,
    scoring="roc_auc",
    n_jobs=-1,
    verbose=1
)
grid.fit(X_model, y)
best_model = grid.best_estimator_

# === 8. 手动 5-Fold 验证评分 ===
scores = {
    "ROC_AUC": [],
    "ACCURACY": [],
    "F1": [],
    "PRECISION": [],
    "RECALL": []
}

for train_idx, test_idx in cv.split(X_model, y):
    X_train, X_test = X_model.iloc[train_idx], X_model.iloc[test_idx]
    y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]

    best_model.fit(X_train, y_train)
    y_pred = best_model.predict(X_test)
    y_proba = best_model.predict_proba(X_test)[:, 1]

    scores["ROC_AUC"].append(roc_auc_score(y_test, y_proba))
    scores["ACCURACY"].append(accuracy_score(y_test, y_pred))
    scores["F1"].append(f1_score(y_test, y_pred))
    scores["PRECISION"].append(precision_score(y_test, y_pred))
    scores["RECALL"].append(recall_score(y_test, y_pred))

# === 9. 输出评估结果 ===
print("\n✅ Random Forest 5-Fold 平均评估结果：")
for metric, values in scores.items():
    print(f"{metric}: {np.mean(values):.4f}")

Fitting 5 folds for each of 36 candidates, totalling 180 fits

✅ Random Forest 5-Fold 平均评估结果：
ROC_AUC: 0.8731
ACCURACY: 0.7558
F1: 0.8080
PRECISION: 0.7177
RECALL: 0.9253


In [2]:

# === 10. 保存模型 ===
joblib.dump(best_model, "random_forest_allgenesmodel.pkl")
print("\n✅ 模型已保存为 random_forest_allgenesmodel.pkl")


✅ 模型已保存为 random_forest_allgenesmodel.pkl


In [1]:
import pandas as pd
import numpy as np
from sklearn.ensemble import StackingClassifier, RandomForestClassifier
from sklearn.linear_model import LogisticRegressionCV
from xgboost import XGBClassifier
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_validate, StratifiedKFold
from sklearn.metrics import (
    make_scorer, accuracy_score, f1_score, precision_score,
    recall_score, roc_auc_score
)
import joblib
import os

# === 1. 读取表达数据 ===
data_path = "merged_train.csv"
X = pd.read_csv(data_path)

# === 2. 读取标签 metadata ===
metadata_dir = "../../metadata"
metadata_files = [
    "E-MTAB-316_metadata.csv",
    "GSE5900_metadata.csv",
    "GSE6477_metadata.csv",
    "GSE13591_metadata.csv"
]

metadata_all = pd.concat([
    pd.read_csv(os.path.join(metadata_dir, file))
    for file in metadata_files
], ignore_index=True)

metadata_all = metadata_all[["SampleID", "label"]]
metadata_all = metadata_all[metadata_all["label"].notna()]

# === 3. 构建标签向量 y ===
label_df = metadata_all.set_index("SampleID")
y = X["SampleID"].map(label_df["label"])
valid_idx = y.notna()
y = y[valid_idx].astype(int)
X_model = X.loc[valid_idx].drop(columns=["SampleID", "Dataset"], errors="ignore")

# === 4. 构建 stacking 模型 ===
estimators = [
    ('rf', RandomForestClassifier(n_estimators=100, random_state=42)),
    ('xgb', XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42)),
    ('svc', SVC(kernel='rbf', probability=True, random_state=42))
]

final_estimator = LogisticRegressionCV(cv=5, max_iter=1000)

stacking = Pipeline([
    ("scaler", StandardScaler()),
    ("clf", StackingClassifier(
        estimators=estimators,
        final_estimator=final_estimator,
        cv=5,
        n_jobs=-1,
        passthrough=False
    ))
])

# === 5. 定义评分指标 ===
scoring = {
    'roc_auc': 'roc_auc',
    'accuracy': make_scorer(accuracy_score),
    'f1': make_scorer(f1_score),
    'precision': make_scorer(precision_score),
    'recall': make_scorer(recall_score)
}

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# === 6. 交叉验证 ===
results = cross_validate(stacking, X_model, y, cv=cv, scoring=scoring, return_train_score=False)

# === 7. 输出平均结果 ===
print("\n✅ Stacking Classifier 5-Fold 评估结果:")
for metric in scoring.keys():
    score_values = results[f'test_{metric}']
    print(f"{metric.upper()}: {np.mean(score_values):.4f}")


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

P


✅ Stacking Classifier 5-Fold 评估结果:
ROC_AUC: 0.8618
ACCURACY: 0.7420
F1: 0.8006
PRECISION: 0.7046
RECALL: 0.9296


In [2]:
# === 8. 训练全量模型并保存 ===
stacking.fit(X_model, y)
joblib.dump(stacking, "stacking_allgenesmodel.pkl")
print("\n✅ 模型已保存为 stacking_allgenesmodel.pkl")

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.




✅ 模型已保存为 stacking_allgenesmodel.pkl


In [11]:
import os
import pandas as pd
import joblib
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, roc_auc_score

# 1. 模型路径（仅保留两个）
model_paths = {
    "RandomForest": "random_forest_allgenesmodel.pkl",
    "Stacking": "stacking_allgenesmodel.pkl"
}

# 2. 加载测试集
test_df = pd.read_csv("merged_test.csv")

# 3. 加载标签
metadata_dir = "../../metadata"
metadata_files = [
    "E-MTAB-317_metadata.csv",
    "GSE2113_metadata.csv",
    "GSE235356_metadata.csv",
]
metadata_all = pd.concat([
    pd.read_csv(os.path.join(metadata_dir, f))[["SampleID", "label"]] for f in metadata_files
])
metadata_all = metadata_all.dropna(subset=["label"])
metadata_all["label"] = metadata_all["label"].astype(int)

# 4. 构建测试集 X 和 y
label_df = metadata_all.set_index("SampleID")
y_test = test_df["SampleID"].map(label_df["label"])
valid_idx = y_test.notna()
X_test = test_df.loc[valid_idx].drop(columns=["SampleID"])
y_test = y_test[valid_idx].astype(int)

# 5. 测试函数
def evaluate_model(name, model, X, y):
    if hasattr(model, "feature_names_in_"):
        X = X.loc[:, model.feature_names_in_]
    else:
        try:
            scaler = model.named_steps["scaler"]
            X = X.loc[:, scaler.feature_names_in_]
        except:
            pass
    y_pred = model.predict(X)
    y_prob = model.predict_proba(X)[:, 1] if hasattr(model, "predict_proba") else None
    return {
        "Model": name,
        "Accuracy": accuracy_score(y, y_pred),
        "F1": f1_score(y, y_pred),
        "Precision": precision_score(y, y_pred),
        "Recall": recall_score(y, y_pred),
        "ROC AUC": roc_auc_score(y, y_prob) if y_prob is not None else "N/A"
    }

# 6. 批量测试模型
results = []
for name, path in model_paths.items():
    model = joblib.load(path)
    score = evaluate_model(name, model, X_test, y_test)
    results.append(score)

# 7. 输出
results_df = pd.DataFrame(results)
print(results_df)

          Model  Accuracy        F1  Precision    Recall   ROC AUC
0  RandomForest  0.700000  0.797927   0.735669  0.871698  0.687064
1      Stacking  0.684615  0.784588   0.732026  0.845283  0.697419


In [None]:
import os
import pandas as pd
import joblib
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, StackingClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, roc_auc_score

# === 1. 加载训练集和测试集 ===
train_df = pd.read_csv("merged_train.csv")
test_df = pd.read_csv("merged_test.csv")

# === 2. 合并所有 metadata 获取标签 ===
metadata_dir = "../../metadata"
metadata_files = [f for f in os.listdir(metadata_dir) if f.endswith(".csv")]

metadata_all = pd.concat([
    pd.read_csv(os.path.join(metadata_dir, f))[["SampleID", "label"]] for f in metadata_files
])
metadata_all = metadata_all.dropna(subset=["label"])
metadata_all["label"] = metadata_all["label"].astype(int)
label_df = metadata_all.set_index("SampleID")

# === 3. 构造 X_train, y_train, X_test, y_test ===
y_train = train_df["SampleID"].map(label_df["label"])
y_test = test_df["SampleID"].map(label_df["label"])

train_valid = y_train.notna()
test_valid = y_test.notna()

X_train = train_df.loc[train_valid].drop(columns=["SampleID"])
X_test = test_df.loc[test_valid].drop(columns=["SampleID"])

# 🚫 防止非数值型特征干扰
X_train = X_train.select_dtypes(include=["number"])
X_test = X_test.select_dtypes(include=["number"])

y_train = y_train[train_valid].astype(int)
y_test = y_test[test_valid].astype(int)

# === 4. 构建带乘积交互项的 base 模型 Pipeline ===
def make_base_model(clf):
    return Pipeline([
        ("poly", PolynomialFeatures(degree=2, interaction_only=True, include_bias=False)),
        ("scaler", StandardScaler()),
        ("clf", clf)
    ])

base_learners = [
    ("rf", make_base_model(RandomForestClassifier(n_estimators=100, random_state=42))),
    ("svm", make_base_model(SVC(kernel='linear', probability=True)))
]

final_estimator = LogisticRegression(max_iter=1000)

stacking_model = StackingClassifier(
    estimators=base_learners,
    final_estimator=final_estimator,
    cv=5,
    n_jobs=-1
)

# === 5. 模型训练 ===
print("Training stacking model with interaction features...")
stacking_model.fit(X_train, y_train)

# === 6. 保存模型 ===
joblib.dump(stacking_model, "stacking_with_interactions.pkl")

# === 7. 模型评估 ===
y_pred = stacking_model.predict(X_test)
y_prob = stacking_model.predict_proba(X_test)[:, 1]

results = {
    "Model": "Stacking_with_Interaction",
    "Accuracy": accuracy_score(y_test, y_pred),
    "F1": f1_score(y_test, y_pred),
    "Precision": precision_score(y_test, y_pred),
    "Recall": recall_score(y_test, y_pred),
    "ROC AUC": roc_auc_score(y_test, y_prob)
}

results_df = pd.DataFrame([results])
print("\n=== Evaluation on Test Set ===")
print(results_df)

Training stacking model with interaction features...
