In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
df = pd.read_csv('final_selection.csv')
df["icu_status"] = df["icustay_seq"].replace({2:1, np.nan:0, 3:1, 4:1, 5:1,1:1})

In [2]:

df = df[df["icustay_seq"]==1]
df = df.sort_values(by=['subject_id', 'admittime']).groupby('subject_id').head(1)
# df = df[df["icustay_seq"]==1]
df.drop_duplicates(subset=['subject_id'], inplace=True)

In [3]:
import numpy as np  
# df["icustay_seq"] = df["icustay_seq"].replace({2:1, np.nan:0, 3:1, 4:1, 5:1})
df['gender'] = df['gender'].replace({'M': 1, 'F': 0})
df['dod'] = pd.to_datetime(df['dod'], format='%d/%m/%Y', errors='coerce')
df['admittime'] = pd.to_datetime(df['admittime'], format='%d/%m/%Y %H:%M:%S', errors='coerce')
df['dischtime'] = pd.to_datetime(df['dischtime'], format='%d/%m/%Y %H:%M:%S', errors='coerce')

# 先计算死亡时间
df['survival_time'] = (df['dod'] - df['admittime']).dt.days

# 缺失（即生存/删失）时用出院时间
df['survival_time'] = np.where(
    df['survival_time'].isna(),
    (df['dischtime'] - df['admittime']).dt.days,
    df['survival_time']
)
df['outcome'] = df['dod'].notna().astype(int)
# 创建survival_365列
df['survival_30'] = ((df['outcome'] == 0) | (df['survival_time'] >= 30)).astype(int)

# 打印分布情况
print("survival_30布:")
print(df['survival_30'].value_counts())


survival_30布:
survival_30
1    836
0    301
Name: count, dtype: int64


  df['gender'] = df['gender'].replace({'M': 1, 'F': 0})


In [None]:
import pandas as pd
import numpy as np

# 首先计算四分位数组
df['tibc_quantile_group'] = pd.qcut(df['tibc'], q=4, labels=False) + 1
df['ferritin_quantile_group'] = pd.qcut(df['ferritin'], q=4, labels=False) + 1
df['iron_quantile_group'] = pd.qcut(df['iron'], q=4, labels=False) + 1
df['transferrin_quantile_group'] = pd.qcut(df['transferrin'], q=4, labels=False) + 1

# 获取tibc四个亚组的统计信息
tibc_stats = df.groupby('tibc_quantile_group')['tibc'].agg([
    ('count', 'count'),          # 每个亚组的样本数
    ('median', 'median'),        # 中位数
    ('lower_bound', 'min'),      # 下限（最小值）
    ('upper_bound', 'max')       # 上限（最大值）
]).reset_index()

print("tibc四个亚组的统计信息:")
print(tibc_stats)


ferritin_stats = df.groupby('ferritin_quantile_group')['ferritin'].agg([
    ('count', 'count'),          # 每个亚组的样本数
    ('median', 'median'),        # 中位数
    ('lower_bound', 'min'),      # 下限（最小值）
    ('upper_bound', 'max')       # 上限（最大值）
]).reset_index()    
print("ferritin四个亚组的统计信息:")
print(ferritin_stats)

iron_stats = df.groupby('iron_quantile_group')['iron'].agg([
    ('count', 'count'),          # 每个亚组的样本数
    ('median', 'median'),        # 中位数
    ('lower_bound', 'min'),      # 下限（最小值）
    ('upper_bound', 'max')       # 上限（最大值）
]).reset_index()    
print("iron四个亚组的统计信息:")
print(iron_stats)



tibc四个亚组的统计信息:
   tibc_quantile_group  count  median  lower_bound  upper_bound
0                    1    287   126.0           29          152
1                    2    282   174.0          153          195
2                    3    284   217.0          196          247
3                    4    284   288.0          248          751
ferritin四个亚组的统计信息:
   ferritin_quantile_group  count  median  lower_bound  upper_bound
0                        1    285   122.0          8.0        223.0
1                        2    284   349.5        224.0        545.0
2                        3    284   850.0        546.0       1319.0
3                        4    284  2437.0       1321.0     200205.0
iron四个亚组的统计信息:
   iron_quantile_group  count  median  lower_bound  upper_bound
0                    1    301    16.0            5           22
1                    2    272    29.0           23           37
2                    3    280    47.0           38           66
3                    4    284   104

In [41]:
# 保留非缺失值不少于 80% 的列（thresh 为最小非 NA 数量）
df.dropna(axis=1, thresh=int(len(df) * 0.8), inplace=True)

In [42]:
df.drop(columns=['dischtime','admittime','hospital_expire_flag',
                 'subject_id','hadm_id','icustay_seq', "malignant_tumor", "icu_status"], inplace=True)


In [22]:
df = df[df['survival_time']>0]

In [43]:
# 方法1 — 一行筛选
cols_with_na = df.columns[df.dtypes.eq('float64') & df.isnull().any()].tolist()
print(cols_with_na)

['neutrophils', 'lymphocytes', 'alt', 'ast', 'total_bilirubin', 'albumin', 'bun', 'creatinine', 'glucose', 'sodium', 'chloride', 'free_calcium', 'total_calcium', 'pao2', 'pco2', 'ph', 'lactate', 'anion_gap', 'inr', 'pt', 'ptt', 'resp_rate', 'temperature', 'weight_admit']


In [44]:
import miceforest as mf
from sklearn.model_selection import train_test_split

# 需要插补的列
cols_to_impute = cols_with_na

# 用重置索引的数据
df_reset = df.reset_index(drop=True)



# 为每个需要插补的列指定预测变量（通常为除自身以外的所有列）
variable_schema = {
    col: [c for c in df_reset.columns if c != col]
    for col in cols_to_impute
}

kernel = mf.ImputationKernel(
    df_reset,
    random_state=1991,
    variable_schema=variable_schema
)

kernel.mice(iterations=5, n_imputations=5)

# 拿到完整数据，但只有 cols_to_impute 会被替换为插补结果
completed = kernel.complete_data(iteration=0)


In [45]:
completed.to_csv('final_imputed_with_sofa_firsticu.csv', index=False)

In [None]:
from sklearn.ensemble import RandomForestClassifier
X = df.drop(columns=["survival_time","outcome","survival_30"])
y = df["survival_30"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train = X_train.reset_index(drop=True)
X_test = X_test.reset_index(drop=True)
y_train = y_train.reset_index(drop=True)
y_test = y_test.reset_index(drop=True)
# 2. 在训练集上创建并“拟合”miceforest内核
# 这一步，miceforest只从X_train中学习插补模型
kds_train = mf.ImputationKernel(
    data=X_train,
    random_state=42
)

# 运行MICE算法（例如5次迭代）
kds_train.mice(iterations=5)

# 3. 获取插补后的训练集（通常取最后一次迭代的结果）
X_train_imputed = kds_train.complete_data()



In [None]:
from sklearn.metrics import roc_auc_score, auc, roc_curve
from tabpfn import TabPFNClassifier
# 4. 关键步骤：使用在训练集上拟合好的内核来转换测试集
# 注意：我们不会在测试集上重新运行MICE迭代，只是应用训练好的模型
X_test_imputed = kds_train.impute_new_data(X_test)

# 获取填充后的测试集数据
X_test_imputed = X_test_imputed.complete_data()

# 5. 现在，使用处理好的数据训练和评估模型
model = TabPFNClassifier()
model.fit(X_train_imputed, y_train)
score = model.score(X_test_imputed, y_test)

print(f"模型在测试集上的准确率: {score:.4f}") # 这个结果是可信的
# 预测概率（用于AUC计算）
y_pred_proba = model.predict_proba(X_test_imputed)[:, 1]  # 取正类的概率

# 预测类别（用于准确率计算）
y_pred = model.predict(X_test_imputed)

# 计算准确率
accuracy = model.score(X_test_imputed, y_test)

# 计算AUC
auc_score = roc_auc_score(y_test, y_pred_proba)

print(f"模型在测试集上的准确率: {accuracy:.4f}")
print(f"模型在测试集上的AUC值: {auc_score:.4f}")

# 可选：绘制ROC曲线
fpr, tpr, thresholds = roc_curve(y_test, y_pred_proba)
roc_auc = auc(fpr, tpr)