In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.impute import KNNImputer
from sklearn.preprocessing import LabelEncoder

import xgboost as xgb
import lightgbm as lgb
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, roc_curve, auc
from scipy.interpolate import UnivariateSpline

from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

In [None]:
# Load the raw table and build the baseline train/test split.
# Only the `G` column is removed in this cell; the `num` column is kept
# (its drop is disabled for this first experiment).
p_data = (
    pd.read_csv("p_data.csv")
    .drop(columns=['G'])
    # `PTA` contains object (string) data according to the original note;
    # coerce it to numeric so that unparseable entries become NaN.
    .assign(PTA=lambda frame: pd.to_numeric(frame['PTA'], errors='coerce'))
    # Rows with any missing value are discarded here; KNN imputation is
    # tried in a later section of the notebook instead.
    .dropna()
)

# Split into features / target (`S`) and hold out 30% of the rows for testing.
y = p_data['S']
X = p_data.drop(columns=['S'])
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

In [None]:
# Bar chart of how many training rows fall into each class of the target `S`.
class_counts = y_train.value_counts()

fig, ax = plt.subplots(figsize=(8, 6))
# rot=0 keeps the class labels on the x axis horizontal.
class_counts.plot(kind='bar', color='skyblue', edgecolor='black', rot=0, ax=ax)

# Write the count on top of every bar.
for bars in ax.containers:
    ax.bar_label(bars, label_type='edge')

ax.set_title('Category Counts in Column S')
ax.set_xlabel('Category')
ax.set_ylabel('Count')
plt.show()

# 不采样

In [None]:
# Baseline experiment (no resampling): fit every candidate classifier on the
# training split as-is and compare their ROC curves on the held-out test split.

# 3. Candidate models, all seeded with random_state=42.
models = {
    'SVM': SVC(kernel='linear', probability=True, random_state=42),
    'RandomForest': RandomForestClassifier(random_state=42),
    'LogisticRegression': LogisticRegression(random_state=42),
    'xgboost': xgb.XGBClassifier(use_label_encoder=False, eval_metric='auc', random_state=42),
    'lightGBM': lgb.LGBMClassifier(random_state=42),
    'DecisionTree': DecisionTreeClassifier(random_state=42),
    'MLP': MLPClassifier(hidden_layer_sizes=(100, ),  # one hidden layer with 100 units
                          activation='relu',  # ReLU activation
                          solver='adam',  # Adam optimizer
                          alpha=0.0001,  # L2 regularization strength
                          max_iter=1000,  # maximum number of training iterations
                          random_state=42),
}

fig, ax = plt.subplots()

# 4. Fit each model and draw its ROC curve.
for model_name, model in models.items():
    model.fit(X_train, y_train)
    # Column 1 of predict_proba is used as the positive-class score
    # (assumes a binary target — TODO confirm).
    y_probs = model.predict_proba(X_test)[:, 1]
    fpr, tpr, _ = roc_curve(y_test, y_probs)
    # Stored as `auc_value` so the `auc` function imported from
    # sklearn.metrics is not overwritten by a float.
    auc_value = roc_auc_score(y_test, y_probs)
    ax.plot(fpr, tpr, lw=2, label=f'{model_name} (AUC = {auc_value:.2f})')

# 5. Diagonal reference line (random classifier).
ax.plot([0, 1], [0, 1], color='grey', lw=2, linestyle='--')

# 6. Axes, labels and legend.
ax.set_xlim([0.0, 1.0])
ax.set_ylim([0.0, 1.05])
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.set_title('Receiver Operating Characteristic (ROC) Curve')
ax.legend(loc="lower right")
plt.show()

In [None]:
# 采样
from imblearn import over_sampling

In [None]:
# Oversampling experiment: for every (model, oversampler) pair, fit the model
# on the oversampled training split and draw its ROC curve on the test split.
samplers = {
    "random": over_sampling.RandomOverSampler(random_state=42),
    "smote": over_sampling.SMOTE(random_state=42),
    "adasyn": over_sampling.ADASYN(random_state=42),
    "SVMSMOTE": over_sampling.SVMSMOTE(random_state=42),
}

models = {
    'SVM': SVC(kernel='linear', probability=True, random_state=42),
    'RandomForest': RandomForestClassifier(random_state=42),
    'LogisticRegression': LogisticRegression(random_state=42),
    'xgboost': xgb.XGBClassifier(use_label_encoder=False, eval_metric='auc', random_state=42),
    'lightGBM': lgb.LGBMClassifier(random_state=42),
    'DecisionTree': DecisionTreeClassifier(random_state=42),
    'MLP': MLPClassifier(hidden_layer_sizes=(100, ),  # one hidden layer with 100 units
                          activation='relu',  # ReLU activation
                          solver='adam',  # Adam optimizer
                          alpha=0.0001,  # L2 regularization strength
                          max_iter=2000,  # maximum number of training iterations
                          random_state=42),
}

# Resample once per sampler, always starting from the ORIGINAL training split.
# The results go into a separate mapping so that X_train / y_train are never
# overwritten: reassigning them inside the loop would make every sampler work
# on the output of the previous one and would leave the notebook's training
# data permanently modified.
resampled_train = {
    sampler_name: sp.fit_resample(X_train, y_train)
    for sampler_name, sp in samplers.items()
}

plt.figure(figsize=(10, 10))
is_interpolate = False  # set to True to draw spline-smoothed ROC curves

# 4. Fit each model on each resampled training set and draw its ROC curve.
for model_name, model in models.items():
    for sampler_name, (X_tmp, y_tmp) in resampled_train.items():
        model.fit(X_tmp, y_tmp)
        # Column 1 of predict_proba is used as the positive-class score
        # (assumes a binary target — TODO confirm).
        y_probs = model.predict_proba(X_test)[:, 1]
        fpr, tpr, _ = roc_curve(y_test, y_probs)
        # Stored as `auc_value` so the `auc` function imported from
        # sklearn.metrics is not overwritten by a float.
        auc_value = roc_auc_score(y_test, y_probs)

        if is_interpolate:
            # Smooth the curve with a spline evaluated on 50 evenly spaced
            # false-positive rates; the reported AUC is still the exact one.
            spline = UnivariateSpline(fpr, tpr)
            fpr = np.linspace(fpr.min(), fpr.max(), 50)
            tpr = spline(fpr)

        plt.plot(fpr, tpr, lw=1.5, label=f'{model_name} + {sampler_name} (AUC = {auc_value:.2f})')

# 5. Diagonal reference line (random classifier).
plt.plot([0, 1], [0, 1], color='grey', lw=1.5, linestyle='--')

# 6. Axes, labels and legend.
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic (ROC) Curve')
plt.legend(loc="lower right")
plt.show()

# 归一化处理

In [None]:
from sklearn.preprocessing import MinMaxScaler, StandardScaler

In [None]:
# Reload the raw table, this time removing both the `num` and `G` columns.
p_data = pd.read_csv("p_data.csv").drop(columns=['num', 'G'])

# `PTA` contains object (string) data according to the original note; coerce
# it to numeric so that unparseable entries become NaN.
p_data['PTA'] = pd.to_numeric(p_data['PTA'], errors='coerce')

# Separate the rows by completeness instead of dropping the incomplete ones:
#   t_data - rows with no missing value
#   f_data - rows with at least one missing value
has_missing = p_data.isnull().any(axis=1)
t_data = p_data[~has_missing]
f_data = p_data[has_missing]

# The train/test split is drawn from the complete rows only, so the test
# split never contains missing values.
y = t_data['S']
X = t_data.drop(columns=['S'])
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

print("X_train.shape:", X_train.shape, ", X_test.shape:", X_test.shape)

In [None]:
# Add the rows that contain missing values (f_data) to the training split
# only; the test split is left untouched.
# NOTE: this cell is not idempotent - running it again appends f_data again.
incomplete_features = f_data.drop(columns=['S'])
incomplete_labels = f_data['S']
X_train = pd.concat([X_train, incomplete_features])
y_train = pd.concat([y_train, incomplete_labels])

print("(X_train.shape), (X_test.shape) after concat null columns", X_train.shape, X_test.shape)
X_train.head(1000)

In [None]:
# Fill the missing feature values of the training split with KNN imputation
# (5 nearest neighbours).
imputer = KNNImputer(n_neighbors=5)
X_train = pd.DataFrame(
    imputer.fit_transform(X_train),
    columns=X_train.columns,
    # Keep the original row labels so X_train stays aligned with y_train by
    # index as well as by position.
    index=X_train.index,
)

# The imputer is expected to fill every missing entry, so the former
# `fillna(<mean of PTA>)` step (which would have written the PTA mean into
# any column) is replaced by an explicit check.
assert not X_train.isna().any().any(), "KNN imputation left missing values in X_train"

X_train.head(1000)

In [None]:
# ROC comparison of every (model, sampler) pair on the KNN-imputed training
# split. The "no" entry is the baseline without resampling; its value (-1) is
# a placeholder and is never called.
samplers = {
    "no": -1,
    "random": over_sampling.RandomOverSampler(random_state=42),
    "smote": over_sampling.SMOTE(random_state=42),
    "adasyn": over_sampling.ADASYN(random_state=42),
    "SVMSMOTE": over_sampling.SVMSMOTE(random_state=42),
    "KMeansSMOTE": over_sampling.KMeansSMOTE(random_state=42),
    "BorderlineSMOTE": over_sampling.BorderlineSMOTE(random_state=42),
    # "SMOTENC": over_sampling.SMOTENC(random_state=42)
}

# NOTE(review): these scalers are defined but not applied anywhere in this
# cell - confirm whether feature scaling was meant to be part of the grid.
scalers = {
    "no": -1,
    "minmax": MinMaxScaler(),
    "standard": StandardScaler()
}

models = {
    'SVM': SVC(kernel='linear', probability=True, random_state=42),
    'RandomForest': RandomForestClassifier(random_state=42),
    'LogisticRegression': LogisticRegression(random_state=42),
    'xgboost': xgb.XGBClassifier(use_label_encoder=False, eval_metric='auc', random_state=42),
    # NOTE(review): differs from 'xgboost' only in `use_label_encoder` -
    # confirm this variant is still meaningful with the installed XGBoost.
    'xgboost2': xgb.XGBClassifier(use_label_encoder=True, eval_metric='auc', random_state=42),
    'lightGBM': lgb.LGBMClassifier(random_state=42),
    'DecisionTree': DecisionTreeClassifier(random_state=42),
    'MLP': MLPClassifier(hidden_layer_sizes=(100, ),  # one hidden layer with 100 units
                          activation='relu',  # ReLU activation
                          solver='adam',  # Adam optimizer
                          alpha=0.0001,  # L2 regularization strength
                          max_iter=10000,  # maximum number of training iterations
                          random_state=42),
}

# Resampling does not depend on the model, so do it once per sampler instead
# of once per (model, sampler) pair. Every sampler is seeded, so the resampled
# sets are the same as when they were recomputed inside the loop.
resampled_train = {
    sampler_name: (
        (X_train, y_train) if sampler_name == "no"
        else sp.fit_resample(X_train, y_train)
    )
    for sampler_name, sp in samplers.items()
}

plt.figure(figsize=(10, 10))

# 4. Fit each model on each training variant and draw its ROC curve.
for model_name, model in models.items():
    for sampler_name, (X_tmp, y_tmp) in resampled_train.items():
        model.fit(X_tmp, y_tmp)
        # Column 1 of predict_proba is used as the positive-class score
        # (assumes a binary target — TODO confirm).
        y_probs = model.predict_proba(X_test)[:, 1]
        fpr, tpr, _ = roc_curve(y_test, y_probs)
        # Stored as `auc_value` so the `auc` function imported from
        # sklearn.metrics is not overwritten by a float.
        auc_value = roc_auc_score(y_test, y_probs)

        plt.plot(fpr, tpr, lw=1.5, label=f'{model_name} + {sampler_name} (AUC = {auc_value:.4f})')

# 5. Diagonal reference line (random classifier).
plt.plot([0, 1], [0, 1], color='grey', lw=1.5, linestyle='--')

# 6. Axes, labels and legend.
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic (ROC) Curve')
plt.legend(loc="lower right")
plt.show()

### 筛选出lightGBM

In [None]:
import optuna

In [None]:
# Objective function for the Optuna hyperparameter search.
def objective(trial):
    """Train LightGBM with trial-suggested hyperparameters and return its AUC.

    Reads the notebook-level ``X_train`` / ``y_train``. Part of the training
    split is held out as a validation set and the AUC on that validation set
    is returned, so that ``X_test`` is never used for model selection and
    remains an unbiased final evaluation set.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample the hyperparameters.

    Returns
    -------
    float
        ROC AUC on the validation part of the training split (to be maximized).
    """
    param = {
        'objective': 'binary',
        'metric': 'auc',
        'verbosity': -1,
        'boosting_type': 'gbdt',
        'num_leaves': trial.suggest_int('num_leaves', 20, 100),
        'max_depth': trial.suggest_int('max_depth', 4, 15),
        # suggest_float(..., log=True) replaces the deprecated suggest_loguniform.
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3, log=True),
        'n_estimators': trial.suggest_int('n_estimators', 10, 1000),
        'min_child_samples': trial.suggest_int('min_child_samples', 5, 50),
        # suggest_float replaces the deprecated suggest_uniform.
        # NOTE(review): LightGBM documents that bagging (`subsample`) only
        # takes effect when `subsample_freq` / `bagging_freq` is non-zero,
        # which is not set here - confirm whether this dimension is useful.
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
        'reg_alpha': trial.suggest_float('reg_alpha', 1e-8, 1.0, log=True),
        'reg_lambda': trial.suggest_float('reg_lambda', 1e-8, 1.0, log=True),
    }

    # Fixed, stratified inner split: every trial is scored on the same
    # validation rows, and both classes are present in each part.
    X_fit, X_valid, y_fit, y_valid = train_test_split(
        X_train, y_train, test_size=0.3, random_state=42, stratify=y_train
    )

    dtrain = lgb.Dataset(X_fit, label=y_fit)
    dvalid = lgb.Dataset(X_valid, label=y_valid, reference=dtrain)

    # Train the model.
    gbm = lgb.train(param, dtrain, valid_sets=[dvalid])

    # Score the validation rows.
    y_pred = gbm.predict(X_valid, num_iteration=gbm.best_iteration)

    # AUC on the validation rows is the value Optuna maximizes.
    return roc_auc_score(y_valid, y_pred)

# Create the Optuna study. The sampler is seeded like every other stochastic
# component in this notebook; note that the `timeout` below can still make the
# number of completed trials vary between runs.
study = optuna.create_study(
    direction='maximize',
    sampler=optuna.samplers.TPESampler(seed=42),
)
# Stop after 1000 trials or 6000 seconds, whichever comes first.
study.optimize(objective, n_trials=1000, timeout=6000)

# Report the best trial and its hyperparameters.
print('Number of finished trials: ', len(study.trials))
print('Best trial:')
trial = study.best_trial

print('  Value: ', trial.value)

print('  Params: ')
for key, value in trial.params.items():
    print(f'    {key}: {value}')

In [None]:
# Show the best trial's `params` dictionary as the cell output.
trial.params

In [None]:
# Retrain LightGBM with the best hyperparameters found by the Optuna study on
# a randomly oversampled training split, then draw its ROC curve on the test
# split.
rs = over_sampling.RandomOverSampler(random_state=42)

X_tmp, y_tmp = rs.fit_resample(X_train, y_train)

# `trial.params` is the dictionary printed as "Params" by the study cell; the
# fixed settings used inside `objective` (binary objective, AUC metric, ...)
# are added back explicitly so this model is trained with the same objective
# that was used during tuning.
best_params = {
    'objective': 'binary',
    'metric': 'auc',
    'verbosity': -1,
    'boosting_type': 'gbdt',
    **trial.params,
}

dtrain = lgb.Dataset(X_tmp, label=y_tmp)
dvalid = lgb.Dataset(X_test, label=y_test, reference=dtrain)

# Train the model.
gbm = lgb.train(best_params, dtrain, valid_sets=[dvalid])

# Score the test split.
y_pred = gbm.predict(X_test, num_iteration=gbm.best_iteration)

# Evaluate THIS model's predictions (y_pred), not the leftover `y_probs` of
# whichever model was fitted last in an earlier cell.
fpr, tpr, _ = roc_curve(y_test, y_pred)
auc_value = roc_auc_score(y_test, y_pred)

fig, ax = plt.subplots()
ax.plot(fpr, tpr, lw=1.5, label=f'lightGBM (tuned) + random (AUC = {auc_value:.4f})')
# 5. Diagonal reference line (random classifier).
ax.plot([0, 1], [0, 1], color='grey', lw=1.5, linestyle='--')

# 6. Axes, labels and legend.
ax.set_xlim([0.0, 1.0])
ax.set_ylim([0.0, 1.05])
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.set_title('Receiver Operating Characteristic (ROC) Curve')
ax.legend(loc="lower right")
plt.show()

In [None]:
# Reload the raw table without the `num` and `G` columns and rebuild the
# train/test split from complete rows only (no imputation in this variant).
p_data = (
    pd.read_csv("p_data.csv")
    .drop(columns=['num', 'G'])
    # `PTA` contains object (string) data according to the original note;
    # coerce it to numeric so that unparseable entries become NaN.
    .assign(PTA=lambda frame: pd.to_numeric(frame['PTA'], errors='coerce'))
    # Discard every row that has at least one missing value.
    .dropna()
)

# Split into features / target (`S`) and hold out 30% of the rows for testing.
y = p_data['S']
X = p_data.drop(columns=['S'])
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

print("X_train.shape:", X_train.shape, ", X_test.shape:", X_test.shape)

In [18]:
# Collect the test AUC of every (model, sampler) pair into `aucs`, a dict that
# maps each sampler name to a list of AUCs ordered like `model_categories`.
# The "no" entry is the baseline without resampling; its value (-1) is a
# placeholder and is never called.
samplers = {
    "no": -1,
    "random": over_sampling.RandomOverSampler(random_state=42),
    "smote": over_sampling.SMOTE(random_state=42),
    "adasyn": over_sampling.ADASYN(random_state=42),
    "SVMSMOTE": over_sampling.SVMSMOTE(random_state=42),
    "KMeansSMOTE": over_sampling.KMeansSMOTE(random_state=42),
    "BorderlineSMOTE": over_sampling.BorderlineSMOTE(random_state=42),
    # "SMOTENC": over_sampling.SMOTENC(random_state=42)
}


models = {
    'SVM': SVC(kernel='linear', probability=True, random_state=42),
    'RandomForest': RandomForestClassifier(random_state=42),
    'LogisticRegression': LogisticRegression(random_state=42),
    'xgboost': xgb.XGBClassifier(use_label_encoder=False, eval_metric='auc', random_state=42),
    # NOTE(review): differs from 'xgboost' only in `use_label_encoder` -
    # confirm this variant is still meaningful with the installed XGBoost.
    'xgboost2': xgb.XGBClassifier(use_label_encoder=True, eval_metric='auc', random_state=42),
    'lightGBM': lgb.LGBMClassifier(random_state=42),
    'DecisionTree': DecisionTreeClassifier(random_state=42),
    'MLP': MLPClassifier(hidden_layer_sizes=(32, 16, 8),  # three hidden layers: 32, 16 and 8 units
                          activation='relu',  # ReLU activation
                          solver='adam',  # Adam optimizer
                          alpha=0.0001,  # L2 regularization strength
                          max_iter=10000,  # maximum number of training iterations
                          random_state=42),
}

model_categories = list(models)
aucs = {sampler_name: [] for sampler_name in samplers}

# Resampling does not depend on the model, so do it once per sampler instead
# of once per (model, sampler) pair. Every sampler is seeded, so the resampled
# sets are the same as when they were recomputed inside the loop.
resampled_train = {
    sampler_name: (
        (X_train, y_train) if sampler_name == "no"
        else sp.fit_resample(X_train, y_train)
    )
    for sampler_name, sp in samplers.items()
}

# 4. Fit each model on each training variant and record its test AUC.
for model_name, model in models.items():
    for sampler_name, (X_tmp, y_tmp) in resampled_train.items():
        model.fit(X_tmp, y_tmp)
        # Column 1 of predict_proba is used as the positive-class score
        # (assumes a binary target — TODO confirm).
        y_probs = model.predict_proba(X_test)[:, 1]
        # Stored as `auc_value` so the `auc` function imported from
        # sklearn.metrics is not overwritten by a float.
        auc_value = roc_auc_score(y_test, y_probs)

        aucs[sampler_name].append(auc_value)

X_train.shape: (182, 20) , X_test.shape: (77, 20)


In [19]:
# Show the collected AUCs: one list per sampler name, filled in model order.
aucs

Unnamed: 0,diagnosis,Gender,HBeAg,Age,ALB,Tbil,ALT,AST,GGT,ALP,...,lg.sAg,WBC,NEUT,HGB,PLT,PT,PTA,INR,APTT,AFP
0,1,0,1,25,40.6,8.6,46.2,25.2,13.8,59.0,...,4.245,4.75,2.49,136,242,14.1,88.00,1.08,35.8,3.18
1,0,1,1,30,48.0,13.2,54.0,30.0,17.0,64.0,...,4.166,5.50,3.30,161,246,12.3,114.00,0.92,38.3,1.50
2,0,1,1,29,48.0,17.6,252.0,96.0,45.0,97.0,...,4.398,10.10,5.30,182,299,14.4,82.00,1.13,38.6,4.52
3,1,0,1,49,38.0,7.7,31.0,22.0,26.0,76.0,...,2.853,7.20,4.50,126,183,13.5,93.00,1.04,36.1,1.80
4,0,1,1,27,47.0,23.6,657.0,309.0,432.0,96.0,...,4.868,6.00,3.50,145,211,13.5,93.00,1.04,40.0,5.60
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
177,1,1,0,45,45.2,22.7,74.9,52.9,137.1,109.7,...,3.343,3.50,2.10,160,77,12.6,81.40,1.09,29.3,3.28
178,1,1,1,28,41.5,12.1,24.6,25.6,64.4,162.3,...,2.634,5.40,2.70,135,159,13.6,98.91,1.07,35.2,44.03
179,1,0,1,65,36.2,23.9,272.0,58.6,155.9,181.9,...,2.398,4.14,2.12,148,309,11.9,126.00,0.88,34.5,9.12
180,0,1,0,39,45.0,10.3,70.0,24.0,33.0,133.0,...,3.767,7.20,4.50,156,222,12.7,112.00,0.94,36.6,3.40


In [None]:
# Grouped bar chart: one group per model, one bar per sampling method.
x = np.arange(len(model_categories))  # centre position of each model's group
num_groups = len(samplers)  # number of bars inside each group
bar_width = 0.1  # width of a single bar
# Shift that centres the whole group of bars on its x tick.
group_offset = (num_groups - 1) * bar_width / 2

fig, ax = plt.subplots(figsize=(15, 6))
for index, sampler_name in enumerate(samplers):
    ax.bar(x + index * bar_width - group_offset, aucs[sampler_name], bar_width, label=sampler_name)

# Labels, title and legend.
ax.set_xlabel('Model Categories')
ax.set_ylabel('AUC ')
ax.set_title('models with different sample methods')
ax.set_xticks(x)
ax.set_xticklabels(model_categories)
ax.legend()

# Render the figure.
plt.tight_layout()
plt.show()