In [1]:
# 导入所需的库
import sklearn
import pandas as pd
import numpy as np
import os
import sys
import pickle

# 导入模型
from sklearn.svm import SVC  # 支持向量机
from sklearn.linear_model import LogisticRegression  # 逻辑回归
from sklearn.tree import DecisionTreeClassifier  # 决策树分类器
from sklearn.neural_network import MLPClassifier  # 多层感知机（神经网络）
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
import xgboost as xgb
from xgboost import XGBClassifier  # 导入 XGBClassifier

from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report

# Support Vector machine

In [2]:
with open("dis_data_df.pkl", "rb") as f:
    data_dict = pickle.load(f)

data_dict = {k:v[np.load('infection_fea_619.npy')] for k,v in data_dict.items()}
for k,v in data_dict.items():
    v['label'] = k
dataset = pd.concat(data_dict.values())

# 步骤 1: 标签转换 (如果标签是字符串，则需要转换为数值)
label_encoder = LabelEncoder()
dataset['label'] = label_encoder.fit_transform(dataset['label'])

# 步骤 2: 切分特征和标签
X = dataset.drop('label', axis=1)  # 特征部分
y = dataset['label']  # 标签部分

# 使用 sklearn 工具将数据集分为训练集和测试集
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=42)

In [3]:
svm_model = SVC()

# 定义超参数搜索的参数网格
param_grid = {
    'C': [0.1, 0.5, 1],  # 惩罚参数
    'kernel': ['linear', 'rbf','poly'],  # 核函数类型
    'gamma': ['scale', 'auto'],  # 核函数的系数
    'class_weight':['balanced'],
    'shrinking':[True],
    'probability':[True],
    'cache_size':[2048],
    'max_iter':[1000],
    'tol':[1e-4]
}

# 使用 GridSearchCV 进行网格搜索
grid_search = GridSearchCV(estimator=svm_model, param_grid=param_grid, cv=5, scoring='accuracy')

# 训练模型
grid_search.fit(X_train, y_train)

# 获取最佳模型
best_model = grid_search.best_estimator_

# 打印最好的参数
print("Best Parameters:\n", grid_search.best_params_)

# 在测试集上评估效果
y_pred = best_model.predict(X_test)

# 输出每一类的准确度
report = classification_report(y_test, y_pred)
print("Classification Report:\n", report)



Best Parameters:
 {'C': 0.5, 'cache_size': 2048, 'class_weight': 'balanced', 'gamma': 'scale', 'kernel': 'poly', 'max_iter': 1000, 'probability': True, 'shrinking': True, 'tol': 0.0001}
Classification Report:
               precision    recall  f1-score   support

           0       0.96      0.96      0.96        49
           1       0.99      1.00      1.00       225
           2       1.00      0.98      0.99        84
           3       0.99      1.00      1.00       156
           4       1.00      1.00      1.00        46
           5       1.00      1.00      1.00       167
           6       1.00      1.00      1.00       217
           7       1.00      0.99      1.00       200
           8       0.99      0.99      0.99       137

    accuracy                           0.99      1281
   macro avg       0.99      0.99      0.99      1281
weighted avg       0.99      0.99      0.99      1281





# LogisticRegression

In [4]:
log_reg_model = LogisticRegression()

# 定义参数网格
param_grid = {
    'penalty': ['l1', 'l2', 'elasticnet'],  # 正则化类型
    'C': [0.01, 0.1, 1,],                   # 正则化强度
    'solver': ['liblinear', 'lbfgs', 'saga'],       # 求解器类型
    'max_iter': [1000],                # 最大迭代次数
    'multi_class': ['auto', 'ovr', 'multinomial'],  # 多类处理方式
    'class_weight': ['balanced'],             # 类别权重
    'tol': [1e-4],                      # 收敛容忍度
    'warm_start': [True, False],                    # 热启动
    'n_jobs': [-1]                         # CPU 核心数量
}

# 使用 GridSearchCV 进行网格搜索
grid_search = GridSearchCV(estimator=log_reg_model, param_grid=param_grid, cv=5, scoring='accuracy')

# 训练模型
grid_search.fit(X_train, y_train)

# 获取最佳模型
best_model = grid_search.best_estimator_

# 打印最好的参数
print("Best Parameters:\n", grid_search.best_params_)

# 在测试集上评估效果
y_pred = best_model.predict(X_test)

# 输出每一类的准确度
report = classification_report(y_test, y_pred)
print("Classification Report:\n", report)

420 fits failed out of a total of 810.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
90 fits failed with the following error:
Traceback (most recent call last):
  File "D:\Anaconda\envs\bioinfo\lib\site-packages\sklearn\model_selection\_validation.py", line 866, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "D:\Anaconda\envs\bioinfo\lib\site-packages\sklearn\base.py", line 1389, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "D:\Anaconda\envs\bioinfo\lib\site-packages\sklearn\linear_model\_logistic.py", line 1193, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "D:\Anaconda\envs\bioinfo\lib\site-packages\sklearn\linear_model\_logistic.py", line 63, in _che

Best Parameters:
 {'C': 1, 'class_weight': 'balanced', 'max_iter': 1000, 'multi_class': 'ovr', 'n_jobs': -1, 'penalty': 'l2', 'solver': 'lbfgs', 'tol': 0.0001, 'warm_start': True}
Classification Report:
               precision    recall  f1-score   support

           0       0.94      0.98      0.96        49
           1       0.98      0.95      0.96       225
           2       0.96      0.96      0.96        84
           3       0.99      0.99      0.99       156
           4       1.00      1.00      1.00        46
           5       0.99      0.98      0.98       167
           6       0.97      0.99      0.98       217
           7       1.00      0.99      1.00       200
           8       0.94      0.97      0.95       137

    accuracy                           0.98      1281
   macro avg       0.97      0.98      0.98      1281
weighted avg       0.98      0.98      0.98      1281



# XGBOOST

In [None]:
# 定义超参数网格
param_grid = {
    'n_estimators': [50, 100, 200],         # 弱学习器的个数
    'max_depth': [3, 5, 7],                 # 树的最大深度
    'learning_rate': [0.01, 0.05, 0.1],     # 学习率
    'subsample': [0.8],                     # 随机选择样本的比例
    'colsample_bytree': [0.8],              # 每棵树训练时随机选择的特征的比例
    'gamma': [0, 0.1, 0.2],                 # 在节点分裂时，指定一个阈值
    'reg_alpha': [0, 0.1, 1],               # L1 正则化系数
    'reg_lambda': [1, 1.5, 2],              # L2 正则化系数
    'scale_pos_weight': [1, 2],             # 处理类别不平衡
    'objective': ['multi:softmax'],         # 多分类任务（softmax）
    'num_class': [9],                       # 类别数，设置为目标类别的数量
    'eval_metric': ['mlogloss'],            # 多分类的评估指标：多类对数损失
    'tree_method': ['gpu_hist'],            # 使用 GPU 加速
    'predictor': ['gpu_predictor'],         # 使用 GPU 预测
}

# 使用 XGBClassifier
xgb_model = XGBClassifier()

# 使用 GridSearchCV 进行网格搜索
grid_search = GridSearchCV(estimator=xgb_model, param_grid=param_grid, cv=5, scoring='accuracy')

# 训练模型
grid_search.fit(X_train, y_train)

# 获取最佳模型
best_model = grid_search.best_estimator_

# 打印最好的参数
print("Best Parameters:\n", grid_search.best_params_)

# 在测试集上评估效果
y_pred = best_model.predict(X_test)

# 输出每一类的准确度
report = classification_report(y_test, y_pred)
print("Classification Report:\n", report)


    E.g. tree_method = "hist", device = "cuda"

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "predictor", "scale_pos_weight" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)

    E.g. tree_method = "hist", device = "cuda"

  if len(data.shape) != 1 and self.num_features() != data.shape[1]:
Potential solutions:
- Use a data structure that matches the device ordinal in the booster.
- Set the device for booster before call to inplace_predict.


  return func(**kwargs)

    E.g. tree_method = "hist", device = "cuda"

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "predictor", "scale_pos_weight" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)

    E.g. tree_method = "hist", device = "cuda"

  if len(data.shape) != 1 and self.num_features() != data.shape[1]:

    E.g. tree_method = "hist", device = "cuda"

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "predictor", "scale_pos_weight" } are not used.

  bst.update(dtrain, iteration=