In [5]:
from AMPpred_MFA.lib.Data import *
from AMPpred_MFA.lib.Visualization import colorful_print, current_time, draw_roc
from AMPpred_MFA.lib.Encoding import AAC
from AMPpred_MFA.lib.Visualization import *
from sklearn.ensemble import (
    RandomForestClassifier,
    AdaBoostClassifier,
    StackingClassifier,
)
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
import numpy as np
import matplotlib.pyplot as plt
import joblib
import math
from xgboost import XGBClassifier

# Count physical cores only (only_physical_cores=True); this value is passed
# as n_jobs to the RandomForest and XGBoost estimators in the models cell.
CPU_NUM_CORES = joblib.cpu_count(only_physical_cores=True)
print(f"Number of physical cores: {CPU_NUM_CORES}")

Number of physical cores: 14


In [6]:
# Dataset locations, relative to the notebook's working directory.
# Going by the file names: pos/neg are the AMP / non-AMP FASTA sets and
# train/test are the first-trial split. Only file_path_train is read in the
# cells visible here — NOTE(review): confirm the others are used further down.
file_path_pos = './dataset/our_dataset/amps.fasta'
file_path_neg = './dataset/our_dataset/non_amps.fasta'
file_path_train = './dataset/train/1_trial/train.fasta'
file_path_test = './dataset/test/our_testset/1_trial/test.fasta'

# Grid search with GridSearchCV
def k_fold_grid_search(model, param_grid, X_train, y_train, k_fold=5,
                       scoring='accuracy', n_jobs=None, verbose=1):
    """Run a cross-validated grid search and return the fitted search object.

    Args:
        model: Estimator handed to ``GridSearchCV`` as ``estimator``.
        param_grid: Hyper-parameter grid handed to ``GridSearchCV``.
        X_train: Training features passed to ``fit``.
        y_train: Training labels passed to ``fit``.
        k_fold: Value forwarded as ``cv`` (default 5).
        scoring: Metric used to rank candidates. Defaults to ``'accuracy'``,
            the value that was previously hard-coded.
        n_jobs: Number of parallel jobs for the search itself. ``None``
            (the default) leaves ``GridSearchCV`` at its own default. Be
            careful when the estimator already sets its own ``n_jobs``, as
            the two levels of parallelism multiply.
        verbose: Verbosity forwarded to ``GridSearchCV`` (default 1, the
            value that was previously hard-coded).

    Returns:
        The ``GridSearchCV`` instance after ``fit`` has been called on it.
    """
    grid_search = GridSearchCV(estimator=model,
                               param_grid=param_grid,
                               scoring=scoring,
                               cv=k_fold,
                               n_jobs=n_jobs,
                               verbose=verbose)
    grid_search.fit(X_train, y_train)
    return grid_search

# Build the training features/labels from the training FASTA file, using AAC
# as the feature function.
# NOTE(review): build_dataset_from_format is not imported by name; presumably
# it comes from the wildcard import of AMPpred_MFA.lib.Data — confirm.
X_train, y_train = build_dataset_from_format(file_path_train,
                                             feature_function=AAC)

In [7]:
# Display names; they double as the keys of `models` and `params_grid`.
DT = "Decision Tree"
RF = 'Random Forest'
ADABOOST = 'AdaBoost'
XGBOOST = 'XGBoost'
STACKING = 'Stacking'

# Fixed seed for every stochastic estimator so that the grid-search results
# are reproducible after Restart Kernel -> Run All.
RANDOM_STATE = 42

# Base estimators to tune. RF and XGBoost train in parallel on the physical cores.
models = {
    DT: DecisionTreeClassifier(random_state=RANDOM_STATE),
    RF: RandomForestClassifier(n_jobs=CPU_NUM_CORES,
                               random_state=RANDOM_STATE),
    ADABOOST: AdaBoostClassifier(random_state=RANDOM_STATE),
    XGBOOST: XGBClassifier(n_jobs=CPU_NUM_CORES,
                           random_state=RANDOM_STATE),
}

# Hyper-parameter search space per model, keyed by the same display names
# that key `models`.
params_grid = {
    DT: dict(
        criterion=["gini", "entropy"],
        max_depth=[6, 10, 20, None],
        min_samples_split=[2, 5, 10],
        min_samples_leaf=[1, 2, 4],
        max_features=["sqrt", "log2", None],
    ),
    RF: dict(
        n_estimators=[100, 200, 300],
        criterion=['gini', 'entropy'],
        max_depth=[6, 10, 20, None],
        min_samples_split=[2, 5, 10],
        min_samples_leaf=[1, 2, 4],
        max_features=['sqrt', 'log2', None],
    ),
    # NOTE(review): 'SAMME.R' is deprecated in newer scikit-learn releases;
    # confirm it is still accepted by the installed version.
    ADABOOST: dict(
        n_estimators=[100, 150, 200, 300],
        algorithm=['SAMME', 'SAMME.R'],
        learning_rate=[0.005, 0.01, 0.05, 0.1, 0.5],
    ),
    XGBOOST: dict(
        n_estimators=[100, 200, 300],
        gamma=[0, 0.5, 1],
        min_child_weight=[1, 3, 5, 10],
        max_depth=[6, 10, 20],
        subsample=[0.6, 0.8, 1],
        colsample_bytree=[0.6, 0.8, 1],
        learning_rate=[0.005, 0.01, 0.05, 0.1, 0.5],
    ),
}

In [8]:
# Tune each model on the training set with 3-fold CV and keep the fitted
# search objects, keyed by model name.
grids_search = {}
for name, model in models.items():
    param_grid = params_grid[name]
    grids_search[name] = grid_search = k_fold_grid_search(
        model, param_grid, X_train, y_train, k_fold=3)
    print("{}'s best param: {}".format(name, grid_search.best_params_))


Fitting 3 folds for each of 216 candidates, totalling 648 fits
Decision Tree's best param: {'criterion': 'gini', 'max_depth': 6, 'max_features': None, 'min_samples_leaf': 4, 'min_samples_split': 10}
Fitting 3 folds for each of 648 candidates, totalling 1944 fits
Random Forest's best param: {'criterion': 'entropy', 'max_depth': None, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 300}
Fitting 3 folds for each of 40 candidates, totalling 120 fits
AdaBoost's best param: {'algorithm': 'SAMME.R', 'learning_rate': 0.1, 'n_estimators': 300}
Fitting 3 folds for each of 4860 candidates, totalling 14580 fits
XGBoost's best param: {'colsample_bytree': 0.6, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 10, 'min_child_weight': 1, 'n_estimators': 300, 'subsample': 1}
