In [2]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, KFold, cross_val_score
from sklearn.metrics import make_scorer, recall_score
from skopt import BayesSearchCV
from skopt.space import Real, Categorical, Integer
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
import tqdm
# Define models and their hyperparameter search spaces
models = {
    'RandomForest': {
        'model': RandomForestClassifier(),
        'params': {
            'n_estimators': Integer(10, 100),
            'max_depth': Integer(3, 20),
            'min_samples_split': Integer(2, 10)
        }
    },
    'XGBoost': {
        'model': XGBClassifier(use_label_encoder=False, eval_metric='logloss'),
        'params': {
            'n_estimators': Integer(10, 100),
            'max_depth': Integer(3, 20),
            'learning_rate': Real(0.01, 1.0, 'log-uniform'),
            'subsample': Real(0.5, 1.0)
        }
    },
    'LogisticRegression': {
        'model': LogisticRegression(max_iter=10000),
        'params': {
            'C': Real(1e-6, 1e+6, prior='log-uniform'),
            'penalty': Categorical(['l2']),
            'solver': Categorical(['lbfgs', 'saga'])
        }
    },
    'SVM': {
        'model': SVC(),
        'params': {
            'C': Real(1e-6, 1e+6, prior='log-uniform'),
            'kernel': Categorical(['linear', 'poly', 'rbf', 'sigmoid']),
            'gamma': Real(1e-6, 1e+1, prior='log-uniform')
        }
    },
    'NaiveBayes': {
        'model': GaussianNB(),
        'params': {
            'var_smoothing': Real(1e-9, 1e-7, prior='log-uniform')
        }
    }
}

# Custom scorer for recall
recall_scorer = make_scorer(recall_score)

# Dummy dataset (replace with your actual dataset)
X = np.loadtxt('boruta_10.txt', delimiter=',')
y = np.loadtxt("../data/y_train.txt", delimiter=' ')

# Initialize k-fold cross-validation
kf = KFold(n_splits=5, shuffle=True, random_state=42)

best_results = {}

for name, model_info in models.items():
    model = model_info['model']
    params = model_info['params']
    
    # Bayesian optimization with cross-validation
    opt = BayesSearchCV(
        estimator=model,
        search_spaces=params,
        scoring=recall_scorer,
        cv=kf,
        n_iter=30,
        n_jobs=-1,
        random_state=42
    )
    
    opt.fit(X, y)
    
    best_results[name] = {
        'best_score': opt.best_score_,
        'best_params': opt.best_params_
    }
    
    print(f"Model: {name}")
    print(f"Best Recall Score: {opt.best_score_}")
    print(f"Best Params: {opt.best_params_}")
    print("-" * 30)

# Print overall best results
print("\nOverall Best Results:")
for model_name, result in best_results.items():
    print(f"Model: {model_name}")
    print(f"Best Recall Score: {result['best_score']}")
    print(f"Best Params: {result['best_params']}")
    print("-" * 30)




Model: RandomForest
Best Recall Score: 0.7205707305583529
Best Params: OrderedDict([('max_depth', 20), ('min_samples_split', 10), ('n_estimators', 99)])
------------------------------
Model: XGBoost
Best Recall Score: 0.7149460809219426
Best Params: OrderedDict([('learning_rate', 0.01), ('max_depth', 20), ('n_estimators', 67), ('subsample', 0.5)])
------------------------------
Model: LogisticRegression
Best Recall Score: 0.4901562975032891
Best Params: OrderedDict([('C', 3932.2516133086), ('penalty', 'l2'), ('solver', 'saga')])
------------------------------
