In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, KFold, cross_val_score
from sklearn.metrics import make_scorer, recall_score, accuracy_score
from skopt import BayesSearchCV
from skopt.space import Real, Categorical, Integer
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
import tqdm
import pickle

# Define models and their hyperparameter search spaces
models = {
    'RandomForest': {
        'model': RandomForestClassifier(),
        'params': {
            'n_estimators': Integer(10, 100),
            'max_depth': Integer(3, 20),
            'min_samples_split': Integer(2, 10)
        }
    },
    'XGBoost': {
        'model': XGBClassifier(use_label_encoder=False, eval_metric='logloss'),
        'params': {
            'n_estimators': Integer(10, 100),
            'max_depth': Integer(3, 20),
            'learning_rate': Real(0.01, 1.0, 'log-uniform'),
            'subsample': Real(0.5, 1.0)
        }
    },
    'LogisticRegression': {
        'model': LogisticRegression(max_iter=10000),
        'params': {
            'C': Real(1e-6, 1e+6, prior='log-uniform'),
            'penalty': Categorical(['l2']),
            'solver': Categorical(['lbfgs', 'saga'])
        }
    },
    'SVM': {
        'model': SVC(),
        'params': {
            'C': Real(1e-6, 1e+6, prior='log-uniform'),
            'kernel': Categorical(['linear', 'poly', 'rbf', 'sigmoid']),
            'gamma': Real(1e-6, 1e+1, prior='log-uniform')
        }
    },
    # 'NaiveBayes': {
    #     'model': GaussianNB(),
    #     'params': {
    #         'var_smoothing': Real(1e-9, 1e-7, prior='log-uniform')
    #     }
    # }
}

# Custom scorer for recall
recall_scorer = make_scorer(recall_score)

# Dataset
with open('./reduced_data/X_boruta_cfs.pickle', 'rb') as handle:
    X = pickle.load(handle)
    
# X = np.loadtxt('boruta_10.txt', delimiter=',')
y = np.loadtxt("../data/y_train.txt", delimiter=' ')

# Initialize k-fold cross-validation
kf = KFold(n_splits=5, shuffle=True, random_state=42)

best_results = {}

for name, model_info in models.items():
    model = model_info['model']
    params = model_info['params']
    
    # Bayesian optimization with cross-validation
    opt = BayesSearchCV(
        estimator=model,
        search_spaces=params,
        scoring=recall_scorer,
        cv=kf,
        n_iter=30,
        n_jobs=-1,
        random_state=42
    )
    
    opt.fit(X, y)
    
    best_model = opt.best_estimator_
    y_pred = best_model.predict(X)
    
    best_recall = recall_score(y, y_pred)
    best_accuracy = accuracy_score(y, y_pred)
    
    best_results[name] = {
        'best_score': opt.best_score_,
        'best_params': opt.best_params_,
        'recall': best_recall,
        'accuracy': best_accuracy
    }
    
    print(f"Model: {name}")
    print(f"Best Recall Score (CV): {opt.best_score_}")
    print(f"Best Params: {opt.best_params_}")
    print(f"Recall: {best_recall}")
    print(f"Accuracy: {best_accuracy}")
    print("-" * 30)


Model: RandomForest
Best Recall Score (CV): 0.640834482177938
Best Params: OrderedDict([('max_depth', 20), ('min_samples_split', 10), ('n_estimators', 59)])
Recall: 0.8549679487179487
Accuracy: 0.903
------------------------------
Model: XGBoost
Best Recall Score (CV): 0.6430681496477189
Best Params: OrderedDict([('learning_rate', 0.023318165476353517), ('max_depth', 20), ('n_estimators', 42), ('subsample', 0.5)])
Recall: 0.8301282051282052
Accuracy: 0.842
------------------------------
Model: LogisticRegression
Best Recall Score (CV): 0.4899514066549727
Best Params: OrderedDict([('C', 11185.625288472094), ('penalty', 'l2'), ('solver', 'lbfgs')])
Recall: 0.4907852564102564
Accuracy: 0.5138
------------------------------


KeyboardInterrupt: 