In [2]:
import numpy as np
import pandas as pd
import os
import json
from datetime import datetime
from sklearn.model_selection import train_test_split, KFold, cross_val_score
from sklearn.metrics import make_scorer, recall_score, accuracy_score
from skopt import BayesSearchCV
from skopt.space import Real, Categorical, Integer
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier, XGBRFClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
import tqdm
import pickle

import warnings
warnings.filterwarnings('ignore')

# Define models and their hyperparameter search spaces
models = {
    'RandomForest': {
        'model': RandomForestClassifier(),
        'params': {
            'n_estimators': Integer(10, 100),
            'max_depth': Integer(3, 50),
            'min_samples_split': Integer(2, 100)
        }
    },
    'XGBoost': {
        'model': XGBClassifier(use_label_encoder=False, eval_metric='logloss'),
        'params': {
            'n_estimators': Integer(10, 100),
            'max_depth': Integer(3, 50),
            'learning_rate': Real(0.001, 1.0, 'log-uniform'),
            'subsample': Real(0.1, 1.0)
        }
    },
    'XGBRF': {
        'model': XGBRFClassifier(use_label_encoder=False, eval_metric='logloss'),
        'params': {
            'n_estimators': Integer(10, 100),
            'max_depth': Integer(3, 20),
            'learning_rate': Real(0.01, 1.0, 'log-uniform'),
            'subsample': Real(0.5, 1.0)
        }
    }
}

# Custom scorer for recall
recall_scorer = make_scorer(recall_score)

# Dataset
# with open('./reduced_data/X_boruta_cfs.pickle', 'rb') as handle:
#     X = pickle.load(handle)

columns = [100, 101, 102, 103, 105]

X = np.loadtxt('../data/x_train.txt', delimiter=' ')
X = X[:, columns]
y = np.loadtxt("../data/y_train.txt", delimiter=' ')




# Initialize k-fold cross-validation
kf = KFold(n_splits=5, shuffle=True, random_state=42)

best_results = {}

for name, model_info in models.items():
    model = model_info['model']
    params = model_info['params']
    
    # Bayesian optimization with cross-validation
    opt = BayesSearchCV(
        estimator=model,
        search_spaces=params,
        scoring=recall_scorer,
        cv=kf,
        n_iter=30,
        n_jobs=-1,
        random_state=42
    )
    
    opt.fit(X, y)
    
    best_model = opt.best_estimator_
    y_pred = best_model.predict(X)
    
    best_recall = recall_score(y, y_pred)
    best_accuracy = accuracy_score(y, y_pred)
    
    best_results[name] = {
        'best_score': opt.best_score_,
        'best_params': opt.best_params_,
        'recall': best_recall,
        'accuracy': best_accuracy
    }
    
    print(f"Model: {name}")
    print(f"Best Recall Score (CV): {opt.best_score_}")
    print(f"Best Params: {opt.best_params_}")
    print(f"Recall: {best_recall}")
    print(f"Accuracy: {best_accuracy}")
    print("-" * 30)

class NpEncoder(json.JSONEncoder):
    def default(self, obj):
        if isinstance(obj, np.integer):
            return int(obj)
        if isinstance(obj, np.floating):
            return float(obj)
        if isinstance(obj, np.ndarray):
            return obj.tolist()
        return super(NpEncoder, self).default(obj)

# Create a dictionary to hold information about the data
data_info = {
    'data_shape': X.shape,
    'columns/file': columns,
    'target_distribution': dict(zip(*np.unique(y, return_counts=True)))
}

# Combine data info with best results
combined_results = {'data_info': data_info, 'best_results': best_results}

# Create results directory if it doesn't exist
os.makedirs('results', exist_ok=True)

# Get current date and hour
current_time = datetime.now().strftime('%Y-%m-%d_%H-%M-%S')

# Define the filename
filename = f"results/results_{current_time}.json"

# Save the results to a file
with open(filename, 'w') as f:
    json.dump(combined_results, f, indent=4, cls=NpEncoder)

print(f"Results saved to {filename}")

Model: RandomForest
Best Recall Score (CV): 0.7183264552423497
Best Params: OrderedDict([('max_depth', 50), ('min_samples_split', 100), ('n_estimators', 100)])
Recall: 0.7584134615384616
Accuracy: 0.7344
------------------------------
Model: XGBoost
Best Recall Score (CV): 0.7072109723219584
Best Params: OrderedDict([('learning_rate', 0.003683838981460599), ('max_depth', 44), ('n_estimators', 100), ('subsample', 0.24080934678256175)])
Recall: 0.7852564102564102
Accuracy: 0.7654
------------------------------
Model: XGBRF
Best Recall Score (CV): 0.6856879930914901
Best Params: OrderedDict([('learning_rate', 0.3264176523920099), ('max_depth', 19), ('n_estimators', 73), ('subsample', 0.5)])
Recall: 0.8261217948717948
Accuracy: 0.835
------------------------------
Results saved to results/results_2024-05-29_15-13-36.json
