In [39]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.model_selection import cross_val_score, KFold
from collections import OrderedDict
import pickle
import json
from collections import OrderedDict
from xgboost import XGBRFClassifier
from sklearn.model_selection import KFold, cross_val_score
import numpy as np
import os
import json
from datetime import datetime
import random


## Data

In [56]:

# Load the training features and labels from space-delimited text files.
# Earlier alternative, kept for reference (reads a pickled feature matrix):
# with open('./reduced_data/X_boruta_cfs.pickle', 'rb') as handle:
#     X = pickle.load(handle)

# Indices of the feature columns to keep; the evaluate_and_save_* functions
# below also record this list in their results.
columns = [100, 101, 102, 103]
X = np.loadtxt("../data/x_train.txt", delimiter=' ')
X = X[:, columns]  # keep only the selected feature columns
y = np.loadtxt("../data/y_train.txt", delimiter=' ')

## Parameters

In [57]:
# Draw the seed that the evaluation calls below pass as `random_state`.
# random.randint includes both endpoints, so the value lies in 0..300.
# NOTE(review): no seed for `random` is set in the visible cells, so this value
# presumably changes between runs; it is echoed here and passed to the
# evaluation functions as `random_state`.
random_state = random.randint(0, 300)
random_state

263

## Random Forest

In [58]:
def evaluate_and_save_random_forest(X, y, params=None, n_splits=10, random_state=42, scoring='recall', filename='random_forest_', feature_columns=None):
    """Cross-validate a Random Forest classifier and save the scores as JSON.

    The results are written to ``results_best_models/<filename><timestamp>.json``
    and a short summary is printed. Nothing is returned.

    Parameters
    ----------
    X, y :
        Features and targets, forwarded unchanged to ``cross_val_score``.
    params : mapping, optional
        Must provide ``n_estimators``, ``max_depth`` and ``min_samples_split``.
        When omitted, the defaults hard-coded below are used.
    n_splits : int
        Number of folds of the shuffled ``KFold``.
    random_state : int
        Seed of the fold shuffling. The classifier itself is seeded separately
        (see the note in the body).
    scoring : str
        Scorer name forwarded to ``cross_val_score``.
    filename : str
        Prefix of the output file name; a timestamp and ``.json`` are appended.
    feature_columns : sequence, optional
        Column identifiers stored under the ``'columns'`` key of the results.
        When omitted, the notebook-level ``columns`` variable is used if it
        exists; otherwise ``null`` is stored instead of failing with a
        ``NameError`` after the cross-validation has already run.
    """
    # Default parameters if none are provided
    if params is None:
        params = OrderedDict([
            ('n_estimators', 59),
            ('max_depth', 20),
            ('min_samples_split', 10),
        ])

    # Create the Random Forest model with the specified parameters.
    # NOTE(review): the forest is always seeded with 42; only the fold shuffling
    # follows `random_state`. Confirm this is intended.
    rf_model = RandomForestClassifier(
        n_estimators=params['n_estimators'],
        max_depth=params['max_depth'],
        min_samples_split=params['min_samples_split'],
        random_state=42
    )

    # Set up k-fold cross-validation
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=random_state)

    # Perform cross-validation
    cv_scores = cross_val_score(rf_model, X, y, cv=kf, scoring=scoring)

    # Mean score as a plain Python float so it serialises without surprises
    mean_cv_score = float(np.mean(cv_scores))

    # Resolve the recorded feature columns without requiring a notebook global
    if feature_columns is None:
        feature_columns = globals().get('columns')
    if feature_columns is not None:
        # Converts arrays / numpy integers into JSON-friendly Python values
        feature_columns = np.asarray(feature_columns).tolist()

    # Prepare results to save
    results = {
        'columns': feature_columns,
        'params': params,
        'n_splits': n_splits,
        'random_state': random_state,
        'scoring': scoring,
        'cv_scores': cv_scores.tolist(),
        'mean_cv_score': mean_cv_score
    }

    # Serialise before touching the disk so that a serialisation error cannot
    # leave a truncated file behind.
    payload = json.dumps(results, indent=4)

    # Create results directory if it doesn't exist
    output_dir = 'results_best_models'
    os.makedirs(output_dir, exist_ok=True)

    # Timestamped name keeps successive runs from overwriting each other
    current_time = datetime.now().strftime('%Y-%m-%d_%H-%M-%S')
    output_path = os.path.join(output_dir, filename + current_time + '.json')

    # Save results to file
    with open(output_path, 'w') as file:
        file.write(payload)

    print("Cross-validation scores: ", cv_scores)
    print("Mean cross-validation score: ", mean_cv_score)
    # Report the full relative path, not just the bare file name
    print(f"Results saved to {output_path}")

# Evaluate the default Random Forest hyper-parameters (no `params` passed),
# shuffling the folds with the seed drawn in the parameters cell.
evaluate_and_save_random_forest(X, y, random_state=random_state)


Cross-validation scores:  [0.63035019 0.63865546 0.56854839 0.60869565 0.61445783 0.60251046
 0.68627451 0.61632653 0.61632653 0.57377049]
Mean cross-validation score:  0.6155916050416128
Results saved to random_forest_2024-05-29_15-41-01.json


## XGBoost Random Forest

In [59]:

def evaluate_and_save_xgbrf(X, y, params=None, n_splits=10, random_state=42, scoring='recall', filename='xgbrf_', feature_columns=None):
    """Cross-validate an XGBoost Random Forest classifier and save the scores as JSON.

    The results are written to ``results_best_models/<filename><timestamp>.json``
    and a short summary is printed. Nothing is returned.

    Parameters
    ----------
    X, y :
        Features and targets, forwarded unchanged to ``cross_val_score``.
    params : mapping, optional
        Must provide ``learning_rate``, ``max_depth``, ``n_estimators`` and
        ``subsample``. When omitted, the defaults hard-coded below are used.
    n_splits : int
        Number of folds of the shuffled ``KFold``.
    random_state : int
        Seed of the fold shuffling. It is not passed to the classifier.
    scoring : str
        Scorer name forwarded to ``cross_val_score``.
    filename : str
        Prefix of the output file name; a timestamp and ``.json`` are appended.
    feature_columns : sequence, optional
        Column identifiers stored under the ``'columns'`` key of the results.
        When omitted, the notebook-level ``columns`` variable is used if it
        exists; otherwise ``null`` is stored instead of failing with a
        ``NameError`` after the cross-validation has already run.
    """
    # Default parameters if none are provided
    if params is None:
        params = OrderedDict([
            ('learning_rate', 0.9149030789798307),
            ('max_depth', 17),
            ('n_estimators', 80),
            ('subsample', 0.5),
        ])

    # Create the XGBoost Random Forest model with the specified parameters
    xgbrf_model = XGBRFClassifier(
        learning_rate=params['learning_rate'],
        max_depth=params['max_depth'],
        n_estimators=params['n_estimators'],
        subsample=params['subsample'],
        use_label_encoder=False,
        eval_metric='logloss'
    )

    # Set up k-fold cross-validation
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=random_state)

    # Perform cross-validation
    cv_scores = cross_val_score(xgbrf_model, X, y, cv=kf, scoring=scoring)

    # Mean score as a plain Python float so it serialises without surprises
    mean_cv_score = float(np.mean(cv_scores))

    # Resolve the recorded feature columns without requiring a notebook global
    if feature_columns is None:
        feature_columns = globals().get('columns')
    if feature_columns is not None:
        # Converts arrays / numpy integers into JSON-friendly Python values
        feature_columns = np.asarray(feature_columns).tolist()

    # Prepare results to save
    results = {
        'columns': feature_columns,
        'params': params,
        'n_splits': n_splits,
        'random_state': random_state,
        'scoring': scoring,
        'cv_scores': cv_scores.tolist(),
        'mean_cv_score': mean_cv_score
    }

    # Serialise before touching the disk so that a serialisation error cannot
    # leave a truncated file behind.
    payload = json.dumps(results, indent=4)

    # Create results directory if it doesn't exist
    output_dir = 'results_best_models'
    os.makedirs(output_dir, exist_ok=True)

    # Timestamped name keeps successive runs from overwriting each other
    current_time = datetime.now().strftime('%Y-%m-%d_%H-%M-%S')
    output_path = os.path.join(output_dir, filename + current_time + '.json')

    # Save results to file
    with open(output_path, 'w') as file:
        file.write(payload)

    print("Cross-validation scores: ", cv_scores)
    print("Mean cross-validation score: ", mean_cv_score)
    # Report the full relative path, not just the bare file name
    print(f"Results saved to {output_path}")

# Evaluate the default XGBoost Random Forest hyper-parameters (no `params`
# passed), shuffling the folds with the seed drawn in the parameters cell.
evaluate_and_save_xgbrf(X, y, random_state=random_state)


Cross-validation scores:  [0.6381323  0.65546218 0.59677419 0.60507246 0.58232932 0.61506276
 0.65490196 0.62040816 0.62040816 0.58196721]
Mean cross-validation score:  0.617051871711533
Results saved to xgbrf_2024-05-29_15-41-09.json


## XGBoost

In [60]:
def evaluate_and_save_xgboost(X, y, params=None, n_splits=10, random_state=42, scoring='recall', filename='xgboost_', feature_columns=None):
    """Cross-validate an XGBoost classifier and save the scores as JSON.

    The results are written to ``results_best_models/<filename><timestamp>.json``
    and a short summary is printed. Nothing is returned.

    Parameters
    ----------
    X, y :
        Features and targets, forwarded unchanged to ``cross_val_score``.
    params : mapping, optional
        Must provide ``learning_rate``, ``max_depth``, ``n_estimators`` and
        ``subsample``. When omitted, the defaults hard-coded below are used.
    n_splits : int
        Number of folds of the shuffled ``KFold``.
    random_state : int
        Seed of the fold shuffling. It is not passed to the classifier.
    scoring : str
        Scorer name forwarded to ``cross_val_score``.
    filename : str
        Prefix of the output file name; a timestamp and ``.json`` are appended.
    feature_columns : sequence, optional
        Column identifiers stored under the ``'columns'`` key of the results.
        When omitted, the notebook-level ``columns`` variable is used if it
        exists; otherwise ``null`` is stored instead of failing with a
        ``NameError`` after the cross-validation has already run.
    """
    # Default parameters if none are provided
    if params is None:
        params = OrderedDict([
            ('learning_rate', 0.005169944690299296),
            ('max_depth', 40),
            ('n_estimators', 100),
            ('subsample', 0.1),
        ])

    # Create the XGBoost model with the specified parameters
    xgb_model = XGBClassifier(
        learning_rate=params['learning_rate'],
        max_depth=params['max_depth'],
        n_estimators=params['n_estimators'],
        subsample=params['subsample'],
        use_label_encoder=False,
        eval_metric='logloss'
    )

    # Set up k-fold cross-validation
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=random_state)

    # Perform cross-validation
    cv_scores = cross_val_score(xgb_model, X, y, cv=kf, scoring=scoring)

    # Mean score as a plain Python float so it serialises without surprises
    mean_cv_score = float(np.mean(cv_scores))

    # Resolve the recorded feature columns without requiring a notebook global
    if feature_columns is None:
        feature_columns = globals().get('columns')
    if feature_columns is not None:
        # Converts arrays / numpy integers into JSON-friendly Python values
        feature_columns = np.asarray(feature_columns).tolist()

    # Prepare results to save
    results = {
        'columns': feature_columns,
        'params': params,
        'n_splits': n_splits,
        'random_state': random_state,
        'scoring': scoring,
        'cv_scores': cv_scores.tolist(),
        'mean_cv_score': mean_cv_score
    }

    # Serialise before touching the disk so that a serialisation error cannot
    # leave a truncated file behind.
    payload = json.dumps(results, indent=4)

    # Create results directory if it doesn't exist
    output_dir = 'results_best_models'
    os.makedirs(output_dir, exist_ok=True)

    # Timestamped name keeps successive runs from overwriting each other
    current_time = datetime.now().strftime('%Y-%m-%d_%H-%M-%S')
    output_path = os.path.join(output_dir, filename + current_time + '.json')

    # Save results to file
    with open(output_path, 'w') as file:
        file.write(payload)

    print("Cross-validation scores: ", cv_scores)
    print("Mean cross-validation score: ", mean_cv_score)
    # Report the full relative path, not just the bare file name
    print(f"Results saved to {output_path}")

# Evaluate the default XGBoost hyper-parameters (no `params` passed),
# shuffling the folds with the seed drawn in the parameters cell.
evaluate_and_save_xgboost(X, y, random_state=random_state)

Cross-validation scores:  [0.70428016 0.68487395 0.63709677 0.63405797 0.65863454 0.67782427
 0.73333333 0.67346939 0.68979592 0.63114754]
Mean cross-validation score:  0.6724513836804322
Results saved to xgboost_2024-05-29_15-41-15.json


## **Trash but maybe useful later**

## General

In [45]:
import json
import os
from datetime import datetime
from sklearn.model_selection import KFold, cross_val_score
import numpy as np

def evaluate_and_save_model(X, y, model_class, params=None, n_splits=10, random_state=42, scoring='recall', filename_prefix='model_'):
    """Cross-validate any estimator class and store the outcome as JSON.

    ``model_class`` is instantiated with ``**params`` (or with no arguments
    when ``params`` is empty or ``None``) and scored with ``cross_val_score``
    on a shuffled ``KFold`` seeded by ``random_state``. The per-fold scores,
    their mean and the run settings are written to
    ``results_best_models/<filename_prefix><timestamp>.json`` and a summary is
    printed. Nothing is returned.
    """
    # Falsy params (None or an empty mapping) fall back to the class defaults.
    if params:
        estimator = model_class(**params)
    else:
        estimator = model_class()

    # Shuffled k-fold splitter and the resulting per-fold scores
    folds = KFold(n_splits=n_splits, shuffle=True, random_state=random_state)
    fold_scores = cross_val_score(estimator, X, y, cv=folds, scoring=scoring)
    average_score = np.mean(fold_scores)

    # Everything needed to identify and reproduce the run
    report = dict(
        model_class=model_class.__name__,
        params=params,
        n_splits=n_splits,
        random_state=random_state,
        scoring=scoring,
        cv_scores=fold_scores.tolist(),
        mean_cv_score=average_score,
    )

    # Make sure the output directory is there before writing
    os.makedirs('results_best_models', exist_ok=True)

    # File name: prefix + second-resolution timestamp + extension
    stamp = datetime.now().strftime('%Y-%m-%d_%H-%M-%S')
    filename = ''.join([filename_prefix, stamp, '.json'])

    with open(f"results_best_models/{filename}", 'w') as sink:
        json.dump(report, sink, indent=4)

    print("Cross-validation scores: ", fold_scores)
    print("Mean cross-validation score: ", average_score)
    print(f"Results saved to results_best_models/{filename}")

# Example usage:
# from sklearn.ensemble import RandomForestClassifier
# params = OrderedDict([
#     ('n_estimators', 100), 
#     ('max_depth', 10), 
#     ('min_samples_split', 5)
# ])
# evaluate_and_save_model(X, y, RandomForestClassifier, params)
