In [14]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.model_selection import cross_val_score, KFold
from collections import OrderedDict
import pickle
import json
from collections import OrderedDict
from xgboost import XGBRFClassifier
from sklearn.model_selection import KFold, cross_val_score
import numpy as np
import os
import json
from datetime import datetime
import random


## Data

In [15]:

# Zero-based indices of the feature columns kept from x_train.txt.
columns = [101, 102, 103, 105]

# Read the space-delimited training files and keep only the selected features.
# Alternative source, kept for reference:
# with open('./reduced_data/X_boruta_cfs.pickle', 'rb') as handle:
#     X = pickle.load(handle)
X = np.loadtxt("../data/x_train.txt", delimiter=' ')[:, columns]
y = np.loadtxt("../data/y_train.txt", delimiter=' ')

## Parameters

In [16]:
# Seed handed to the evaluation calls below, where it drives the shuffled
# K-fold split. Pinned to a constant; the commented-out line draws a fresh one.
# random_state = random.randint(0, 300)
random_state = 145
random_state

145

## Random Forest

In [17]:
def evaluate_and_save_random_forest(X, y, params=None, n_splits=10, random_state=42, scoring='recall', filename='random_forest_', columns=None):
    """Cross-validate a random forest and write the scores to a JSON file.

    Parameters
    ----------
    X, y
        Features and labels, handed unchanged to ``cross_val_score``.
    params : mapping, optional
        Must contain ``n_estimators``, ``max_depth`` and ``min_samples_split``.
        If ``None``, the defaults hard-coded below are used.
    n_splits : int
        Number of folds for the shuffled ``KFold`` splitter.
    random_state : int
        Seed for the ``KFold`` shuffling only; the forest itself is always
        constructed with ``random_state=42``.
    scoring : str
        Scoring name passed to ``cross_val_score``.
    filename : str
        Prefix of the output file name; a timestamp and ``.json`` are appended.
    columns : sequence, optional
        Feature-column identifiers to record in the results file. If ``None``,
        the notebook-level ``columns`` variable is recorded when it exists.

    The fold scores and their mean are printed and saved under
    ``results_best_models/``. Nothing is returned.
    """
    def _to_builtin(value):
        # json cannot encode NumPy scalars or arrays; convert them to plain
        # Python values and reject anything else exactly as json would.
        if isinstance(value, np.generic):
            return value.item()
        if isinstance(value, np.ndarray):
            return value.tolist()
        raise TypeError(f"Object of type {type(value).__name__} is not JSON serializable")

    # Default parameters if none are provided
    if params is None:
        params = OrderedDict(
            [('n_estimators', 59),
             ('max_depth', 20),
             ('min_samples_split', 10)])

    if columns is None:
        # Backward compatibility: earlier versions always recorded the
        # notebook-level `columns`, so fall back to it when it is defined.
        columns = globals().get('columns')

    # The model seed is fixed at 42 on purpose here; only the fold assignment
    # follows `random_state`.
    rf_model = RandomForestClassifier(
        n_estimators=params['n_estimators'],
        max_depth=params['max_depth'],
        min_samples_split=params['min_samples_split'],
        random_state=42
    )

    # Shuffled k-fold cross-validation
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=random_state)
    cv_scores = cross_val_score(rf_model, X, y, cv=kf, scoring=scoring)
    mean_cv_score = np.mean(cv_scores)

    # Everything needed to reproduce and compare the run
    results = {
        'columns': columns,
        'params': params,
        'n_splits': n_splits,
        'random_state': random_state,
        'scoring': scoring,
        'cv_scores': cv_scores.tolist(),
        'mean_cv_score': mean_cv_score
    }

    # Create results directory if it doesn't exist
    results_dir = 'results_best_models'
    os.makedirs(results_dir, exist_ok=True)

    # Timestamp (second resolution) keeps successive runs in separate files
    current_time = datetime.now().strftime('%Y-%m-%d_%H-%M-%S')
    output_path = results_dir + '/' + filename + current_time + '.json'

    with open(output_path, 'w') as file:
        json.dump(results, file, indent=4, default=_to_builtin)

    print("Cross-validation scores: ", cv_scores)
    print("Mean cross-validation score: ", mean_cv_score)
    # Report the full relative path, not just the bare file name
    print(f"Results saved to {output_path}")

# Evaluate the random forest on the loaded data with the notebook-wide seed.
evaluate_and_save_random_forest(X=X, y=y, random_state=random_state)


Cross-validation scores:  [0.628      0.61965812 0.60887097 0.68076923 0.61811024 0.61788618
 0.62601626 0.5936255  0.64150943 0.66528926]
Mean cross-validation score:  0.629973518158273
Results saved to random_forest_2024-05-29_16-08-06.json


## XGBoost Random Forest

In [18]:

def evaluate_and_save_xgbrf(X, y, params=None, n_splits=10, random_state=42, scoring='recall', filename='xgbrf_', columns=None):
    """Cross-validate an XGBoost random forest and write the scores to JSON.

    Parameters
    ----------
    X, y
        Features and labels, handed unchanged to ``cross_val_score``.
    params : mapping, optional
        Must contain ``learning_rate``, ``max_depth``, ``n_estimators`` and
        ``subsample``. If ``None``, the defaults hard-coded below are used.
    n_splits : int
        Number of folds for the shuffled ``KFold`` splitter.
    random_state : int
        Seed for the ``KFold`` shuffling only; no seed is passed to the model.
    scoring : str
        Scoring name passed to ``cross_val_score``.
    filename : str
        Prefix of the output file name; a timestamp and ``.json`` are appended.
    columns : sequence, optional
        Feature-column identifiers to record in the results file. If ``None``,
        the notebook-level ``columns`` variable is recorded when it exists.

    The fold scores and their mean are printed and saved under
    ``results_best_models/``. Nothing is returned.
    """
    def _to_builtin(value):
        # json cannot encode NumPy scalars or arrays; convert them to plain
        # Python values and reject anything else exactly as json would.
        if isinstance(value, np.generic):
            return value.item()
        if isinstance(value, np.ndarray):
            return value.tolist()
        raise TypeError(f"Object of type {type(value).__name__} is not JSON serializable")

    # Default parameters if none are provided
    if params is None:
        params = OrderedDict(
            [('learning_rate', 0.9149030789798307),
             ('max_depth', 17),
             ('n_estimators', 80),
             ('subsample', 0.5)]
        )

    if columns is None:
        # Backward compatibility: earlier versions always recorded the
        # notebook-level `columns`, so fall back to it when it is defined.
        columns = globals().get('columns')

    # NOTE(review): `use_label_encoder` appears to be deprecated in recent
    # XGBoost releases - confirm against the installed version before removing.
    xgbrf_model = XGBRFClassifier(
        learning_rate=params['learning_rate'],
        max_depth=params['max_depth'],
        n_estimators=params['n_estimators'],
        subsample=params['subsample'],
        use_label_encoder=False,
        eval_metric='logloss'
    )

    # Shuffled k-fold cross-validation
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=random_state)
    cv_scores = cross_val_score(xgbrf_model, X, y, cv=kf, scoring=scoring)
    mean_cv_score = np.mean(cv_scores)

    # Everything needed to reproduce and compare the run
    results = {
        'columns': columns,
        'params': params,
        'n_splits': n_splits,
        'random_state': random_state,
        'scoring': scoring,
        'cv_scores': cv_scores.tolist(),
        'mean_cv_score': mean_cv_score
    }

    # Create results directory if it doesn't exist
    results_dir = 'results_best_models'
    os.makedirs(results_dir, exist_ok=True)

    # Timestamp (second resolution) keeps successive runs in separate files
    current_time = datetime.now().strftime('%Y-%m-%d_%H-%M-%S')
    output_path = results_dir + '/' + filename + current_time + '.json'

    with open(output_path, 'w') as file:
        json.dump(results, file, indent=4, default=_to_builtin)

    print("Cross-validation scores: ", cv_scores)
    print("Mean cross-validation score: ", mean_cv_score)
    # Report the full relative path, not just the bare file name
    print(f"Results saved to {output_path}")

# Evaluate the XGBoost random forest on the loaded data with the notebook-wide seed.
evaluate_and_save_xgbrf(X=X, y=y, random_state=random_state)


Cross-validation scores:  [0.636      0.61538462 0.625      0.63461538 0.61811024 0.62601626
 0.62601626 0.59760956 0.63396226 0.64049587]
Mean cross-validation score:  0.6253210450218203
Results saved to xgbrf_2024-05-29_16-08-12.json


## XGBoost

In [19]:
def evaluate_and_save_xgboost(X, y, params=None, n_splits=10, random_state=42, scoring='recall', filename='xgboost_', columns=None):
    """Cross-validate an XGBoost classifier and write the scores to JSON.

    Parameters
    ----------
    X, y
        Features and labels, handed unchanged to ``cross_val_score``.
    params : mapping, optional
        Must contain ``learning_rate``, ``max_depth``, ``n_estimators`` and
        ``subsample``. If ``None``, the defaults hard-coded below are used.
    n_splits : int
        Number of folds for the shuffled ``KFold`` splitter.
    random_state : int
        Seed for the ``KFold`` shuffling only; no seed is passed to the model.
    scoring : str
        Scoring name passed to ``cross_val_score``.
    filename : str
        Prefix of the output file name; a timestamp and ``.json`` are appended.
    columns : sequence, optional
        Feature-column identifiers to record in the results file. If ``None``,
        the notebook-level ``columns`` variable is recorded when it exists.

    The fold scores and their mean are printed and saved under
    ``results_best_models/``. Nothing is returned.
    """
    def _to_builtin(value):
        # json cannot encode NumPy scalars or arrays; convert them to plain
        # Python values and reject anything else exactly as json would.
        if isinstance(value, np.generic):
            return value.item()
        if isinstance(value, np.ndarray):
            return value.tolist()
        raise TypeError(f"Object of type {type(value).__name__} is not JSON serializable")

    # Default parameters if none are provided
    if params is None:
        params = OrderedDict(
            [('learning_rate', 0.005169944690299296),
             ('max_depth', 40),
             ('n_estimators', 100),
             ('subsample', 0.1)]
        )

    if columns is None:
        # Backward compatibility: earlier versions always recorded the
        # notebook-level `columns`, so fall back to it when it is defined.
        columns = globals().get('columns')

    # Create the XGBoost classifier with the specified parameters.
    # NOTE(review): `use_label_encoder` appears to be deprecated in recent
    # XGBoost releases - confirm against the installed version before removing.
    xgb_model = XGBClassifier(
        learning_rate=params['learning_rate'],
        max_depth=params['max_depth'],
        n_estimators=params['n_estimators'],
        subsample=params['subsample'],
        use_label_encoder=False,
        eval_metric='logloss'
    )

    # Shuffled k-fold cross-validation
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=random_state)
    cv_scores = cross_val_score(xgb_model, X, y, cv=kf, scoring=scoring)
    mean_cv_score = np.mean(cv_scores)

    # Everything needed to reproduce and compare the run
    results = {
        'columns': columns,
        'params': params,
        'n_splits': n_splits,
        'random_state': random_state,
        'scoring': scoring,
        'cv_scores': cv_scores.tolist(),
        'mean_cv_score': mean_cv_score
    }

    # Create results directory if it doesn't exist
    results_dir = 'results_best_models'
    os.makedirs(results_dir, exist_ok=True)

    # Timestamp (second resolution) keeps successive runs in separate files
    current_time = datetime.now().strftime('%Y-%m-%d_%H-%M-%S')
    output_path = results_dir + '/' + filename + current_time + '.json'

    with open(output_path, 'w') as file:
        json.dump(results, file, indent=4, default=_to_builtin)

    print("Cross-validation scores: ", cv_scores)
    print("Mean cross-validation score: ", mean_cv_score)
    # Report the full relative path, not just the bare file name
    print(f"Results saved to {output_path}")

# Evaluate the XGBoost classifier on the loaded data with the notebook-wide seed.
evaluate_and_save_xgboost(X=X, y=y, random_state=random_state)

Cross-validation scores:  [0.684      0.64957265 0.64112903 0.69615385 0.66141732 0.66260163
 0.66260163 0.64143426 0.64150943 0.70661157]
Mean cross-validation score:  0.6647031370010132
Results saved to xgboost_2024-05-29_16-08-15.json


## **Trash but maybe useful later**

## General

In [20]:
import json
import os
from datetime import datetime
from sklearn.model_selection import KFold, cross_val_score
import numpy as np

def evaluate_and_save_model(X, y, model_class, params=None, n_splits=10, random_state=42, scoring='recall', filename_prefix='model_'):
    """Cross-validate any estimator class and write the scores to a JSON file.

    Parameters
    ----------
    X, y
        Features and labels, handed unchanged to ``cross_val_score``.
    model_class : type
        Estimator class; instantiated as ``model_class(**params)``, or with no
        arguments when ``params`` is ``None`` or empty.
    params : mapping, optional
        Constructor keyword arguments, also recorded in the results file.
    n_splits : int
        Number of folds for the shuffled ``KFold`` splitter.
    random_state : int
        Seed for the ``KFold`` shuffling; it is not forwarded to the model.
    scoring : str
        Scoring name passed to ``cross_val_score``.
    filename_prefix : str
        Prefix of the output file name; a timestamp and ``.json`` are appended.

    The fold scores and their mean are printed and saved under
    ``results_best_models/``. Nothing is returned.
    """
    def _to_builtin(value):
        # json cannot encode NumPy scalars or arrays; convert them to plain
        # Python values and reject anything else exactly as json would.
        if isinstance(value, np.generic):
            return value.item()
        if isinstance(value, np.ndarray):
            return value.tolist()
        raise TypeError(f"Object of type {type(value).__name__} is not JSON serializable")

    # A missing or empty params mapping means: construct with no arguments
    model = model_class(**params) if params else model_class()

    # Shuffled k-fold cross-validation
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=random_state)
    cv_scores = cross_val_score(model, X, y, cv=kf, scoring=scoring)
    mean_cv_score = np.mean(cv_scores)

    # Everything needed to reproduce and compare the run
    results = {
        'model_class': model_class.__name__,
        'params': params,
        'n_splits': n_splits,
        'random_state': random_state,
        'scoring': scoring,
        'cv_scores': cv_scores.tolist(),
        'mean_cv_score': mean_cv_score
    }

    # Create results directory if it doesn't exist
    os.makedirs('results_best_models', exist_ok=True)

    # Timestamp (second resolution) keeps successive runs in separate files
    current_time = datetime.now().strftime('%Y-%m-%d_%H-%M-%S')
    filename = filename_prefix + current_time + '.json'

    with open('results_best_models/' + filename, 'w') as file:
        json.dump(results, file, indent=4, default=_to_builtin)

    print("Cross-validation scores: ", cv_scores)
    print("Mean cross-validation score: ", mean_cv_score)
    print(f"Results saved to results_best_models/{filename}")

# Example usage:
# from sklearn.ensemble import RandomForestClassifier
# params = OrderedDict([
#     ('n_estimators', 100), 
#     ('max_depth', 10), 
#     ('min_samples_split', 5)
# ])
# evaluate_and_save_model(X, y, RandomForestClassifier, params)
