In [13]:
from collections import OrderedDict
from datetime import datetime
import json
import os

from sklearn.model_selection import KFold, cross_val_score
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBRFClassifier, XGBClassifier
import numpy as np

from tools import calculate_money


### Data

In [40]:
# Create a synthetic dataset
# with open('./reduced_data/X_boruta_cfs.pickle', 'rb') as handle:
#     X = pickle.load(handle)

columns = [103, 105]
X = np.loadtxt("../data/x_train.txt", delimiter=' ')
X = X[:, columns]
y = np.loadtxt("../data/y_train.txt", delimiter=' ')

folder_name = 'money_results_best_models/'


## Params

In [41]:
# random_state = random.randint(0, 300)
random_state = 145
random_state


145

## Random Forest

In [42]:
def evaluate_and_save_random_forest(
        X, y, params=None, n_splits=10, random_state=42,
        scoring='recall', filename='random_forest_'
        ) -> None:

    # Default parameters if none are provided
    if params is None:
    # Parameters for the Random Forest
        params = OrderedDict(
            [('n_estimators', 59), 
            ('max_depth', 20), 
            ('min_samples_split', 10)])

    # Create the Random Forest model with the specified parameters
    rf_model = RandomForestClassifier(
        n_estimators=params['n_estimators'],
        max_depth=params['max_depth'],
        min_samples_split=params['min_samples_split'],
        random_state=42
    )

    # Set up k-fold cross-validation
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=random_state)

    # Perform cross-validation
    cv_scores = cross_val_score(rf_model, X, y, cv=kf, scoring=scoring)

    # Calculate mean cross-validation score
    mean_cv_score = np.mean(cv_scores)

    money = calculate_money(columns, n=10, model_name="rf", model_params=params)

    # Prepare results to save
    results = {
        'columns': columns,
        'params': params,
        'n_splits': n_splits,
        'random_state': random_state,
        'scoring': scoring,
        'cv_scores': cv_scores.tolist(),
        'mean_cv_score': mean_cv_score,
        'money': money,
    }
    
    # Create results directory if it doesn't exist
    os.makedirs(folder_name, exist_ok=True)

    # Get current date and hour
    current_time = datetime.now().strftime('%Y-%m-%d_%H-%M-%S')

    filename = filename + current_time + '.json'
    
    # Save results to file
    with open(folder_name + filename, 'w') as file:
        json.dump(results, file, indent=4)
    
    print("Cross-validation scores: ", cv_scores)
    print("Mean cross-validation score: ", mean_cv_score)
    print(f"Money: {money}")
    print(f"Results saved to {filename}")


# Example usage:
evaluate_and_save_random_forest(X, y, random_state=random_state)


Cross-validation scores:  [0.54       0.5        0.45967742 0.52692308 0.49212598 0.47560976
 0.51219512 0.51792829 0.50188679 0.50413223]
Mean cross-validation score:  0.5030478669289044
Money: 6015.0
Results saved to random_forest_2024-05-31_11-53-54.json


## XGBoost Random Forest

In [43]:
def evaluate_and_save_xgbrf(
        X, y, params=None, n_splits=10, random_state=42,
        scoring='recall', filename='xgbrf_'
        ) -> None:
    # Default parameters if none are provided
    if params is None:
        params = OrderedDict(
            [('learning_rate', 0.9149030789798307), 
             ('max_depth', 17), 
             ('n_estimators', 80), 
             ('subsample', 0.5)]
        )
    
    # Create the XGBoost Random Forest model with the specified parameters
    xgbrf_model = XGBRFClassifier(
        learning_rate=params['learning_rate'],
        max_depth=params['max_depth'],
        n_estimators=params['n_estimators'],
        subsample=params['subsample'],
        use_label_encoder=False, 
        eval_metric='logloss'
    )

    # Set up k-fold cross-validation
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=random_state)

    # Perform cross-validation
    cv_scores = cross_val_score(xgbrf_model, X, y, cv=kf, scoring=scoring)

    # Calculate mean cross-validation score
    mean_cv_score = np.mean(cv_scores)

    money = calculate_money(columns, n=10, model_name="xgbrf", model_params=params)

    # Prepare results to save
    results = {
        'columns': columns,
        'params': params,
        'n_splits': n_splits,
        'random_state': random_state,
        'scoring': scoring,
        'cv_scores': cv_scores.tolist(),
        'mean_cv_score': mean_cv_score,
        'money': money
    }

    # Create results directory if it doesn't exist
    os.makedirs(folder_name, exist_ok=True)

    # Get current date and hour
    current_time = datetime.now().strftime('%Y-%m-%d_%H-%M-%S')

    filename = filename + current_time + '.json'

    # Save results to file
    with open(folder_name + filename, 'w') as file:
        json.dump(results, file, indent=4)

    print("Cross-validation scores: ", cv_scores)
    print("Mean cross-validation score: ", mean_cv_score)
    print(f"Money: {money}")
    print(f"Results saved to {filename}")


# Example usage:
evaluate_and_save_xgbrf(X, y, random_state=random_state)


Cross-validation scores:  [0.56       0.53418803 0.49596774 0.57307692 0.49606299 0.50813008
 0.53252033 0.5498008  0.51320755 0.51239669]
Mean cross-validation score:  0.5275351136027926
Money: 6300.0
Results saved to xgbrf_2024-05-31_11-53-59.json


## XGBoost

In [44]:
def evaluate_and_save_xgboost(X, y, params=None, n_splits=10, random_state=42, scoring='recall', filename='xgboost_'):
    # Default parameters if none are provided
    if params is None:
        params = OrderedDict(
            [('learning_rate', 0.005169944690299296), 
            ('max_depth', 40), 
            ('n_estimators', 100), 
            ('subsample', 0.1)]
        )

    # Create the Random Forest model with the specified parameters
    xgb_model = XGBClassifier(
        learning_rate=params['learning_rate'],
        max_depth=params['max_depth'],
        n_estimators=params['n_estimators'],
        subsample=params['subsample'],
        use_label_encoder=False, 
        eval_metric='logloss'
    )

    
    # Set up k-fold cross-validation
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=random_state)
    
    # Perform cross-validation
    cv_scores = cross_val_score(xgb_model, X, y, cv=kf, scoring=scoring)
    
    # Calculate mean cross-validation score
    mean_cv_score = np.mean(cv_scores)
    
    money = calculate_money(columns, n=10, model_name="xgb", model_params=params)
    
    # Prepare results to save
    results = {
        'columns': columns,
        'params': params,
        'n_splits': n_splits,
        'random_state': random_state,
        'scoring': scoring,
        'cv_scores': cv_scores.tolist(),
        'mean_cv_score': mean_cv_score,
        'money': money
    }
    
    # Create results directory if it doesn't exist
    os.makedirs(folder_name, exist_ok=True)

    # Get current date and hour
    current_time = datetime.now().strftime('%Y-%m-%d_%H-%M-%S')

    filename = filename + current_time + '.json'
    
    # Save results to file
    with open(folder_name + filename, 'w') as file:
        json.dump(results, file, indent=4)
    
    print("Cross-validation scores: ", cv_scores)
    print("Mean cross-validation score: ", mean_cv_score)
    print(f"Money: {money}")
    print(f"Results saved to {filename}")


# Example usage:
evaluate_and_save_xgboost(X, y, random_state=random_state)


Cross-validation scores:  [0.536      0.55555556 0.44354839 0.53461538 0.51968504 0.52845528
 0.5        0.5498008  0.49433962 0.57438017]
Mean cross-validation score:  0.5236380235934153
Money: 6635.0
Results saved to xgboost_2024-05-31_11-54-02.json


## **Trash but maybe useful later**

## General

In [19]:
def evaluate_and_save_model(
        X, y, model_class,
        params=None, n_splits=10, random_state=42,
        scoring='recall', filename_prefix='model_'
        ) -> None:

    # Create the model with the specified parameters
    model = model_class(**params) if params else model_class()

    # Set up k-fold cross-validation
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=random_state)

    # Perform cross-validation
    cv_scores = cross_val_score(model, X, y, cv=kf, scoring=scoring)

    # Calculate mean cross-validation score
    mean_cv_score = np.mean(cv_scores)

    # Prepare results to save
    results = {
        'model_class': model_class.__name__,
        'params': params,
        'n_splits': n_splits,
        'random_state': random_state,
        'scoring': scoring,
        'cv_scores': cv_scores.tolist(),
        'mean_cv_score': mean_cv_score
    }

    # Create results directory if it doesn't exist
    os.makedirs('results_best_models', exist_ok=True)

    # Get current date and time
    current_time = datetime.now().strftime('%Y-%m-%d_%H-%M-%S')

    filename = filename_prefix + current_time + '.json'

    # Save results to file
    with open('results_best_models/' + filename, 'w') as file:
        json.dump(results, file, indent=4)

    print("Cross-validation scores: ", cv_scores)
    print("Mean cross-validation score: ", mean_cv_score)
    print(f"Results saved to results_best_models/{filename}")


# Example usage:
# from sklearn.ensemble import RandomForestClassifier
# params = OrderedDict([
#     ('n_estimators', 100), 
#     ('max_depth', 10), 
#     ('min_samples_split', 5)
# ])
# evaluate_and_save_model(X, y, RandomForestClassifier, params)
