In [None]:
from collections import OrderedDict
from datetime import datetime
import json
import os

from sklearn.model_selection import KFold, cross_val_score
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBRFClassifier, XGBClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
import numpy as np

from tools import calculate_money


### Data

In [2]:
# Create a synthetic dataset
# with open('./reduced_data/X_boruta_cfs.pickle', 'rb') as handle:
#     X = pickle.load(handle)

columns = [101, 102, 103]
X = np.loadtxt("../data/x_train.txt", delimiter=' ')
X = X[:, columns]
y = np.loadtxt("../data/y_train.txt", delimiter=' ')

folder_name = 'money_results_best_models/'


## Params

In [3]:
# random_state = random.randint(0, 300)
random_state = 145
random_state


145

## Random Forest

In [19]:
def evaluate_and_save_random_forest(
        X, y, params=None, n_splits=10, random_state=42,
        scoring='recall', filename='random_forest_'
        ) -> None:

    # Default parameters if none are provided
    if params is None:
    # Parameters for the Random Forest
        params = OrderedDict([('bootstrap', True), ('max_depth', 30), ('max_features', 0.999), ('min_samples_leaf', 1), ('min_samples_split', 10), ('n_estimators', 1000)])

    # Create the Random Forest model with the specified parameters
    rf_model = RandomForestClassifier(
        n_estimators=params['n_estimators'],
        max_depth=params['max_depth'],
        min_samples_split=params['min_samples_split'],
        random_state=42
    )

    # Set up k-fold cross-validation
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=random_state)

    # Perform cross-validation
    cv_scores = cross_val_score(rf_model, X, y, cv=kf, scoring=scoring)

    # Calculate mean cross-validation score
    mean_cv_score = np.mean(cv_scores)

    money = calculate_money(columns, n=10, model_name="rf", model_params=params)

    # Prepare results to save
    results = {
        'columns': columns,
        'params': params,
        'n_splits': n_splits,
        'random_state': random_state,
        'scoring': scoring,
        'cv_scores': cv_scores.tolist(),
        'mean_cv_score': mean_cv_score,
        'money': money,
    }
    
    # Create results directory if it doesn't exist
    os.makedirs(folder_name, exist_ok=True)

    # Get current date and hour
    current_time = datetime.now().strftime('%Y-%m-%d_%H-%M-%S')

    filename = filename + current_time + '.json'
    
    # Save results to file
    with open(folder_name + filename, 'w') as file:
        json.dump(results, file, indent=4)
    
    print("Cross-validation scores: ", cv_scores)
    print("Mean cross-validation score: ", mean_cv_score)
    print(f"Money: {money}")
    print(f"Results saved to {filename}")


# Example usage:
evaluate_and_save_random_forest(X, y, random_state=random_state)


Cross-validation scores:  [0.556      0.5982906  0.57258065 0.60769231 0.52755906 0.6097561
 0.55284553 0.54183267 0.56603774 0.64049587]
Mean cross-validation score:  0.5773090505218927
Money: 6390.0
Results saved to random_forest_2024-05-31_13-31-41.json


## Naive Bayes

In [12]:
def evaluate_and_save_naive_bayes(
        X, y, params=None, n_splits=10, random_state=42,
        scoring='recall', filename='naive_bayes_', folder_name='results/'
        ) -> None:

    # Default parameters if none are provided
    if params is None:
        params = OrderedDict()

    # Create the Naive Bayes model
    nb_model = GaussianNB()

    # Set up k-fold cross-validation
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=random_state)

    # Perform cross-validation
    cv_scores = cross_val_score(nb_model, X, y, cv=kf, scoring=scoring)

    # Calculate mean cross-validation score
    mean_cv_score = np.mean(cv_scores)

    # Assuming `calculate_money` function exists and takes similar parameters as before
    money = calculate_money(columns, n=10, model_name="nb", model_params=params)

    # Prepare results to save
    results = {
        'columns': columns,
        'params': params,
        'n_splits': n_splits,
        'random_state': random_state,
        'scoring': scoring,
        'cv_scores': cv_scores.tolist(),
        'mean_cv_score': mean_cv_score,
        'money': money,
    }
    
    # Create results directory if it doesn't exist
    os.makedirs(folder_name, exist_ok=True)

    # Get current date and hour
    current_time = datetime.now().strftime('%Y-%m-%d_%H-%M-%S')

    filename = filename + current_time + '.json'
    
    # Save results to file
    with open(folder_name + filename, 'w') as file:
        json.dump(results, file, indent=4)
    
    print("Cross-validation scores: ", cv_scores)
    print("Mean cross-validation score: ", mean_cv_score)
    print(f"Money: {money}")
    print(f"Results saved to {filename}")

# Example usage:
evaluate_and_save_naive_bayes(X, y, random_state=random_state)

Cross-validation scores:  [0.532      0.56410256 0.55241935 0.59230769 0.53149606 0.56097561
 0.55284553 0.47808765 0.53584906 0.59504132]
Mean cross-validation score:  0.5495124840772687
Money: 6920.0
Results saved to naive_bayes_2024-05-31_13-35-08.json


## Logistic Regression

In [16]:

def evaluate_and_save_logistic_regression(
        X, y, params=None, n_splits=10, random_state=42,
        scoring='recall', filename='logistic_regression_', folder_name='results/'
        ) -> None:

    # Default parameters if none are provided
    if params is None:
        # Default parameters for Logistic Regression
        params = OrderedDict(
            [('penalty', 'l2'),
            ('C', 1.0),
            ('solver', 'lbfgs')]
        )

    # Create the Logistic Regression model with the specified parameters
    lr_model = LogisticRegression(
        penalty=params['penalty'],
        C=params['C'],
        solver=params['solver'],
        random_state=random_state
    )

    # Set up k-fold cross-validation
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=random_state)

    # Perform cross-validation
    cv_scores = cross_val_score(lr_model, X, y, cv=kf, scoring=scoring)

    # Calculate mean cross-validation score
    mean_cv_score = np.mean(cv_scores)

    # Assuming `calculate_money` function exists and takes similar parameters as before
    money = calculate_money(columns, n=10, model_name="logistic_regression", model_params=params)

    # Prepare results to save
    results = {
        'columns': columns,
        'params': params,
        'n_splits': n_splits,
        'random_state': random_state,
        'scoring': scoring,
        'cv_scores': cv_scores.tolist(),
        'mean_cv_score': mean_cv_score,
        'money': money,
    }
    
    # Create results directory if it doesn't exist
    os.makedirs(folder_name, exist_ok=True)

    # Get current date and hour
    current_time = datetime.now().strftime('%Y-%m-%d_%H-%M-%S')

    filename = filename + current_time + '.json'
    
    # Save results to file
    with open(folder_name + filename, 'w') as file:
        json.dump(results, file, indent=4)
    
    print("Cross-validation scores: ", cv_scores)
    print("Mean cross-validation score: ", mean_cv_score)
    print(f"Money: {money}")
    print(f"Results saved to {filename}")

# Example usage:
evaluate_and_save_logistic_regression(X, y, random_state=random_state)

Cross-validation scores:  [0.492      0.57692308 0.46370968 0.41923077 0.45669291 0.46747967
 0.48780488 0.46613546 0.34716981 0.52066116]
Mean cross-validation score:  0.4697807416317435
Money: 5145.0
Results saved to logistic_regression_2024-05-31_13-13-00.json


## XGBoost Random Forest

In [17]:
def evaluate_and_save_xgbrf(
        X, y, params=None, n_splits=10, random_state=42,
        scoring='recall', filename='xgbrf_'
        ) -> None:
    # Default parameters if none are provided
    if params is None:
        params = OrderedDict(
            [('learning_rate', 0.9149030789798307), 
             ('max_depth', 17), 
             ('n_estimators', 80), 
             ('subsample', 0.5)]
        )
    
    # Create the XGBoost Random Forest model with the specified parameters
    xgbrf_model = XGBRFClassifier(
        learning_rate=params['learning_rate'],
        max_depth=params['max_depth'],
        n_estimators=params['n_estimators'],
        subsample=params['subsample'],
        use_label_encoder=False, 
        eval_metric='logloss'
    )

    # Set up k-fold cross-validation
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=random_state)

    # Perform cross-validation
    cv_scores = cross_val_score(xgbrf_model, X, y, cv=kf, scoring=scoring)

    # Calculate mean cross-validation score
    mean_cv_score = np.mean(cv_scores)

    money = calculate_money(columns, n=10, model_name="xgbrf", model_params=params)

    # Prepare results to save
    results = {
        'columns': columns,
        'params': params,
        'n_splits': n_splits,
        'random_state': random_state,
        'scoring': scoring,
        'cv_scores': cv_scores.tolist(),
        'mean_cv_score': mean_cv_score,
        'money': money
    }

    # Create results directory if it doesn't exist
    os.makedirs(folder_name, exist_ok=True)

    # Get current date and hour
    current_time = datetime.now().strftime('%Y-%m-%d_%H-%M-%S')

    filename = filename + current_time + '.json'

    # Save results to file
    with open(folder_name + filename, 'w') as file:
        json.dump(results, file, indent=4)

    print("Cross-validation scores: ", cv_scores)
    print("Mean cross-validation score: ", mean_cv_score)
    print(f"Money: {money}")
    print(f"Results saved to {filename}")


# Example usage:
evaluate_and_save_xgbrf(X, y, random_state=random_state)


Cross-validation scores:  [0.548      0.54700855 0.56854839 0.59230769 0.52362205 0.58536585
 0.53658537 0.51394422 0.53962264 0.58677686]
Mean cross-validation score:  0.554178161729044
Money: 6640.0
Results saved to xgbrf_2024-05-31_13-13-05.json


## XGBoost

In [13]:
def evaluate_and_save_xgboost(X, y, params=None, n_splits=10, random_state=42, scoring='recall', filename='xgboost_'):
    # Default parameters if none are provided
    if params is None:
        params = OrderedDict([('learning_rate', 0.007017207161446775), ('max_depth', 50), ('n_estimators', 100), ('subsample', 0.1)])

    # Create the Random Forest model with the specified parameters
    xgb_model = XGBClassifier(
        learning_rate=params['learning_rate'],
        max_depth=params['max_depth'],
        n_estimators=params['n_estimators'],
        subsample=params['subsample'],
        use_label_encoder=False, 
        eval_metric='logloss'
    )

    
    # Set up k-fold cross-validation
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=random_state)
    
    # Perform cross-validation
    cv_scores = cross_val_score(xgb_model, X, y, cv=kf, scoring=scoring)
    
    # Calculate mean cross-validation score
    mean_cv_score = np.mean(cv_scores)
    
    money = calculate_money(columns, n=10, model_name="xgb", model_params=params)
    
    # Prepare results to save
    results = {
        'columns': columns,
        'params': params,
        'n_splits': n_splits,
        'random_state': random_state,
        'scoring': scoring,
        'cv_scores': cv_scores.tolist(),
        'mean_cv_score': mean_cv_score,
        'money': money
    }
    
    # Create results directory if it doesn't exist
    os.makedirs(folder_name, exist_ok=True)

    # Get current date and hour
    current_time = datetime.now().strftime('%Y-%m-%d_%H-%M-%S')

    filename = filename + current_time + '.json'
    
    # Save results to file
    with open(folder_name + filename, 'w') as file:
        json.dump(results, file, indent=4)
    
    print("Cross-validation scores: ", cv_scores)
    print("Mean cross-validation score: ", mean_cv_score)
    print(f"Money: {money}")
    print(f"Results saved to {filename}")


# Example usage:
evaluate_and_save_xgboost(X, y, random_state=random_state)


Cross-validation scores:  [0.568      0.58974359 0.60887097 0.62307692 0.57874016 0.61788618
 0.57723577 0.55776892 0.57735849 0.67768595]
Mean cross-validation score:  0.5976366954544325
Money: 6440.0
Results saved to xgboost_2024-05-31_13-35-17.json


# Multilayer perceptron 

In [None]:
def evaluate_and_save_mlp(
        X, y, params=None, n_splits=10, random_state=42,
        scoring='recall', filename='mlp_', folder_name='money_results_best_models/'
        ):

    # Default parameters if none are provided
    if params is None:  
        params = OrderedDict([('activation', 'tanh'), ('alpha', 0.007082715436049561), ('hidden_layer_sizes', 98), ('learning_rate_init', 0.0221443719469483), ('solver', 'sgd')])

    # Create the MLP model
    nb_model =  MLPClassifier(
        activation=params['activation'],
        alpha=params['alpha'],
        hidden_layer_sizes=params['hidden_layer_sizes'],
        learning_rate_init=params['learning_rate_init'],
        solver=params['solver'],
        random_state=42)

    # Set up k-fold cross-validation
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=random_state)

    # Perform cross-validation
    cv_scores = cross_val_score(nb_model, X, y, cv=kf, scoring=scoring)

    # Calculate mean cross-validation score
    mean_cv_score = np.mean(cv_scores)

    # Assuming `calculate_money` function exists and takes similar parameters as before
    money = calculate_money(columns, n=50, model_name="mlp", model_params=params)

    # Prepare results to save
    results = {
        'columns': columns,
        'params': params,
        'n_splits': n_splits,
        'random_state': random_state,
        'scoring': scoring,
        'cv_scores': cv_scores.tolist(),
        'mean_cv_score': mean_cv_score,
        'money': money,
    }
    
    # Create results directory if it doesn't exist
    os.makedirs(folder_name, exist_ok=True)

    #Get current date and hour
    current_time = datetime.now().strftime('%Y-%m-%d_%H-%M-%S')

    filename = filename + current_time + '.json'
    
    #Save results to file
    with open(folder_name + filename, 'w') as file:
       json.dump(results, file, indent=4)
    print("Cross-validation scores: ", cv_scores)
    print("Mean cross-validation score: ", mean_cv_score)
    print(f"Money: {money}")
    print(f"Results saved to {filename}")

## **Trash but maybe useful later**

## General

In [9]:
def evaluate_and_save_model(
        X, y, model_class,
        params=None, n_splits=10, random_state=42,
        scoring='recall', filename_prefix='model_'
        ) -> None:

    # Create the model with the specified parameters
    model = model_class(**params) if params else model_class()

    # Set up k-fold cross-validation
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=random_state)

    # Perform cross-validation
    cv_scores = cross_val_score(model, X, y, cv=kf, scoring=scoring)

    # Calculate mean cross-validation score
    mean_cv_score = np.mean(cv_scores)

    # Prepare results to save
    results = {
        'model_class': model_class.__name__,
        'params': params,
        'n_splits': n_splits,
        'random_state': random_state,
        'scoring': scoring,
        'cv_scores': cv_scores.tolist(),
        'mean_cv_score': mean_cv_score
    }

    # Create results directory if it doesn't exist
    os.makedirs('results_best_models', exist_ok=True)

    # Get current date and time
    current_time = datetime.now().strftime('%Y-%m-%d_%H-%M-%S')

    filename = filename_prefix + current_time + '.json'

    # Save results to file
    with open('results_best_models/' + filename, 'w') as file:
        json.dump(results, file, indent=4)

    print("Cross-validation scores: ", cv_scores)
    print("Mean cross-validation score: ", mean_cv_score)
    print(f"Results saved to results_best_models/{filename}")


# Example usage:
# from sklearn.ensemble import RandomForestClassifier
# params = OrderedDict([
#     ('n_estimators', 100), 
#     ('max_depth', 10), 
#     ('min_samples_split', 5)
# ])
# evaluate_and_save_model(X, y, RandomForestClassifier, params)
