In [1]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.model_selection import cross_val_score, KFold
from collections import OrderedDict
import pickle
import json
from collections import OrderedDict
from xgboost import XGBRFClassifier
from sklearn.model_selection import KFold, cross_val_score
import numpy as np
import os
import json
from datetime import datetime
import random
from tools import calculate_money


## Data

In [2]:

# Load the training data from disk and keep only the selected feature columns.
# Alternative input kept for reference (presumably a pre-reduced feature set — confirm):
# with open('./reduced_data/X_boruta_cfs.pickle', 'rb') as handle:
#     X = pickle.load(handle)

# Indices of the feature columns used by every model in this notebook.
columns = [100, 102, 103, 105]
X = np.loadtxt("../data/x_train.txt", delimiter=' ')[:, columns]
y = np.loadtxt("../data/y_train.txt", delimiter=' ')

# Directory into which the evaluation functions write their JSON reports.
folder_name = 'money_results_best_models/'

## Params

In [3]:
# Seed passed to every evaluation call below. It is pinned to a fixed value
# (instead of the random draw kept here for reference) so that re-running the
# notebook reuses the same shuffled cross-validation folds.
# random_state = random.randint(0, 300)
random_state = 145
# Bare expression: displays the chosen seed as the cell output.
random_state

145

## Random Forest

In [4]:
def evaluate_and_save_random_forest(X, y, params=None, n_splits=10, random_state=42, scoring='recall', filename='random_forest_'):
    """Cross-validate a Random Forest, compute its "money" value and save a JSON report.

    Relies on two notebook-level globals: ``columns`` (stored in the report and
    passed to ``calculate_money``) and ``folder_name`` (output directory).

    Args:
        X: Feature matrix passed to ``cross_val_score``.
        y: Target vector passed to ``cross_val_score``.
        params: Mapping with the keys ``n_estimators``, ``max_depth`` and
            ``min_samples_split``. When ``None`` the defaults below are used.
        n_splits: Number of folds for the shuffled ``KFold``.
        random_state: Seed used for both the fold shuffling and the forest
            itself, so the value stored in the report describes the whole run.
        scoring: Scoring name forwarded to ``cross_val_score``.
        filename: Prefix of the report file; a timestamp and ``.json`` are appended.

    Returns:
        None. The scores are printed and the report is written under ``folder_name``.

    Raises:
        KeyError: If ``params`` is given but lacks one of the required keys.
    """
    # Fall back to the default hyper-parameters when the caller supplies none.
    if params is None:
        params = OrderedDict([
            ('n_estimators', 59),
            ('max_depth', 20),
            ('min_samples_split', 10),
        ])

    # Seed the forest with the caller's random_state. It used to be hard-coded
    # to 42, which made the 'random_state' written to the report misleading
    # whenever a different seed was requested.
    rf_model = RandomForestClassifier(
        n_estimators=params['n_estimators'],
        max_depth=params['max_depth'],
        min_samples_split=params['min_samples_split'],
        random_state=random_state,
    )

    # Shuffled k-fold cross-validation with a reproducible split.
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=random_state)
    cv_scores = cross_val_score(rf_model, X, y, cv=kf, scoring=scoring)
    mean_cv_score = np.mean(cv_scores)

    # NOTE(review): cross_val_score fits clones of the estimator, so rf_model
    # is still unfitted here; presumably calculate_money trains it itself —
    # confirm against tools.calculate_money.
    money = calculate_money(rf_model, columns, n=10)

    # Everything needed to reproduce and compare this run.
    results = {
        'columns': columns,
        'params': params,
        'n_splits': n_splits,
        'random_state': random_state,
        'scoring': scoring,
        'cv_scores': cv_scores.tolist(),
        'mean_cv_score': mean_cv_score,
        'money': money,
    }

    # Create the results directory if it doesn't exist.
    os.makedirs(folder_name, exist_ok=True)

    # Timestamp the file name so successive runs never overwrite each other.
    current_time = datetime.now().strftime('%Y-%m-%d_%H-%M-%S')
    filename = filename + current_time + '.json'

    # os.path.join works whether or not folder_name ends with a separator;
    # plain concatenation silently wrote next to the directory when it did not.
    output_path = os.path.join(folder_name, filename)
    with open(output_path, 'w') as file:
        json.dump(results, file, indent=4)

    print("Cross-validation scores: ", cv_scores)
    print("Mean cross-validation score: ", mean_cv_score)
    print(f"Money: {money}")
    # Report the actual location of the file, not just its base name.
    print(f"Results saved to {output_path}")

# Evaluate the default Random Forest configuration with the notebook-wide seed.
# The call returns nothing; it prints the scores and writes a JSON report.
evaluate_and_save_random_forest(X, y, random_state=random_state)


[0. 0. 1. 0. 0. 0. 1. 1. 0. 0. 1. 0. 1. 1. 0. 1. 0. 1. 0. 1. 1. 1. 0. 0.
 0. 0. 1. 0. 1. 0. 0. 1. 1. 0. 1. 0. 1. 1. 0. 0. 0. 1. 1. 1. 0. 0. 1. 1.
 1. 0. 0. 1. 0. 0. 1. 1. 1. 1. 0. 0. 1. 0. 1. 1. 1. 1. 1. 1. 1. 0. 0. 1.
 1. 0. 1. 0. 1. 0. 0. 0. 0. 1. 0. 1. 1. 1. 0. 1. 0. 0. 0. 1. 0. 1. 1. 0.
 1. 1. 1. 0. 1. 0. 1. 0. 0. 1. 1. 1. 0. 1. 0. 0. 1. 1. 1. 0. 1. 1. 0. 0.
 1. 0. 1. 0. 0. 1. 0. 0. 1. 0. 1. 1. 0. 1. 0. 0. 0. 0. 1. 0. 1. 0. 1. 0.
 1. 1. 1. 1. 1. 1. 1. 0. 1. 1. 0. 1. 1. 0. 1. 0. 0. 0. 0. 1. 0. 0. 0. 1.
 1. 1. 0. 1. 0. 1. 0. 1. 1. 0. 1. 1. 1. 0. 1. 0. 0. 1. 0. 1. 0. 1. 1. 1.
 1. 1. 1. 0. 0. 0. 0. 1. 1. 0. 0. 1. 1. 1. 0. 1. 1. 0. 0. 0. 1. 0. 1. 1.
 0. 1. 1. 0. 1. 1. 0. 1. 1. 0. 0. 1. 1. 0. 1. 1. 0. 1. 1. 0. 1. 0. 0. 0.
 1. 1. 1. 1. 1. 0. 1. 1. 1. 1. 0. 0. 1. 1. 1. 1. 1. 1. 1. 1. 0. 0. 1. 0.
 0. 0. 1. 1. 0. 0. 0. 1. 0. 1. 1. 0. 0. 0. 0. 1. 0. 1. 0. 1. 1. 0. 1. 1.
 1. 0. 0. 0. 1. 1. 1. 0. 1. 0. 1. 1. 0. 1. 1. 1. 0. 0. 0. 1. 0. 0. 1. 0.
 1. 0. 1. 1. 1. 1. 0. 1. 1. 1. 0. 0. 0. 0. 1. 1. 1.

## XGBoost Random Forest

In [5]:

def evaluate_and_save_xgbrf(X, y, params=None, n_splits=10, random_state=42, scoring='recall', filename='xgbrf_'):
    """Score an XGBoost random-forest classifier with shuffled k-fold CV and dump a JSON report.

    Reads the notebook-level globals ``columns`` and ``folder_name``.

    Args:
        X: Feature matrix handed to ``cross_val_score``.
        y: Labels handed to ``cross_val_score``.
        params: Mapping providing ``learning_rate``, ``max_depth``,
            ``n_estimators`` and ``subsample``; ``None`` selects the defaults below.
        n_splits: Fold count for ``KFold``.
        random_state: Seed for the fold shuffling only (it is not passed to the model).
        scoring: Scoring name forwarded to ``cross_val_score``.
        filename: Report file prefix; a timestamp plus ``.json`` is appended.

    Returns:
        None; the outcome is printed and written under ``folder_name``.
    """
    if params is None:
        params = OrderedDict([
            ('learning_rate', 0.9149030789798307),
            ('max_depth', 17),
            ('n_estimators', 80),
            ('subsample', 0.5),
        ])

    # Only these four hyper-parameters are read from `params`.
    hyper = {
        key: params[key]
        for key in ('learning_rate', 'max_depth', 'n_estimators', 'subsample')
    }
    estimator = XGBRFClassifier(use_label_encoder=False, eval_metric='logloss', **hyper)

    # Shuffled folds, reproducible through random_state.
    folds = KFold(n_splits=n_splits, shuffle=True, random_state=random_state)
    fold_scores = cross_val_score(estimator, X, y, cv=folds, scoring=scoring)
    average_score = np.mean(fold_scores)

    # NOTE(review): the estimator handed over here has not been fitted by this
    # function; presumably calculate_money trains it — confirm in tools.
    money_value = calculate_money(estimator, columns, n=10)

    # Key order is kept as-is because it determines the layout of the JSON file.
    report = dict(
        columns=columns,
        params=params,
        n_splits=n_splits,
        random_state=random_state,
        scoring=scoring,
        cv_scores=fold_scores.tolist(),
        mean_cv_score=average_score,
        money=money_value,
    )

    os.makedirs(folder_name, exist_ok=True)

    # A timestamp in the name keeps earlier reports from being overwritten.
    stamp = datetime.now().strftime('%Y-%m-%d_%H-%M-%S')
    filename = ''.join((filename, stamp, '.json'))

    with open(folder_name + filename, 'w') as fh:
        json.dump(report, fh, indent=4)

    print("Cross-validation scores: ", fold_scores)
    print("Mean cross-validation score: ", average_score)
    print(f"Money: {money_value}")
    print(f"Results saved to {filename}")

# Evaluate the default XGBoost random-forest configuration with the notebook-wide seed.
# The call returns nothing; it prints the scores and writes a JSON report.
evaluate_and_save_xgbrf(X, y, random_state=random_state)


y_proba
[0.6608063  0.29584384 0.28875807 0.42672467 0.36233154 0.4132522
 0.6528332  0.42218253 0.6194069  0.3563013  0.3542979  0.792888
 0.58032465 0.7850148  0.45602927 0.6601732  0.2729273  0.5818846
 0.3296089  0.6480416  0.47601983 0.58276653 0.7244759  0.55026144
 0.2541578  0.732472   0.71096915 0.6654182  0.3476289  0.38802597
 0.32119521 0.51808596 0.65946    0.26493317 0.7644291  0.28488624
 0.6127307  0.2661549  0.5192283  0.30843356 0.27619988 0.76968366
 0.42749992 0.6356851  0.72817457 0.5706257  0.3725658  0.61405647
 0.8378082  0.5914718  0.40378582 0.72372884 0.4176774  0.5240098
 0.5374654  0.70404965 0.5672734  0.68842643 0.7196231  0.3638316
 0.8227465  0.59196514 0.5310964  0.75609213 0.524921   0.66961956
 0.36286777 0.5118962  0.81473005 0.5799849  0.27784327 0.61450803
 0.40389046 0.58749795 0.6650997  0.33246073 0.773109   0.38331383
 0.3800009  0.73263943 0.25223333 0.33578077 0.25814453 0.48889026
 0.28931722 0.7318882  0.31548366 0.57798254 0.3334757  0.73

## XGBoost

In [6]:
def evaluate_and_save_xgboost(X, y, params=None, n_splits=10, random_state=42, scoring='recall', filename='xgboost_'):
    """Run shuffled k-fold CV on an XGBoost classifier and persist the outcome as JSON.

    Uses the notebook-level globals ``columns`` and ``folder_name``.

    Args:
        X: Feature matrix given to ``cross_val_score``.
        y: Labels given to ``cross_val_score``.
        params: Mapping with ``learning_rate``, ``max_depth``, ``n_estimators``
            and ``subsample``. ``None`` means "use the defaults defined below".
        n_splits: Number of ``KFold`` splits.
        random_state: Seed of the fold shuffling (the model does not receive it).
        scoring: Scoring name forwarded to ``cross_val_score``.
        filename: Prefix of the output file; timestamp and ``.json`` follow it.

    Returns:
        None. Scores are printed and the report is saved under ``folder_name``.
    """
    if params is None:
        default_items = (
            ('learning_rate', 0.005169944690299296),
            ('max_depth', 40),
            ('n_estimators', 100),
            ('subsample', 0.1),
        )
        params = OrderedDict(default_items)

    # Build the XGBoost classifier from the four supported hyper-parameters.
    lr = params['learning_rate']
    depth = params['max_depth']
    trees = params['n_estimators']
    row_fraction = params['subsample']
    booster = XGBClassifier(
        learning_rate=lr,
        max_depth=depth,
        n_estimators=trees,
        subsample=row_fraction,
        use_label_encoder=False,
        eval_metric='logloss',
    )

    # Cross-validate on reproducibly shuffled folds.
    splitter = KFold(n_splits=n_splits, shuffle=True, random_state=random_state)
    scores = cross_val_score(booster, X, y, cv=splitter, scoring=scoring)
    scores_mean = np.mean(scores)

    # NOTE(review): `booster` is not fitted by this function before the call;
    # presumably calculate_money handles training — confirm in tools.
    money_result = calculate_money(booster, columns, n=10)

    # Assemble the report; insertion order defines the JSON key order.
    summary = {}
    summary['columns'] = columns
    summary['params'] = params
    summary['n_splits'] = n_splits
    summary['random_state'] = random_state
    summary['scoring'] = scoring
    summary['cv_scores'] = scores.tolist()
    summary['mean_cv_score'] = scores_mean
    summary['money'] = money_result

    os.makedirs(folder_name, exist_ok=True)

    # The timestamp suffix makes each run's file name unique.
    timestamp = datetime.now().strftime('%Y-%m-%d_%H-%M-%S')
    filename = filename + timestamp + '.json'

    target = folder_name + filename
    with open(target, 'w') as out:
        json.dump(summary, out, indent=4)

    print("Cross-validation scores: ", scores)
    print("Mean cross-validation score: ", scores_mean)
    print(f"Money: {money_result}")
    print(f"Results saved to {filename}")

# Evaluate the default XGBoost configuration with the notebook-wide seed.
# The call returns nothing; it prints the scores and writes a JSON report.
evaluate_and_save_xgboost(X, y, random_state=random_state)

y_proba
[0.5296333  0.4148918  0.40745974 0.47550482 0.4139293  0.47311231
 0.5454882  0.48100138 0.48119408 0.46391687 0.48665097 0.5354808
 0.531317   0.56661564 0.47707662 0.545686   0.44506103 0.5509487
 0.45484233 0.54298323 0.4724339  0.4961353  0.5655027  0.50470775
 0.44565782 0.53278005 0.577761   0.5396384  0.46376872 0.46751878
 0.43946353 0.4979996  0.5624369  0.47492382 0.55047107 0.41695082
 0.54428977 0.43343556 0.49804205 0.4237829  0.4098274  0.56307214
 0.47570312 0.5076433  0.53168434 0.50706774 0.45768246 0.5187298
 0.5856733  0.5534333  0.47992176 0.54848814 0.47851875 0.48454428
 0.5034532  0.5689137  0.53272897 0.5367908  0.5648387  0.45699173
 0.60428745 0.5178507  0.54253924 0.57746947 0.5451658  0.5249008
 0.41371286 0.5359841  0.6049209  0.50542957 0.34882948 0.53150094
 0.4895306  0.5010958  0.55580086 0.40903118 0.55715245 0.43214792
 0.52965724 0.5458681  0.3576304  0.45206034 0.36930698 0.47725433
 0.4146497  0.53690696 0.4487526  0.5091533  0.4348758  0.

## **Scratch: unused, but possibly useful later**

## General

In [7]:
import json
import os
from datetime import datetime
from sklearn.model_selection import KFold, cross_val_score
import numpy as np

def evaluate_and_save_model(X, y, model_class, params=None, n_splits=10, random_state=42, scoring='recall', filename_prefix='model_'):
    """Cross-validate an arbitrary estimator class and store the scores as JSON.

    Args:
        X: Feature matrix given to ``cross_val_score``.
        y: Labels given to ``cross_val_score``.
        model_class: Estimator class; instantiated with ``**params`` when
            ``params`` is truthy, otherwise with no arguments.
        params: Optional mapping of constructor keyword arguments.
        n_splits: Number of folds for the shuffled ``KFold``.
        random_state: Seed for the fold shuffling.
        scoring: Scoring name forwarded to ``cross_val_score``.
        filename_prefix: Start of the report file name; a timestamp and
            ``.json`` are appended.

    Returns:
        None. Scores are printed and a report is written to ``results_best_models/``.
    """
    # An empty or missing params mapping means "use the class defaults".
    if params:
        model = model_class(**params)
    else:
        model = model_class()

    splitter = KFold(n_splits=n_splits, shuffle=True, random_state=random_state)
    fold_scores = cross_val_score(model, X, y, cv=splitter, scoring=scoring)
    score_mean = np.mean(fold_scores)

    # Insertion order below is the key order of the resulting JSON document.
    report = dict(
        model_class=model_class.__name__,
        params=params,
        n_splits=n_splits,
        random_state=random_state,
        scoring=scoring,
        cv_scores=fold_scores.tolist(),
        mean_cv_score=score_mean,
    )

    # Make sure the output directory is there before writing.
    os.makedirs('results_best_models', exist_ok=True)

    # Timestamped name, so that repeated runs do not overwrite each other.
    stamp = datetime.now().strftime('%Y-%m-%d_%H-%M-%S')
    filename = ''.join((filename_prefix, stamp, '.json'))

    with open('results_best_models/' + filename, 'w') as fh:
        json.dump(report, fh, indent=4)

    print("Cross-validation scores: ", fold_scores)
    print("Mean cross-validation score: ", score_mean)
    print(f"Results saved to results_best_models/{filename}")

# Example usage:
# from sklearn.ensemble import RandomForestClassifier
# params = OrderedDict([
#     ('n_estimators', 100), 
#     ('max_depth', 10), 
#     ('min_samples_split', 5)
# ])
# evaluate_and_save_model(X, y, RandomForestClassifier, params)
