In [14]:
from collections import OrderedDict
from datetime import datetime
import json
import os

from sklearn.model_selection import KFold, cross_val_score
from sklearn.neural_network import MLPClassifier
from sklearn.naive_bayes import GaussianNB
import numpy as np

from tools import calculate_money
import itertools

In [15]:
# Training features and labels, read as space-delimited text matrices.
X = np.loadtxt("../data/x_train.txt", delimiter=' ')
y = np.loadtxt("../data/y_train.txt", delimiter=' ')
# Fix: this line previously re-read x_train.txt, so the "test" predictions
# were ranked over the training rows.
# NOTE(review): assumes the test features live at ../data/x_test.txt next to
# the training files -- confirm the filename.
X_test = np.loadtxt("../data/x_test.txt", delimiter=' ')

# Notebook-level settings: output directory name and random seed.
folder_name = 'money_results_best_models/'
random_state = 145

In [22]:
def evaluate_and_save_naive_bayes(
        X, y, X_test, columns, params=None, n_splits=10, random_state=42,
        scoring='recall', filename='nb_', folder_name='best results/'
        ):
    """Cross-validate a GaussianNB model and write its top predictions to disk.

    The model is scored with shuffled k-fold cross-validation on ``(X, y)``,
    then refit on all of ``(X, y)`` and used to rank the rows of ``X_test``.
    Two timestamped text files are written into ``folder_name``:

    * ``nb_predictions_<timestamp>.txt`` -- 1-based row indexes of the (up to)
      1000 rows of ``X_test`` with the highest value in the second
      ``predict_proba`` column, one per line, in ascending probability order.
    * ``nb_columns_<timestamp>.txt`` -- the entries of ``columns`` plus one
      (i.e. converted to 1-based), one per line.

    The cross-validation scores, their mean, and the value returned by
    ``calculate_money`` are printed.

    Parameters
    ----------
    X, y : array-like
        Training features and labels.
    X_test : array-like
        Features to rank. NOTE(review): presumably already restricted to the
        same columns as ``X``; this function does not index by ``columns``.
    columns : iterable of int
        Feature indexes; forwarded to ``calculate_money`` and written to the
        columns file after adding 1 to each entry.
    params : mapping, optional
        Forwarded to ``calculate_money`` as ``model_params``. It is NOT applied
        to the GaussianNB instance created here. When ``None``, a fixed set of
        MLP-style hyperparameters is used.
    n_splits : int
        Number of folds for ``KFold``.
    random_state : int
        Seed for the shuffled ``KFold`` split.
    scoring : str
        Scoring name passed to ``cross_val_score``.
    filename : str
        Unused; kept so existing callers keep working.
    folder_name : str
        Directory for the output files; created if it does not exist.

    Returns
    -------
    None
    """
    # Default parameters if none are provided
    if params is None:
        params = OrderedDict([('activation', 'tanh'), ('alpha', 0.007082715436049561), ('hidden_layer_sizes', 98), ('learning_rate_init', 0.0221443719469483), ('solver', 'sgd')])

    nb_model = GaussianNB()

    # Shuffled k-fold cross-validation, scored with the requested metric
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=random_state)
    cv_scores = cross_val_score(nb_model, X, y, cv=kf, scoring=scoring)
    mean_cv_score = np.mean(cv_scores)

    # calculate_money is a project helper imported from tools
    money = calculate_money(columns, n=1000, model_name="nb", model_params=params)

    # Make sure the output directory exists before any file is opened for
    # writing; an empty folder_name means "current directory", which
    # os.makedirs would reject.
    if folder_name:
        os.makedirs(folder_name, exist_ok=True)

    # One timestamp shared by both output files so they can be paired up
    current_time = datetime.now().strftime('%Y-%m-%d_%H-%M-%S')

    # Refit on the full training data and score the rows to be ranked.
    # Column 1 of predict_proba is taken as the positive-class probability
    # (assumes binary labels with the positive class sorted second).
    nb_model.fit(X, y)
    y_prob = nb_model.predict_proba(X_test)[:, 1]

    # Indexes of the 1000 highest-probability rows, converted to 1-based
    most_likely_indexes = np.argsort(y_prob)[-1000:]
    most_likely_indexes_1_based = most_likely_indexes + 1

    predictions_filename = 'nb_predictions_' + current_time + '.txt'
    predictions_filepath = os.path.join(folder_name, predictions_filename)
    with open(predictions_filepath, 'w') as pred_file:
        for index in most_likely_indexes_1_based:
            pred_file.write(f"{index}\n")

    columns_1_based = [i + 1 for i in columns]
    columns_filename = 'nb_columns_' + current_time + '.txt'
    columns_filepath = os.path.join(folder_name, columns_filename)
    with open(columns_filepath, 'w') as col_file:
        for col in columns_1_based:
            col_file.write(f"{col}\n")

    print("Cross-validation scores: ", cv_scores)
    print("Mean cross-validation score: ", mean_cv_score)
    print(f"Money: {money}")

In [33]:
# Feature subset (0-based column indexes) to evaluate.
columns = [101, 102, 103, 105]
# Fix: this cell previously indexed with an undefined `cols`, which only worked
# through leftover kernel state and raises NameError on Restart & Run All.
X_temp = X[:, columns]
X_test_temp = X_test[:, columns]
# NOTE(review): the notebook-level `folder_name` is not passed here, so output
# goes to the function default 'best results/' -- confirm that is intended.
evaluate_and_save_naive_bayes(X_temp, y, X_test_temp, columns, random_state=random_state)

Cross-validation scores:  [0.564      0.60683761 0.54032258 0.60769231 0.58661417 0.58943089
 0.58536585 0.54581673 0.59245283 0.61157025]
Mean cross-validation score:  0.5830103227561195
Money: 7017.8
