In [1]:


# this section will generate the search objectives
data_folder = r"C:\Users\gac8\PycharmProjects\PSSearch\data\retail_forecasting"

In [8]:
import copy
import os
from typing import Literal
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from sklearn.tree import DecisionTreeRegressor
from Core.PRef import PRef
from PolishSystem.read_data import get_vectors_file_name, get_pRef_from_vectors, get_fitness_file_name


def get_data_from_settings(genome_size: int, cluster_method: str, fitness_column: int, which_split: Literal["train", "test"]):
    fitness_file_name = os.path.join(data_folder, f"{which_split}_fitness_{genome_size}_{cluster_method}.csv")
    vectors_file_name = os.path.join(data_folder, f"{which_split}_many_hot_vectors_{genome_size}_{cluster_method}.csv")
    pRef = get_pRef_from_vectors(name_of_vectors_file=vectors_file_name, 
                                 name_of_fitness_file=fitness_file_name, 
                                 column_in_fitness_file=fitness_column)
    return pRef.full_solution_matrix, pRef.fitness_array


def train_and_test_model_on_settings(model, genome_size: int, cluster_method: str, fitness_column: int):
    temp_model = copy.deepcopy(model)
    print(f"Testing {model}, {genome_size = }, {cluster_method}, {fitness_column}")
    
    try:
        train_X, train_y = get_data_from_settings(genome_size, cluster_method, fitness_column, "train")
        test_X, test_y = get_data_from_settings(genome_size, cluster_method, fitness_column, "test")
    except Exception as e:
        print(f"File for {(genome_size, cluster_method, fitness_column)} was skipped")
        return
    
    temp_model.fit(train_X, train_y)
    
    y_pred = temp_model.predict(test_X)
    r2 = r2_score(test_y, y_pred)  # R^2 score
    mse = mean_squared_error(test_y, y_pred)  # Mean Squared Error
    mae = mean_absolute_error(test_y, y_pred)  # Mean Absolute Error

    print(f"R^2 score: {r2:.3f}")
    print(f"MSE: {mse:.3f}")
    print(f"MAE: {mae:.3f}")

    scores = {
        "R2": r2,
        "MSE": mse,
        "MAE": mae,
    }
    return scores

    


In [9]:
from sklearn.neural_network import MLPRegressor
from sklearn.linear_model import LinearRegression
import pandas as pd

models = [
    LinearRegression(),
    DecisionTreeRegressor(),
    MLPRegressor(hidden_layer_sizes=(100,), max_iter=1000)
]

rows = [train_and_test_model_on_settings(model, genome_size, clustering_method, fitness_column)
        for genome_size in [20, 50, 100, 250, 500, 1000]
        for clustering_method in ["random", "kmeans", "qmc"]
        for fitness_column in [0, 1, 2, 3, 4, 5]
        for model in models]


Testing LinearRegression(), genome_size = 20, random, 0
R^2 score: 0.017
MSE: 0.000
MAE: 0.006
Testing DecisionTreeRegressor(), genome_size = 20, random, 0
R^2 score: -1.481
MSE: 0.001
MAE: 0.008
Testing MLPRegressor(max_iter=1000), genome_size = 20, random, 0
R^2 score: -1.460
MSE: 0.001
MAE: 0.017
Testing LinearRegression(), genome_size = 20, random, 1
R^2 score: 0.006
MSE: 0.000
MAE: 0.007
Testing DecisionTreeRegressor(), genome_size = 20, random, 1
R^2 score: -1.127
MSE: 0.001
MAE: 0.009
Testing MLPRegressor(max_iter=1000), genome_size = 20, random, 1
R^2 score: -0.299
MSE: 0.001
MAE: 0.012
Testing LinearRegression(), genome_size = 20, random, 2
R^2 score: 0.187
MSE: 5.501
MAE: 1.801
Testing DecisionTreeRegressor(), genome_size = 20, random, 2
R^2 score: -0.274
MSE: 8.619
MAE: 2.274
Testing MLPRegressor(max_iter=1000), genome_size = 20, random, 2
R^2 score: 0.244
MSE: 5.115
MAE: 1.732
Testing LinearRegression(), genome_size = 20, random, 3
R^2 score: 0.058
MSE: 0.029
MAE: 0.127
Tes