In [5]:


# this section will generate the search objectives
#
# Folder holding the retail-forecasting vector/fitness files.  The original
# absolute path is kept as the default so existing runs behave the same, but it
# can be overridden per machine by setting the PSSEARCH_DATA_FOLDER environment
# variable before starting the kernel.
import os

data_folder = os.environ.get(
    "PSSEARCH_DATA_FOLDER",
    r"C:\Users\gac8\PycharmProjects\PSSearch\data\retail_forecasting",
)

In [6]:
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from sklearn.tree import DecisionTreeRegressor
from Core.PRef import PRef
from PolishSystem.read_data import get_vectors_file_name, get_pRef_from_vectors, get_fitness_file_name

def make_decision_tree(train_pRef, max_depth=5, min_samples_leaf=10, random_state=None):
    """Fit a regression tree on the solutions and fitnesses held by ``train_pRef``.

    Args:
        train_pRef: object exposing ``full_solution_matrix`` (used as the
            feature matrix) and ``fitness_array`` (used as the regression target).
        max_depth: forwarded to ``DecisionTreeRegressor(max_depth=...)``.
        min_samples_leaf: forwarded to
            ``DecisionTreeRegressor(min_samples_leaf=...)``.
        random_state: forwarded to ``DecisionTreeRegressor(random_state=...)``.
            The default ``None`` keeps the previous (unseeded) behaviour; pass
            an int to make repeated fits reproducible.

    Returns:
        The fitted ``DecisionTreeRegressor``.
    """
    tree = DecisionTreeRegressor(
        max_depth=max_depth,
        min_samples_leaf=min_samples_leaf,
        random_state=random_state,
    )

    # Features are the solution matrix, the target is the fitness array.
    tree.fit(train_pRef.full_solution_matrix, train_pRef.fitness_array)

    return tree


def evaluate_decision_tree(decision_tree: DecisionTreeRegressor, test_pRef: PRef) -> dict[str, float]:
    """Score a fitted regression tree on the data held by ``test_pRef``.

    The tree predicts from ``test_pRef.full_solution_matrix`` and the
    predictions are compared with ``test_pRef.fitness_array``.  The three
    metrics are printed (3 decimal places) and returned.

    Args:
        decision_tree: an already fitted regressor; only ``predict`` is called.
        test_pRef: evaluation data exposing ``full_solution_matrix`` and
            ``fitness_array``.

    Returns:
        A dict with keys ``"R2"``, ``"MSE"`` and ``"MAE"``.
    """
    # Extract the evaluation (held-out) data
    X = test_pRef.full_solution_matrix
    y = test_pRef.fitness_array

    y_pred = decision_tree.predict(X)

    scores = {
        "R2": r2_score(y, y_pred),              # coefficient of determination
        "MSE": mean_squared_error(y, y_pred),   # mean squared error
        "MAE": mean_absolute_error(y, y_pred),  # mean absolute error
    }

    # These metrics are computed on the data passed in as ``test_pRef``, so
    # they are labelled as test scores (they used to be mislabelled "Training").
    print(f"Test R^2 score: {scores['R2']:.3f}")
    print(f"Test MSE: {scores['MSE']:.3f}")
    print(f"Test MAE: {scores['MAE']:.3f}")

    return scores



def get_row_of_data(vector_size, clustering_method, fitness_column,
                    test_size=0.2, max_depth=10, min_samples_leaf=10, folder=None) -> dict:
    """Train and evaluate one regression tree for a single experimental setting.

    Loads the pRef for the given setting, splits it into train and test parts,
    fits a tree on the train part, scores it on the test part and returns the
    scores together with the settings that produced them.

    Args:
        vector_size: passed to the file-name helpers and echoed in the result.
        clustering_method: passed to the file-name helpers and echoed in the result.
        fitness_column: passed to ``get_pRef_from_vectors`` and echoed in the result.
        test_size: forwarded to ``pRef.train_test_split`` (default 0.2, the
            value that used to be hard-coded).
        max_depth: forwarded to ``make_decision_tree`` (default 10, as before).
        min_samples_leaf: forwarded to ``make_decision_tree`` (default 10, as before).
        folder: directory handed to the file-name helpers; when ``None`` the
            notebook-level ``data_folder`` is used, as before.

    Returns:
        The dict returned by ``evaluate_decision_tree`` merged with the keys
        ``"vector_size"``, ``"clustering_method"`` and ``"fitness_column"``.
    """
    print(f"Getting data for {vector_size = }, {clustering_method = }, {fitness_column =}")

    # Fall back to the notebook-level folder only when the caller gave none.
    if folder is None:
        folder = data_folder

    vectors_file = get_vectors_file_name(folder, vector_size, clustering_method)
    fitness_file = get_fitness_file_name(folder, vector_size, clustering_method)
    pRef = get_pRef_from_vectors(vectors_file, fitness_file, fitness_column)

    train_pRef, test_pRef = pRef.train_test_split(test_size=test_size)
    tree = make_decision_tree(train_pRef, max_depth=max_depth, min_samples_leaf=min_samples_leaf)
    results = evaluate_decision_tree(tree, test_pRef)

    settings = {"vector_size": vector_size,
                "clustering_method": clustering_method,
                "fitness_column": fitness_column}

    # On a key clash the settings would win, exactly as in the original merge.
    return results | settings



    


In [7]:
import pandas as pd

# Full-factorial sweep: one result row per (vector size, clustering method,
# fitness column) combination, visited in the same nested order as before
# (vector size outermost, fitness column innermost).
rows = list(
    get_row_of_data(size, method, column)
    for size in (20, 50, 100)
    for method in ("random", "kmeans", "qmc")
    for column in (0, 1, 2)
)

# Each row is a dict of scores plus settings, so the dict keys become columns.
df = pd.DataFrame(data=rows)

display(df)

Getting data for vector_size = 20, clustering_method = 'random', fitness_column =0
Training R^2 score: -0.020
Training MSE: 0.036
Training MAE: 0.137
Getting data for vector_size = 20, clustering_method = 'random', fitness_column =1
Training R^2 score: -0.038
Training MSE: 0.000
Training MAE: 0.001
Getting data for vector_size = 20, clustering_method = 'random', fitness_column =2
Training R^2 score: -0.016
Training MSE: 6.817
Training MAE: 2.084
Getting data for vector_size = 20, clustering_method = 'kmeans', fitness_column =0
Training R^2 score: -0.015
Training MSE: 0.035
Training MAE: 0.135
Getting data for vector_size = 20, clustering_method = 'kmeans', fitness_column =1
Training R^2 score: -0.038
Training MSE: 0.000
Training MAE: 0.001
Getting data for vector_size = 20, clustering_method = 'kmeans', fitness_column =2
Training R^2 score: 0.006
Training MSE: 6.695
Training MAE: 2.064
Getting data for vector_size = 20, clustering_method = 'qmc', fitness_column =0
Training R^2 score: -

Unnamed: 0,R2,MSE,MAE,vector_size,clustering_method,fitness_column
0,-0.020347,0.036167,0.13749,20,random,0
1,-0.037803,1e-05,0.000791,20,random,1
2,-0.015682,6.816919,2.084344,20,random,2
3,-0.015346,0.035182,0.135173,20,kmeans,0
4,-0.037535,1.2e-05,0.000793,20,kmeans,1
5,0.005752,6.694829,2.064282,20,kmeans,2
6,-0.017231,0.033988,0.134367,20,qmc,0
7,-0.009253,6.3e-05,0.000903,20,qmc,1
8,-0.014242,6.953765,2.112838,20,qmc,2
9,-0.00836,0.034118,0.133647,50,random,0


In [15]:
# Show only the rows of the sweep whose fitness column is 2.
display(df.loc[df["fitness_column"].eq(2)])

Unnamed: 0,R2,MSE,MAE,vector_size,clustering_method,fitness_column
2,-0.015682,6.816919,2.084344,20,random,2
5,0.005752,6.694829,2.064282,20,kmeans,2
8,-0.014242,6.953765,2.112838,20,qmc,2
11,0.001583,6.768639,2.073546,50,random,2
14,0.03368,6.570479,2.038932,50,kmeans,2
17,-0.017816,6.91806,2.106791,50,qmc,2
20,0.015901,6.81471,2.069785,100,random,2
23,0.028071,6.399541,2.008212,100,kmeans,2
26,-0.022427,6.915197,2.096189,100,qmc,2
