In [20]:
from Core.PSMetric.FitnessQuality.SplitVariance import SplitVariance
from Core.PSMetric.SampleCount import SampleCount
from Core.PSMetric.FitnessQuality.SignificantlyHighAverage import MannWhitneyU
from Core.PSMetric.FitnessQuality.MeanFitness import MeanFitness
from Core.PSMetric.Simplicity import Simplicity
from Core.PSMetric.Linkage.ValueSpecificMutualInformation import FasterSolutionSpecificMutualInformation
from Core.PRef import PRef
from Core.FullSolution import FullSolution
from SimplifiedSystem.ps_search_utils import get_metric_function

def want_high(objective):
    """Wrap `objective` so the returned callable yields the negated score.

    The wrapper takes a single argument (the ps), evaluates `objective` on it
    and returns the result with its sign flipped.
    """
    return lambda ps: -objective(ps)

def want_low(objective):
    """Return `objective` itself, untouched (no wrapping, no negation)."""
    unchanged = objective
    return unchanged

def metric_string_to_func(metric_string: str, pRef: PRef, linkage_evaluator: FasterSolutionSpecificMutualInformation):
    """Turn a metric name into the objective function used by the search.

    Recognised names and what they resolve to:
      * "simplicity", "session_count", "mean_fitness", "atomicity"
        -> the metric's score wrapped with `want_high` (i.e. negated);
      * "fitness_p_value", "split_variance", "dependency"
        -> the metric's score passed through `want_low` (i.e. unchanged).

    Args:
        metric_string: name of the metric to build.
        pRef: reference population handed to the metrics that require one.
        linkage_evaluator: supplies `get_atomicity` / `get_dependence`.

    Returns:
        A callable taking a ps and returning its score.

    Raises:
        Exception: if `metric_string` is not one of the names listed above.
    """

    def scorer_bound_to_pRef(metric):
        # these metrics must be given the pRef before their score is usable
        metric.set_pRef(pRef)
        return metric.get_single_score

    # Builders are lazy so that only the requested metric is ever instantiated.
    builders = (
        ("simplicity", lambda: want_high(Simplicity().get_single_score)),
        ("session_count", lambda: want_high(scorer_bound_to_pRef(SampleCount()))),
        ("mean_fitness", lambda: want_high(scorer_bound_to_pRef(MeanFitness()))),
        ("fitness_p_value", lambda: want_low(scorer_bound_to_pRef(MannWhitneyU()))),
        ("split_variance", lambda: want_low(SplitVariance(selected_pRef=pRef).get_single_score)),
        ("atomicity", lambda: want_high(linkage_evaluator.get_atomicity)),
        ("dependency", lambda: want_low(linkage_evaluator.get_dependence)),
    )

    for known_name, build in builders:
        if metric_string == known_name:
            return build()

    raise Exception(f"The metric {metric_string} not recognised")
    

In [21]:
from Core.PS import PS
import numpy as np
from Core.PRef import PRef
from Core.EvaluatedPS import EvaluatedPS

def get_quantile_of_mean_fitness(mean_fitness: float, pRef: PRef) -> float:
    """Fraction of the entries of `pRef.fitness_array` that are <= `mean_fitness`.

    The count of such entries is divided by `pRef.sample_size`.
    """
    at_or_below = pRef.fitness_array <= mean_fitness
    how_many = np.sum(at_or_below)
    return how_many / pRef.sample_size
    
def ps_to_json(ps: EvaluatedPS, original_pRef: PRef):
    """Summarise an evaluated ps as a dict of plain, JSON-friendly values.

    The fitnesses returned by `original_pRef.fitnesses_of_observations(ps)` are
    averaged, and the position of that average within the fitnesses of
    `original_pRef` is obtained from `get_quantile_of_mean_fitness`.

    Returns:
        dict with keys "pattern", "metrics", "sample_count", "mean_fitness"
        and "quartile_of_mean_fitness".
    """
    matching_fitnesses = original_pRef.fitnesses_of_observations(ps)
    average = np.mean(matching_fitnesses)
    position = get_quantile_of_mean_fitness(average, original_pRef)

    summary = dict()
    summary["pattern"] = repr(PS(ps.values))
    # scores are converted one by one so that every entry is a plain float
    summary["metrics"] = [float(score) for score in ps.metric_scores]
    summary["sample_count"] = len(matching_fitnesses)
    summary["mean_fitness"] = float(average)
    summary["quartile_of_mean_fitness"] = float(position)
    return summary

In [24]:


from PolishSystem.read_data import get_pRef_from_vectors, get_vectors_file_name, get_fitness_file_name
from SimplifiedSystem.PSSearchSettings import get_default_search_settings, PSSearchSettings
from PolishSystem.polish_search_methods import search_local_polish_ps
import utils
from Core.PSMetric.Linkage.ValueSpecificMutualInformation import FasterSolutionSpecificMutualInformation
from SimplifiedSystem.ps_search_utils import apply_culling_method
from utils import announce

import json

# Folder passed to get_vectors_file_name / get_fitness_file_name when loading a pRef.
# NOTE(review): absolute, machine-specific path — needs changing before running elsewhere.
data_folder = r"C:\Users\gac8\PycharmProjects\PSSearch\data\retail_forecasting"


def gather_data_for_settings(vector_sizes: list[int],
                             clustering_methods: list[str],
                             fitness_columns: list[int],
                             search_metrics_combinations: list[str],
                             qty_solutions_to_explain: int,
                             search_settings: PSSearchSettings,
                             output_file_name: str):
    """Run the ps search for every combination of settings and save the results as JSON.

    For each (vector_size, clustering_method, fitness_column) a pRef is loaded from
    `data_folder` and `qty_solutions_to_explain` random evaluated solutions are drawn
    from it. For every solution a linkage evaluator is prepared once, and then each
    entry of `search_metrics_combinations` is searched with `search_local_polish_ps`.

    Args:
        vector_sizes: vector sizes used to select the data files.
        clustering_methods: clustering method names used to select the data files.
        fitness_columns: fitness column indices passed to `get_pRef_from_vectors`.
        search_metrics_combinations: each entry is a whitespace-separated list of
            metric names, resolved one by one through `metric_string_to_func`.
        qty_solutions_to_explain: number of random solutions drawn per pRef.
        search_settings: settings forwarded to `search_local_polish_ps`.
        output_file_name: path of the JSON file written at the end.

    Output:
        A JSON list with one record per (solution, metric combination).
        Successful runs have the keys "vector_size", "clustering_method",
        "fitness_column", "solution", "solution_fitness", "search_metrics",
        "resulting_pss" and "winning_ps".
        Failed runs have "error" (True) and "message" (repr of the exception),
        followed by "vector_size", "clustering_method", "fitness_column" and
        "search_metrics" so that the failing configuration can be identified.

    Note:
        An unrecognised metric name raises from `metric_string_to_func` outside the
        try block and therefore aborts the whole run before anything is written.
    """
    results = []
    for vector_size in vector_sizes:
        for clustering_method in clustering_methods:
            for fitness_column in fitness_columns:
                pRef: PRef = get_pRef_from_vectors(get_vectors_file_name(data_folder, vector_size, clustering_method),
                                         get_fitness_file_name(data_folder, vector_size, clustering_method), fitness_column)

                solutions_to_explain = [pRef.get_random_evaluated_fs() for _ in range(qty_solutions_to_explain)]

                for solution in solutions_to_explain:
                    # the linkage evaluator depends on the solution, so it is built once per solution
                    # and shared by all the metric combinations below
                    with utils.announce(f"Calculating linkage information for {vector_size = }"):
                        linkage_evaluator = FasterSolutionSpecificMutualInformation()
                        linkage_evaluator.set_pRef(pRef)
                        linkage_evaluator.set_solution(solution)
                    for search_metrics in search_metrics_combinations:
                        search_metrics_as_functions = [metric_string_to_func(one_metric, pRef, linkage_evaluator)
                                                        for one_metric in search_metrics.split()]

                        try:
                            with announce(f"Searching for {vector_size = }, "
                                          f"{clustering_method = }, "
                                          f"{fitness_column = }, "
                                          f"{search_metrics =}"):
                                pss = search_local_polish_ps(solution_to_explain=solution,
                                                             search_settings=search_settings,
                                                             objectives = search_metrics_as_functions)
                                # [0] raises IndexError when the culling returns nothing;
                                # that case is recorded as an error entry by the handler below
                                winning_ps = apply_culling_method(culling_method="elbow", pss=pss)[0]

                            # the record is built completely before being appended, so a failure
                            # while serialising a ps never leaves a half-filled entry behind
                            results.append({"vector_size": vector_size,
                                            "clustering_method": clustering_method,
                                            "fitness_column": fitness_column,
                                            "solution": repr(FullSolution(solution.values)),
                                            "solution_fitness": solution.fitness,
                                            "search_metrics": search_metrics,
                                            "resulting_pss": [ps_to_json(ps, pRef) for ps in pss],
                                            "winning_ps": ps_to_json(winning_ps, pRef)})
                        except Exception as e:
                            # best-effort: keep collecting data, but record which configuration
                            # failed (only plain values are used here so the handler cannot fail)
                            results.append({"error": True,
                                            "message": repr(e),
                                            "vector_size": vector_size,
                                            "clustering_method": clustering_method,
                                            "fitness_column": fitness_column,
                                            "search_metrics": search_metrics})


    with utils.open_and_make_directories(output_file_name) as file:
        json.dump(results, file, indent=4)

    print(f"Saved the results to {output_file_name}")
                    
                    

In [None]:

# Experiment grid handed to gather_data_for_settings by the loop at the end of this cell.
vector_sizes = [20, 50, 100]
clustering_methods = ["qmc"]
fitness_columns = [0]
qty_solutions_to_explain = 12
# Each entry is a space-separated list of metric names; gather_data_for_settings splits it
# and resolves every name through metric_string_to_func.
# The first entry is the baseline; the trailing comments say how each other entry differs from it.
search_metrics_combinations = ["simplicity mean_fitness atomicity", 
                               "session_count mean_fitness atomicity",  # simplicity   becomes session_count
                               "simplicity fitness_p_value atomicity",  # mean_fitness becomes fitness_p_value
                               "simplicity split_variance atomicity",   # mean_fitness becomes split_variance
                               "simplicity mean_fitness dependency",    # atomicity    becomes dependency
                               "mean_fitness atomicity dependency"      # removed simplicity, added dependency
                               ]

def get_file_destination_name(extra_info):
    """Build the path of a results file.

    The path is the fixed results prefix, followed by `extra_info`, followed by
    the value of `utils.get_formatted_timestamp()`, with a ".json" extension.
    """
    results_prefix = r"C:\Users\gac8\PycharmProjects\PSSearch\PolishSystem\DataCollection\results\pss"
    stem = results_prefix + extra_info
    return stem + utils.get_formatted_timestamp() + ".json"


# Start from the default search settings and override a few fields for this experiment.
search_settings = get_default_search_settings()
search_settings.metrics = None # just in case (the objectives are passed explicitly to search_local_polish_ps)
search_settings.verbose = True # NOTE(review): the earlier note here said "because it's annoying", which reads like a reason to switch it off — confirm True is intended
search_settings.culling_method = None # because I want to capture the entire pareto front

# One call per vector size, so each vector size gets its own timestamped output file.
for vector_size in vector_sizes:
    gather_data_for_settings(vector_sizes = [vector_size],
                             clustering_methods=clustering_methods,
                             fitness_columns=fitness_columns,
                             search_metrics_combinations=search_metrics_combinations,
                             search_settings=search_settings,
                             qty_solutions_to_explain=qty_solutions_to_explain,
                             output_file_name=get_file_destination_name(extra_info=f"vector_size_{vector_size}_")
                             )

n_gen  |  n_eval  | n_nds  |     cv_min    |     cv_avg    |      eps      |   indicator  
     1 |        4 |      1 |  0.000000E+00 |  0.000000E+00 |             - |             -
     2 |        4 |      1 |  0.000000E+00 |  0.000000E+00 |  0.000000E+00 |             f
The pss found in the search are 
	 [* * * * * * * * * * * * * * * * * * * *][-20.000, -0.876, 0.000, ]
...Finished (took 0.582144 seconds)
n_gen  |  n_eval  | n_nds  |     cv_min    |     cv_avg    |      eps      |   indicator  
     1 |        4 |      1 |  0.000000E+00 |  0.000000E+00 |             - |             -
     2 |        4 |      1 |  0.000000E+00 |  0.000000E+00 |  0.000000E+00 |             f
The pss found in the search are 
	 [* * * * * * * * * * * * * * * * * * * *][-52626.000, -0.876, 0.000, ]
...Finished (took 0.587852 seconds)
n_gen  |  n_eval  | n_nds  |     cv_min    |     cv_avg    |      eps      |   indicator  
     1 |        4 |      3 |  0.000000E+00 |  0.000000E+00 |             - |      