In [6]:
import os

# Switch the working directory to the PSSearch folder, presumably so that the
# project-local imports in the following cells (Core, SimplifiedSystem, ...) resolve.
# NOTE(review): this is an absolute, machine-specific path — confirm, and consider
# making it configurable so the notebook runs on other machines.
os.chdir('C:\\Users\\300522\\Documents\\GitHub\\ECXAI-2025-Stirling\\PSSearch')

In [7]:
from Core.PSMetric.FitnessQuality.SplitVariance import SplitVariance
from Core.PSMetric.SampleCount import SampleCount
from Core.PSMetric.FitnessQuality.SignificantlyHighAverage import MannWhitneyU
from Core.PSMetric.FitnessQuality.MeanFitness import MeanFitness
from Core.PSMetric.Simplicity import Simplicity
from Core.PSMetric.Linkage.ValueSpecificMutualInformation import FasterSolutionSpecificMutualInformation, ValueSpecificMutualInformation
from Core.PRef import PRef
from Core.FullSolution import FullSolution
from SimplifiedSystem.ps_search_utils import get_metric_function

def want_high(objective):
    """Wrap ``objective`` so that the returned callable yields its negated score.

    Args:
        objective: a one-argument callable returning a value that supports
            unary minus.

    Returns:
        A one-argument callable returning ``-objective(ps)``.
    """
    def negated_objective(ps):
        score = objective(ps)
        return -score

    return negated_objective

def want_low(objective):
    """Return ``objective`` itself, unmodified.

    Counterpart of ``want_high``: the score is used as-is, with no negation.
    """
    unchanged = objective
    return unchanged

def metric_string_to_func(metric_string: str, pRef: PRef, linkage_evaluator: ValueSpecificMutualInformation):
    """Translate a metric name into a one-argument objective function.

    Scores that should be high are wrapped with ``want_high`` (which negates
    them); scores that should be low go through ``want_low`` (returned as-is).
    NOTE(review): this presumably means the search minimises its objectives — confirm.

    Args:
        metric_string: one of "simplicity", "session_count", "mean_fitness",
            "fitness_p_value", "split_variance", "atomicity", "dependency".
        pRef: reference population handed to the metrics that need one.
        linkage_evaluator: provides ``get_atomicity`` and ``get_dependence``.

    Returns:
        A callable taking one argument and returning the (possibly negated) score.

    Raises:
        ValueError: if ``metric_string`` is not one of the names listed above.
    """

    def scorer_with_pRef(metric):
        # These metrics are given the reference population before their scorer is used.
        metric.set_pRef(pRef)
        return metric.get_single_score

    # Each entry is a zero-argument builder, so only the requested metric is constructed.
    builders = {
        "simplicity": lambda: want_high(Simplicity().get_single_score),
        "session_count": lambda: want_high(scorer_with_pRef(SampleCount())),
        "mean_fitness": lambda: want_high(scorer_with_pRef(MeanFitness())),
        "fitness_p_value": lambda: want_low(scorer_with_pRef(MannWhitneyU())),
        "split_variance": lambda: want_low(SplitVariance(selected_pRef=pRef).get_single_score),
        "atomicity": lambda: want_high(linkage_evaluator.get_atomicity),
        "dependency": lambda: want_low(linkage_evaluator.get_dependence),
    }

    builder = builders.get(metric_string)
    if builder is None:
        # ValueError is still an Exception, so existing broad handlers keep working.
        raise ValueError(f"The metric {metric_string} not recognised")
    return builder()
    

In [8]:
from Core.PS import PS
import numpy as np
from Core.PRef import PRef
from Core.EvaluatedPS import EvaluatedPS

def get_quantile_of_mean_fitness(mean_fitness: float, pRef: PRef) -> float:
    """Fraction of the entries of ``pRef.fitness_array`` that are <= ``mean_fitness``.

    The count of such entries is divided by ``pRef.sample_size``.
    """
    at_or_below = pRef.fitness_array <= mean_fitness
    how_many = np.sum(at_or_below)
    return how_many / pRef.sample_size
    
def ps_to_json(ps: EvaluatedPS, original_pRef: PRef):
    """Summarise an evaluated PS as a dictionary of JSON-friendly values.

    Returned keys:
        pattern: ``repr`` of a ``PS`` rebuilt from ``ps.values``.
        metrics: ``ps.metric_scores`` with every entry converted to ``float``.
        sample_count: number of fitnesses returned by
            ``original_pRef.fitnesses_of_observations(ps)``.
        mean_fitness: mean of those fitnesses.
        quartile_of_mean_fitness: despite the key name, this is the fraction
            returned by ``get_quantile_of_mean_fitness`` for that mean.
    """
    fitnesses_of_matches = original_pRef.fitnesses_of_observations(ps)
    average = np.mean(fitnesses_of_matches)
    position_in_distribution = get_quantile_of_mean_fitness(average, original_pRef)

    summary = {}
    summary["pattern"] = repr(PS(ps.values))
    # every score is converted to a plain float
    summary["metrics"] = [float(score) for score in ps.metric_scores]
    summary["sample_count"] = len(fitnesses_of_matches)
    summary["mean_fitness"] = float(average)
    summary["quartile_of_mean_fitness"] = float(position_in_distribution)
    return summary

In [15]:


from PolishSystem.read_data import get_pRef_from_vectors, get_vectors_file_name, get_fitness_file_name
from SimplifiedSystem.PSSearchSettings import get_default_search_settings, PSSearchSettings
from PolishSystem.polish_search_methods import search_local_polish_ps, search_global_polish_ps
import utils_
from Core.PSMetric.Linkage.ValueSpecificMutualInformation import ValueSpecificMutualInformation
from SimplifiedSystem.ps_search_utils import apply_culling_method
from utils_ import announce

import json

# Folder passed to get_vectors_file_name / get_fitness_file_name to locate the input files.
# NOTE(review): absolute, machine-specific path — confirm before running elsewhere.
data_folder = "C:\\Users\\300522\\Documents\\GitHub\\ECXAI-2025-Stirling\\LightGCN\\data\\amazon-book\\hierarchical_qmc"


def gather_data_for_settings(vector_sizes: list[int],
                             clustering_methods: list[str],
                             fitness_columns: list[int],
                             search_metrics_combinations: list[str],
                             qty_solutions_to_explain: int,
                             search_settings: PSSearchSettings,
                             output_file_name: str):
    """Run a PS search for every combination of settings and save all results as JSON.

    For each (vector size, clustering method, fitness column) a PRef is loaded
    from the module-level ``data_folder``, one random evaluated solution is
    drawn, and a linkage table is computed for it. Then, for every entry of
    ``search_metrics_combinations``, a global search is run and one record is
    collected.

    Args:
        vector_sizes: vector sizes used to pick the input files.
        clustering_methods: clustering method names used to pick the input files.
        fitness_columns: fitness columns passed to ``get_pRef_from_vectors``.
        search_metrics_combinations: each entry is a whitespace-separated list
            of metric names understood by ``metric_string_to_func``.
        qty_solutions_to_explain: currently unused; exactly one random solution
            is drawn per (vector size, clustering method, fitness column).
            Kept so existing callers keep working.
        search_settings: forwarded to ``search_global_polish_ps``.
        output_file_name: path of the JSON file receiving the list of records.

    Each record holds the settings, the drawn solution and its fitness, the
    metric combination, every returned PS ("resulting_pss") and the first PS
    returned by ``apply_culling_method`` ("winning_ps").
    """
    results = []
    for vector_size in vector_sizes:
        print(vector_size)
        for clustering_method in clustering_methods:
            # Report the method being processed (previously the whole list was printed).
            print(clustering_method)
            for fitness_column in fitness_columns:
                print(fitness_column)
                pRef: PRef = get_pRef_from_vectors(
                    get_vectors_file_name(data_folder, vector_size, clustering_method),
                    get_fitness_file_name(data_folder, vector_size, clustering_method),
                    fitness_column)

                # The linkage table is computed once here and reused by every
                # metric combination below.
                linkage_evaluator = ValueSpecificMutualInformation()
                linkage_evaluator.set_pRef(pRef)
                solution = pRef.get_random_evaluated_fs()
                linkage_evaluator.linkage_table = linkage_evaluator.get_linkage_table_for_solution(solution)

                print(pRef.sample_size)

                for search_metrics in search_metrics_combinations:
                    print(search_metrics)
                    search_metrics_as_functions = [metric_string_to_func(one_metric, pRef, linkage_evaluator)
                                                   for one_metric in search_metrics.split()]

                    with announce(f"Searching for {vector_size = }, "
                                  f"{clustering_method = }, "
                                  f"{fitness_column = }, "
                                  f"{search_metrics =}"):
                        pss = search_global_polish_ps(pRef.search_space,
                                                      search_settings=search_settings,
                                                      objectives=search_metrics_as_functions)

                        winning_ps = apply_culling_method(culling_method=None, pss=pss)[0]

                    results.append({"vector_size": vector_size,
                                    "clustering_method": clustering_method,
                                    "fitness_column": fitness_column,
                                    "solution": repr(FullSolution(solution.values)),
                                    "solution_fitness": solution.fitness,
                                    "search_metrics": search_metrics,
                                    "resulting_pss": [ps_to_json(ps, pRef) for ps in pss],
                                    "winning_ps": ps_to_json(winning_ps, pRef)})

    # Everything is written in one go, after all searches have finished.
    with utils_.open_and_make_directories(output_file_name) as file:
        json.dump(results, file, indent=4)

    print(f"Saved the results to {output_file_name}")
                    
                    

In [16]:
# Experiment configuration passed to gather_data_for_settings.
vector_sizes = [100]
clustering_methods = ["qmc"]
fitness_columns = [0]
qty_solutions_to_explain = 12
# Each entry is a whitespace-separated list of metric names; the first entry is the
# baseline and the trailing comments describe how the others differ from it.
search_metrics_combinations = ["simplicity mean_fitness atomicity", 
                               "session_count mean_fitness atomicity",  # simplicity   becomes session_count
                               "simplicity fitness_p_value atomicity",  # mean_fitness becomes fitness_p_value
                               "simplicity split_variance atomicity",   # mean_fitness becomes split_variance
                               "simplicity mean_fitness dependency",    # atomicity    becomes dependency
                               "mean_fitness atomicity dependency"      # removed simplicity, added dependency
                               ]

def get_file_destination_name(extra_info):
    """Build the path of a timestamped ``.json`` file inside the patterns folder.

    The file name is ``extra_info`` followed by the value of
    ``utils_.get_formatted_timestamp()`` and the ``.json`` extension.
    """
    patterns_folder = "C:\\Users\\300522\\Documents\\GitHub\\ECXAI-2025-Stirling\\LightGCN\\data\\amazon-book\\patterns\\"
    with_prefix = patterns_folder + extra_info
    with_timestamp = with_prefix + utils_.get_formatted_timestamp()
    return with_timestamp + ".json"


# Start from the default search settings, then override a few fields for this experiment.
search_settings = get_default_search_settings()
search_settings.metrics = None # just in case
search_settings.verbose = True # because it's annoying
# NOTE(review): the comment above reads as if verbose output is unwanted, yet it is
# switched on here — confirm the intended value.
search_settings.culling_method = None # because I want to capture the entire pareto front

# One call per vector size, so each size is written to its own timestamped output file.
for vector_size in vector_sizes:
    gather_data_for_settings(vector_sizes = [vector_size],
                             clustering_methods=clustering_methods,
                             fitness_columns=fitness_columns,
                             search_metrics_combinations=search_metrics_combinations,
                             search_settings=search_settings,
                             qty_solutions_to_explain=qty_solutions_to_explain,
                             output_file_name=get_file_destination_name(extra_info=f"vector_size_{vector_size}_")
                             )

100
['qmc']
0
MI data gathering progress: 0.00%
MI data gathering progress: 0.99%
MI data gathering progress: 1.97%
MI data gathering progress: 2.96%
MI data gathering progress: 3.95%
MI data gathering progress: 4.93%
MI data gathering progress: 5.92%
MI data gathering progress: 6.91%
MI data gathering progress: 7.89%
MI data gathering progress: 8.88%
MI data gathering progress: 9.87%
MI data gathering progress: 10.85%
MI data gathering progress: 11.84%
MI data gathering progress: 12.82%
MI data gathering progress: 13.81%
MI data gathering progress: 14.80%
MI data gathering progress: 15.78%
MI data gathering progress: 16.77%
MI data gathering progress: 17.76%
MI data gathering progress: 18.74%
MI data gathering progress: 19.73%
MI data gathering progress: 20.72%
MI data gathering progress: 21.70%
MI data gathering progress: 22.69%
MI data gathering progress: 23.68%
MI data gathering progress: 24.66%
MI data gathering progress: 25.65%
MI data gathering progress: 26.64%
MI data gathering

In [17]:
# Stand-alone run of a single configuration. It repeats the steps performed inside
# gather_data_for_settings for one metric combination, but keeps the record in
# `result` instead of writing it to a file.
# Relies on `data_folder` and `search_settings` defined in earlier cells.
vector_size = 100
clustering_method = 'qmc'
fitness_column = 0
pRef: PRef = get_pRef_from_vectors(get_vectors_file_name(data_folder, vector_size, clustering_method), 
                     get_fitness_file_name(data_folder, vector_size, clustering_method), fitness_column)


# Compute the linkage table for one randomly drawn solution.
linkage_evaluator = ValueSpecificMutualInformation()
linkage_evaluator.set_pRef(pRef)
solution = pRef.get_random_evaluated_fs()
linkage_evaluator.linkage_table = linkage_evaluator.get_linkage_table_for_solution(solution)

print(pRef.sample_size)

# Whitespace-separated metric names, each turned into an objective function.
search_metrics = "session_count mean_fitness atomicity"
print(search_metrics)
search_metrics_as_functions = [metric_string_to_func(one_metric, pRef, linkage_evaluator) 
                                for one_metric in search_metrics.split()]
    
with announce(f"Searching for {vector_size = }, "
          f"{clustering_method = }, "
          f"{fitness_column = }, "
          f"{search_metrics =}"):
    pss = search_global_polish_ps(pRef.search_space,
                                 search_settings=search_settings,
                                 objectives = search_metrics_as_functions)
    
    # With culling_method=None the first returned PS is taken as the "winner".
    winning_ps = apply_culling_method(culling_method=None, pss=pss)[0]
    
# Same record layout as the entries collected by gather_data_for_settings.
result =  {"vector_size": vector_size,
                    "clustering_method": clustering_method,
                    "fitness_column": fitness_column,
                    "solution": repr(FullSolution(solution.values)),
                    "solution_fitness": solution.fitness,
                    "search_metrics": search_metrics,
                    "resulting_pss": [ps_to_json(ps, pRef) for ps in pss],
                    "winning_ps": ps_to_json(winning_ps, pRef)}

MI data gathering progress: 0.00%
MI data gathering progress: 1.00%
MI data gathering progress: 1.99%
MI data gathering progress: 2.99%
MI data gathering progress: 3.99%
MI data gathering progress: 4.99%
MI data gathering progress: 5.98%
MI data gathering progress: 6.98%
MI data gathering progress: 7.98%
MI data gathering progress: 8.98%
MI data gathering progress: 9.97%
MI data gathering progress: 10.97%
MI data gathering progress: 11.97%
MI data gathering progress: 12.97%
MI data gathering progress: 13.96%
MI data gathering progress: 14.96%
MI data gathering progress: 15.96%
MI data gathering progress: 16.96%
MI data gathering progress: 17.95%
MI data gathering progress: 18.95%
MI data gathering progress: 19.95%
MI data gathering progress: 20.95%
MI data gathering progress: 21.94%
MI data gathering progress: 22.94%
MI data gathering progress: 23.94%
MI data gathering progress: 24.94%
MI data gathering progress: 25.93%
MI data gathering progress: 26.93%
MI data gathering progress: 27.