In [43]:
import heapq
import math

import utils
from Core.PRef import PRef
import numpy as np


class SPRef:
    sessions: list[set[int]]
    fitnesses: np.ndarray
    
    def __init__(self,
                 sessions: list[set[int]],
                 fitnesses: np.ndarray):
        self.sessions = sessions
        self.fitnesses = fitnesses
        
        
    @classmethod
    def from_pRef(cls, pRef: PRef):
        def row_to_set(row):
            return {index for index, value in enumerate(row) if value}
        return cls(list(map(row_to_set, pRef.full_solution_matrix)), pRef.fitness_array)

    
    
    def get_matching_fitnesses(self, ps: set[int], threshold: float):
        most_leftover = math.floor(len(ps) * (1-threshold))
        fitnesses = []
        for session, fitness in zip(self.sessions, self.fitnesses):
            if len(ps-session) <= most_leftover:
                fitnesses.append(fitness)
        return np.array(fitnesses)
    
    def get_matching_fitnesses_and_not_matching(self, ps: set[int], threshold: float):
        if len(ps) == 0 and threshold > 0:
            return np.array([]), self.fitnesses # shortcut
        
        most_leftover = math.floor(len(ps) * (1-threshold))
        matches = []
        non_matches = []
        
        for session, fitness in zip(self.sessions, self.fitnesses):
            if len(ps-session) < most_leftover:
                matches.append(fitness)
            else:
                non_matches.append(fitness)
        return np.array(matches), np.array(non_matches)
    
    
    
class OptimisedSPref(SPRef):
    which_sessions: list[set[int]]
    
    def __init__(self, sessions: list[set[int]],
                 fitnesses: np.ndarray):
        super().__init__(sessions, fitnesses)
        
        max_product = max(product  for session in self.sessions
                                 for product in session)
        self.which_sessions = [{index for index, session in enumerate(sessions)
                                if product in session}
                               for product in range(max_product+1)]
        
    
    def get_matching_fitnesses_and_not_matching(self, ps: set[int], threshold: float):
        if len(ps) == 0 and threshold > 0:
            return self.fitnesses, np.array([]) # shortcut
        
        most_leftover = math.floor(len(ps) * (1-threshold))
        index_matches = set()
        possible_session_indices = set.union(*(self.which_sessions[product] for product in ps))
        index_non_matches = set(range(len(self.sessions))) - possible_session_indices
        
        for index in possible_session_indices:
            session = self.sessions[index]
            if len(ps-session) < most_leftover:
                index_matches.add(index)
            else:
                index_non_matches.add(index)
                
        return np.array(self.fitnesses[list(index_matches)]), np.array(self.fitnesses[list(index_non_matches)])
        
    
    def get_matching_fitnesses(self, ps: set[int], threshold: float):
        return self.get_matching_fitnesses_and_not_matching(ps, threshold)[0]
    
    


In [44]:
from Gian_experimental.NSGAIICustom.NSGAIICustom import NCSolution
from scipy.stats import mannwhitneyu
import itertools


def complexity(ps):
    return len(ps)

def make_mean_fitness(sPRef: SPRef, threshold: float = 0.5):
    global_average = np.average(sPRef.fitnesses)
    terrible = np.min(sPRef.fitnesses)
    def mean_fitness(ps):
        if len(ps) == 0:
            return global_average  # shortcut
        match_fitnesses = sPRef.get_matching_fitnesses(ps, threshold=threshold)
        if len(match_fitnesses) > 0:
            return np.average(match_fitnesses)
        else:
            return terrible
        
    return mean_fitness


def make_similarity_atomicity(similarities):
    def atomicity(ps):
        if len(ps) < 2:
            return -1000
        else:
            linkages = [similarities[a, b] for a, b in itertools.combinations(ps, r=2)]
            return np.average(linkages)
    return atomicity

def make_consistency_metric(sPRef: SPRef, 
                            threshold: float = 0.5,
                            must_match_at_least: int = 3):
    def consistency(ps):
        matches, non_matches = sPRef.get_matching_fitnesses_and_not_matching(ps, threshold=threshold)
        if len(matches) < must_match_at_least:
            return 1
        else:
            test = mannwhitneyu(matches, non_matches, alternative="greater", method="asymptotic")
        return test.pvalue
    
    return consistency


def make_consistency_metric_with_sample_size(sPRef: SPRef, 
                            threshold: float = 0.5,
                            must_match_at_least: int = 3):
    def consistency_and_sample(ps):
        matches, non_matches = sPRef.get_matching_fitnesses_and_not_matching(ps, threshold=threshold)
        if min(len(matches), len(non_matches)) < must_match_at_least:
            return 1, len(matches)
        else:
            print("AAA", len(matches), len(non_matches))
            test = mannwhitneyu(matches, non_matches, alternative="greater", method="asymptotic")
        return test.pvalue, len(matches)
    
    return consistency_and_sample


def make_min_metric_with_sample_size(sPRef: SPRef, 
                                    threshold: float = 0.5):
    def min_and_sample(ps):
        matches, non_matches = sPRef.get_matching_fitnesses_and_not_matching(ps, threshold=threshold)
        if  min(len(matches), len(non_matches)) < 1:
            return (-1000, len(matches))
        else:
            lowest_fitness = np.min(matches)
        return lowest_fitness, len(matches)
    
    return min_and_sample


class HashedSolution:
    solution: NCSolution
    
    def __init__(self,
                 sol):
        self.solution = sol
    
    def __hash__(self):
        return sum(self.solution) % 7787
    
    def __eq__(self, other):
        return self.solution == other.solution


def make_metrics_cached(metrics):
    cached_values = dict()
    def get_values(ps):
        wrapped = HashedSolution(ps)
        if wrapped in cached_values:
            return cached_values[wrapped]
        else:

            value = metrics(ps)
            cached_values[wrapped] = value
            return value
    return get_values
        
    

In [45]:
from PolishSystem.OperatorsBasedOnSimilarities.similarities_utils import gian_get_similarities
from PolishSystem.read_data import get_pRef_from_vectors
import os

dir_250 = r"C:\Users\gac8\PycharmProjects\PSSearch\data\retail_forecasting\250"

def in_250(path):
    return os.path.join(dir_250, path)

pRef = get_pRef_from_vectors(name_of_vectors_file=in_250("many_hot_vectors_250_qmc.csv"),
                             name_of_fitness_file=in_250("fitness_250_qmc.csv"),
                             column_in_fitness_file=2)

train_pRef, test_pRef = pRef.train_test_split(test_size=0.2)
cluster_info_file_name = in_250(f"cluster_info_250_qmc.pkl")
similarities = gian_get_similarities(cluster_info_file_name)

In [46]:
train_sPRef = SPRef.from_pRef(train_pRef)
test_sPRef = SPRef.from_pRef(test_pRef)

fast_train_sPRef = OptimisedSPref(train_sPRef.sessions, fitnesses=train_sPRef.fitnesses)

train_mean_fitness = make_mean_fitness(fast_train_sPRef, threshold=0.5)
train_atomicity = make_similarity_atomicity(similarities)
train_consistency = make_consistency_metric(fast_train_sPRef, threshold=0.5, must_match_at_least=3)
train_consistency_and_sample = make_consistency_metric_with_sample_size(fast_train_sPRef, threshold=0.5, must_match_at_least=3)
train_min_and_sample = make_min_metric_with_sample_size(fast_train_sPRef, threshold=0.5)

test_consistency = make_consistency_metric(test_sPRef, threshold=0.5, must_match_at_least=100)


In [47]:


from PolishSystem.OperatorsBasedOnSimilarities.similarities_utils import get_transition_matrix
from typing import Iterable
from Gian_experimental.NSGAIICustom.NSGAIICustom import NSGAIICustom, NCSolution, NCSamplerSimple, NCMutationSimple, \
    NCCrossoverSimple, EvaluatedNCSolution, NCSamplerFromPRef, NCCrossoverTransition
import heapq


def get_metrics(ps: NCSolution) -> tuple[float]:
    min_fit, sample_size = train_min_and_sample(ps)
    atomicity = train_atomicity(ps)
    return (-sample_size, -min_fit, -atomicity)

n = 250

def keep_ones_with_most_samples(population: Iterable[EvaluatedNCSolution], quantity_required: int):
    return heapq.nsmallest(iterable=population, key=lambda x: x.fitnesses[0], n=quantity_required)


#custom_sampling = NCSamplerFromPRef(test_pRef)
transition_matrix = get_transition_matrix(similarities)
custom_crossover = NCCrossoverTransition(similarities)

traditional_sampling = NCSamplerSimple.with_average_quantity(3, genome_size=n)
traditional_mutation = NCMutationSimple(n)

traditional_crossover = NCCrossoverSimple(swap_probability=1 / n)

algorithm = NSGAIICustom(sampling=traditional_sampling,
                         mutation=traditional_mutation,
                         crossover=traditional_crossover,
                         probability_of_crossover=0.5,
                         eval_budget=5000,
                         pop_size=100,
                         tournament_size=3,
                         mo_fitness_function=make_metrics_cached(get_metrics),
                         unique=True,
                         verbose=True,
                         culler=keep_ones_with_most_samples
                         )

In [48]:
if False:
    results = algorithm.run()
    
    print(results)

In [49]:
import utils
test_consistency = make_consistency_metric(test_sPRef, threshold=0.5, must_match_at_least=10)
p_values_on_train = [train_consistency(solution.solution) for solution in results]
p_values_on_test = [test_consistency(ps.solution) for ps in results]


def ss(ps, sPRef):
    return len(sPRef.get_matching_fitnesses(ps, threshold = 0.5))

ss_on_train = [ss(ps.solution, train_sPRef) for ps in results]
ss_on_test = [ss(ps.solution, test_sPRef) for ps in results]

utils.simple_scatterplot(xs=p_values_on_test, ys=p_values_on_train, x_label="p-value test", y_label="p-value train")
utils.simple_scatterplot(xs=ss_on_test, ys=ss_on_train, x_label="sample test", y_label="sample train")

for item in results:
    ps = item.solution

def count_frequencies(iterable):
    
    iterable_list = list(iterable)
    counts = {item: iterable_list.count(item)
              for item in set(iterable)}
    
    for key, count in counts.items():
        print(key, count)


combinations = [(on_train < 0.05, on_test < 0.05) for on_train, on_test in zip(p_values_on_train, p_values_on_test)]

count_frequencies(combinations)

NameError: name 'results' is not defined

In [50]:
ss_on_train = [ss(ps.solution, train_sPRef) for ps in results]
ss_on_fast_train = [ss(ps.solution, fast_train_sPRef) for ps in results]

utils.simple_scatterplot(xs = ss_on_train, ys = ss_on_fast_train, x_label="recalculated", y_label="in thing")

NameError: name 'results' is not defined

In [51]:
for _ in range(12):
    random_ps = traditional_sampling.sample()
    ss_on_slow = ss(random_ps, train_sPRef)
    ss_on_fast = ss(random_ps, fast_train_sPRef)
    
    
    
    if ss_on_slow != ss_on_fast:
        print(ss_on_slow, ss_on_fast)

NameError: name 'ss' is not defined

In [52]:
ps = {1, 2, 3}
threshold = 0.5

def get_ss_old(ps, sPref: SPRef):
    if len(ps) == 0 and threshold > 0:
        return sPref.fitnesses, np.array([]) # shortcut
        
    most_leftover = math.floor(len(ps) * (1-threshold))
    matches = []
    non_matches = []
    
    for session, fitness in zip(sPref.sessions, sPref.fitnesses):
        if len(ps-session) < most_leftover:
            matches.append(fitness)
        else:
            non_matches.append(fitness)
    return np.array(matches), np.array(non_matches)


def get_ss_new(ps, sPRef: OptimisedSPref):
    if len(ps) == 0 and threshold > 0:
        return sPRef.fitnesses, np.array([]) # shortcut
    
    most_leftover = math.floor(len(ps) * (1-threshold))
    index_matches = set()
    possible_session_indices = set.union(*(sPRef.which_sessions[product] for product in ps))
    index_non_matches = set(range(len(sPRef.sessions))) - possible_session_indices
    
    for index in possible_session_indices:
        session = sPRef.sessions[index]
        if len(ps-session) < most_leftover:
            index_matches.add(index)
        else:
            index_non_matches.add(index)
            
    return np.array(sPRef.fitnesses[list(index_matches)]), np.array(sPRef.fitnesses[list(index_matches)])

In [None]:

for _ in range(1000):
    ps = traditional_sampling.sample()
    
    from_old = (len(get_ss_old(ps, train_sPRef)[0]))
    from_new = train_consistency_and_sample(ps)[1]
    if from_old != from_new:
        print("---")
        print(ps)
        print(from_old)
        print(from_new)

AAA 83 42018
AAA 60 42041
AAA 77 42024
AAA 14 42087
AAA 12 42089
AAA 13 42088
AAA 158 41943
AAA 13 42088
AAA 19 42082
AAA 53 42048
AAA 33 42068
AAA 62 42039
AAA 7 42094
AAA 17 42084
AAA 39 42062
AAA 53 42048
AAA 23 42078
AAA 37 42064
AAA 24 42077
AAA 6 42095
AAA 143 41958
AAA 9 42092
AAA 5 42096
AAA 20 42081
AAA 11 42090
AAA 164 41937
AAA 55 42046
AAA 40 42061
AAA 62 42039
AAA 33 42068
AAA 11 42090
AAA 5 42096
AAA 14 42087
AAA 11 42090
AAA 28 42073
AAA 65 42036
AAA 8 42093
AAA 157 41944
AAA 83 42018
AAA 35 42066
AAA 22 42079
AAA 103 41998
AAA 39 42062
AAA 9 42092
AAA 28 42073
AAA 62 42039
AAA 144 41957
AAA 83 42018
AAA 15 42086
AAA 60 42041
AAA 18 42083
AAA 99 42002
AAA 62 42039
AAA 16 42085
AAA 4 42097
AAA 81 42020
AAA 13 42088
AAA 41 42060
AAA 40 42061
AAA 203 41898
AAA 88 42013
AAA 20 42081
AAA 23 42078
AAA 123 41978
AAA 11 42090
AAA 17 42084
AAA 45 42056
AAA 141 41960
AAA 12 42089
AAA 77 42024
AAA 75 42026
AAA 89 42012
AAA 18 42083
AAA 177 41924
AAA 99 42002
AAA 15 42086
AAA 104 41