In [1]:
import os.path

from PolishSystem.read_data import get_pRef_from_vectors, get_vectors_file_name, get_fitness_file_name

# this section will generate the search objectives
size = 20
method = "qmc"
fitness_column_to_use = 0
data_folder = r"C:\Users\gac8\PycharmProjects\PSSearch\data\retail_forecasting"

sessions = get_pRef_from_vectors(get_vectors_file_name(data_folder, size, method), 
                                     get_fitness_file_name(data_folder, size, method), fitness_column_to_use)


print(sessions)

PRef with 52626 samples, mean = 0.88


In [2]:


from Core.PS import PS
from SimplifiedSystem.ps_search_utils import get_metric_function
from Core.PRef import PRef
from PolishSystem.PolishPSDecisionTree import PolishPSDecisionTree
from SimplifiedSystem.PSSearchSettings import get_default_search_settings

search_settings = get_default_search_settings()
search_settings.verbose = True
search_settings.ps_search_budget=1000   # the recommended quantity is 5000 (this is per split)


def get_objectives_for_partition(pRef: PRef):
    
    # get_metric_function("sample_count", sessions) to count how many sessions are covered by a pattern
    
    def reject_if_overly_specific(ps: PS):
        match_count = len(pRef.fitnesses_of_observations(ps))
        if match_count < 1:
            return 100
        else:
            return 0
        
    return [get_metric_function("variance", pRef)] # PS-SW
    #return [get_metric_function("consistency", pRef), reject_if_overly_specific]


tree = PolishPSDecisionTree(maximum_depth=3, 
                            search_settings=search_settings,
                            get_objectives_for_partition=get_objectives_for_partition)

tree.train_from_pRef(sessions)

Splitting a pRef of size 52626
n_gen  |  n_eval  |     cv_min    |     cv_avg    |     f_avg     |     f_min    
     1 |       61 |  0.000000E+00 |  0.000000E+00 |  0.0341920000 |  0.0340722684
     2 |      161 |  0.000000E+00 |  0.000000E+00 |  0.0341848555 |  0.0340722684
     3 |      261 |  0.000000E+00 |  0.000000E+00 |  0.0341753352 |  0.0340722684
     4 |      361 |  0.000000E+00 |  0.000000E+00 |  0.0341616245 |  0.0340722684
     5 |      461 |  0.000000E+00 |  0.000000E+00 |  0.0341473358 |  0.0340722684
     6 |      561 |  0.000000E+00 |  0.000000E+00 |  0.0341365455 |  0.0340603232
     7 |      661 |  0.000000E+00 |  0.000000E+00 |  0.0341270958 |  0.0340603232
     8 |      760 |  0.000000E+00 |  0.000000E+00 |  0.0341220027 |  0.0340603232
     9 |      860 |  0.000000E+00 |  0.000000E+00 |  0.0341202216 |  0.0340603232
    10 |      960 |  0.000000E+00 |  0.000000E+00 |  0.0341195243 |  0.0340603232
    11 |     1060 |  0.000000E+00 |  0.000000E+00 |  0.0341193154 |

In [3]:
tree.print_ASCII()

Root
(Branch) 	
	[* * 1 * * * * * * * * * * 1 * * * * * *][0.034, ]	
	stats:n->52626, average->0.8756650999718619, variance->0.034203744445680306, sd->0.18494254363363857, mse->0.034203744445680306, mae->0.13565800144060147, min->0.0007308570784516633, max->1.0
├── Matching
│   (Branch) 	
│   	[* * 1 * * 1 * * * * 1 * * * * * 1 * * *][0.045, ]	
│   	stats:n->6788, average->0.844544448861014, variance->0.044702861752257364, sd->0.21143051282219735, mse->0.044702861752257364, mae->0.1629103521049833, min->0.0035843465011566877, max->1.0
│   ├── Matching
│   │   (Branch) 	
│   │   	[* * * * * * 1 * * * * * * * * 1 * * * *][0.055, ]	
│   │   	stats:n->1464, average->0.8197891021849043, variance->0.05517101354954125, sd->0.23488510712589092, mse->0.05517101354954125, mae->0.18620126771611445, min->0.004234409425407648, max->1.0
│   │   ├── Matching
│   │   │   (Leaf) stats:n->578, average->0.8450122147429554, variance->0.04946068207282306, sd->0.22239757658936632, mse->0.04946068207282306, 

In [7]:
import numpy as np
from DecisionTree.PSDecisionTree import PSDecisionTreeNode


def repr_95_confidence_interval(node: PSDecisionTreeNode):
    fitnesses = node.fitnesses
    
    if len(fitnesses) < 2:
        return f"{fitnesses}"
    mean = np.average(fitnesses)
    a, b = np.percentile(fitnesses, [2.5, 97.5])

    return f"(Branch), n = {len(fitnesses)}, {mean = :.2f},  confidence interval = [{a:.3f}, {b:.3f}]"

tree.print_ASCII(custom_partition_repr=repr_95_confidence_interval)
    

In [2]:
import os
filename = os.path.join(r"C:\Users\gac8\PycharmProjects\PSSearch\initial_testing\decision_tree_attempts", "decision_tree.json")

In [None]:
tree.to_file(filename)

In [11]:
from DecisionTree.PSDecisionTree import PSDecisionTree

# NOTE: since PolishPSDecisionTree is a subclass of PSDecision tree, we can use the original class to read it
# At the moment I don't have a custom way of importing the polish trees...

other_tree = PSDecisionTree.from_file(filename)

# note that right after the tree was built, each node had the partition fitnesses, but these are not stored in the json (that is a lot of data!)
# to put them back into the tree, call this vvvv
other_tree.post_hoc_set_fitnesses(sessions)

other_tree.print_ASCII(custom_partition_repr=repr_95_confidence_interval)

Root
(Branch) 	
	[* * 1 * * * * * * * * * * 1 * * * * * *]	
	(Branch), n = 52626, mean = 0.88,  confidence interval = [0.318, 1.000]
├── Matching
│   (Branch) 	
│   	[* * 1 * * 1 * * * * 1 * * * * * 1 * * *]	
│   	(Branch), n = 6788, mean = 0.84,  confidence interval = [0.243, 1.000]
│   ├── Matching
│   │   (Branch) 	
│   │   	[* * * * * * 1 * * * * * * * * 1 * * * *]	
│   │   	(Branch), n = 1464, mean = 0.82,  confidence interval = [0.167, 1.000]
│   │   ├── Matching
│   │   │   (Leaf) (Branch), n = 578, mean = 0.85,  confidence interval = [0.186, 1.000]
│   │   └── NOT matching
│   │       (Leaf) (Branch), n = 886, mean = 0.80,  confidence interval = [0.165, 1.000]
│   └── NOT matching
│       (Branch) 	
│       	[* 1 1 * 1 1 1 * * * * 1 1 * * 1 1 * * *]	
│       	(Branch), n = 5324, mean = 0.85,  confidence interval = [0.267, 1.000]
│       ├── Matching
│       │   (Leaf) (Branch), n = 2, mean = 0.31,  confidence interval = [0.285, 0.331]
│       └── NOT matching
│           (Leaf)