In [1]:
import datetime
import time

import pandas as pd

from Code.qualityMeasure import *
from Code.dataImporter import getData

from Code.BeamSearch.BeamSearch import *
from Code.BUSPaS.BUSPaS import *
from Code.MCTS4DM.MCTS4DM import *

from analysisHelperFunctions import *

In [2]:
#TODO NOW: relocate where Swap Randomization results are saved (not in folder of the search strategy)

#TODO This version: Diversity Checks
#TODO This version: Add experiments for multiple countries
#TODO This version: Hyper-parameter optimization

#TODO Next version: Numerical attributes instead of pre-categorization
#TODO Next version: BUSPaS Anytime
#TODO Next version: Beam Search Anytime?

## Load data

In [3]:
#TODO Before run check these:
# test_with_small_data: when True, only the first `size` rows (and the matching
#   corner of the distance matrix) are kept further down in this cell.
# create_new_results_dataframe: when True, the configuration cell writes a fresh,
#   empty results table to ./results/results.pkl.
test_with_small_data = True
create_new_results_dataframe = False


#Pre-processed data
df_start, cat, num, features = getData(
    'Stock5YSmall',
    countries='large_economies',
    attributes='expertBased',
)

# Work on a re-indexed copy so that df_start itself is left untouched.
df_work = df_start.copy().reset_index(drop=True)
df_work['target'] = df_work['target'].apply(percent_change_norm)


# Precomputed distance matrix, read from a machine-specific absolute path.
# NOTE(review): this path only resolves on the original author's machine; consider
# making it configurable. `np` is not imported explicitly in the import cell and is
# presumably provided by one of the star imports -- verify.
euclidean_slope_distance_matrix = np.load(
    "C:/Users/bengelen004/Documents/TUe/BUSPas/New Submission WIP/Archive/20230602 Code/euclidean_slope_distance_matrix.npy"
)


if test_with_small_data:
    size = 1000
    # Trim the data and the matrix together.
    # presumably row/column i of the matrix belongs to row i of df_work -- TODO confirm
    df_work = df_work.iloc[:size]
    euclidean_slope_distance_matrix = euclidean_slope_distance_matrix[:size, :size]

  descriptive_df = descriptive_df.replace("'", '', regex=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_merge[col].replace(0, np.nan, inplace=True)


## Search Strategy Comparison

In [13]:
# Search Strategy parameters: General
d = 3                            # passed on as d / depth / max_desc_length
q = 20
bins = 5                         # passed on as n_chunks / nr_chunks
min_cov = 0.01
min_coverage_abs = 2
correct_for_size = no_size_corr
comparison_type = 'complement'

# Search Strategy parameters: Beam Search
w = 10
min_error = 0.01
ensure_diversity = True

# Search Strategy parameters: BUSPaS
number_of_row_pairs = 500
z = 2

# Search Strategy parameters: MCTS4DM
# max_time_minutes is only recorded in `config`; the MCTS4DM call itself takes its
# time budget from the measured BUSPaS duration.
max_time_minutes = None #is connected to BUSPaS duration
ucb_type = 'SP-MCTS'
ucb_params = {}
roll_out_strategy = 'direct-freq'
reward_policy = 'max_path'
reward_policy_k = 3
memory_policy = 'all'
memory_policy_k_value = 3
update_policy = 'max_update'
update_policy_k = 3

# Snapshot of every setting above; stored next to each result row when results are saved.
config = dict(
    # General
    d=d,
    q=q,
    bins=bins,
    min_cov=min_cov,
    min_coverage_abs=min_coverage_abs,
    correct_for_size=correct_for_size,
    comparison_type=comparison_type,
    # Beam Search
    w=w,
    min_error=min_error,
    ensure_diversity=ensure_diversity,
    # BUSPaS
    number_of_row_pairs=number_of_row_pairs,
    z=z,
    # MCTS4DM (max_time_minutes is connected to BUSPaS duration)
    max_time_minutes=max_time_minutes,
    ucb_type=ucb_type,
    ucb_params=ucb_params,
    roll_out_strategy=roll_out_strategy,
    reward_policy=reward_policy,
    reward_policy_k=reward_policy_k,
    memory_policy=memory_policy,
    memory_policy_k_value=memory_policy_k_value,
    update_policy=update_policy,
    update_policy_k=update_policy_k,
)

#TODO Discuss with Wouter if this matters
# NOTE(review): df_work is overwritten here, so re-running this cell applies the
# conversion to already-converted data.
df_work = convert_columns_num_to_cat(df_work, nr_chunks=bins)

# Optionally start over with an empty results table on disk.
if create_new_results_dataframe:
    results_columns = [
        'Max Quality', 'Avg Quality', 'Max Coverage', 'Avg Coverage',
        'Subgroups Checked', 'Time', 'Ratio n.q.',
        'configuration', 'result', 'Object',
    ]
    df_results = pd.DataFrame(columns=results_columns)
    df_results.to_pickle('./results/results.pkl')

In [14]:
######### ----- ----- ##########
# Beam Search
print('Start - Beam Search at ', time.strftime('%Y-%m-%d %H:%M:%S'))

# Keyword settings for the EMM run, gathered in one place; values come from the
# configuration cell.
beam_search_settings = dict(
    w=w,
    d=d,
    q=q,
    n_chunks=bins,
    target='target',
    quality_measure=cluster_based_quality_measure,
    comparison_type=comparison_type,
    correct_for_size=correct_for_size,
    distance_matrix=euclidean_slope_distance_matrix,
    catch_all_description=[],
    allow_exclusion=False,
    ensure_diversity=ensure_diversity,
    min_coverage=min_cov,
    min_coverage_abs=min_coverage_abs,
    min_error=min_error,
    report_progress=False,
    show_result=False,
)

beam_search = BeamSearch(df_work)
bs = beam_search.EMM(features, **beam_search_settings)

# The result is read from the searcher object rather than from EMM's return value.
bs_result = beam_search.result

print('End - Beam Search at ', time.strftime('%Y-%m-%d %H:%M:%S'))
######### ----- ----- ##########
# BUSPaS
print('Start - BUSPaS at ', time.strftime('%Y-%m-%d %H:%M:%S'))

# Keyword settings for the BUSPaS searcher; values come from the configuration cell.
buspas_settings = dict(
    number_of_row_pairs=number_of_row_pairs,
    depth=d,
    q=q,
    z=z,
    nr_chunks=bins,
    min_coverage_perc=min_cov,
    min_coverage_abs=min_coverage_abs,
)

bottom_up = BUSPaS(df_work, euclidean_slope_distance_matrix, **buspas_settings)
# bottom_up.num_to_cat_attribute_converter() #not needed for this
bottom_up.find_quality(
    quality_measure=cluster_based_quality_measure,
    comparison_type=comparison_type,
    size_corr=correct_for_size,
)

print('End - BUSPaS at ', time.strftime('%Y-%m-%d %H:%M:%S'))
######### ----- ----- ##########
# MCTS4DM
print('Start - MCTS4DM at ', time.strftime('%Y-%m-%d %H:%M:%S'))


# The time budget is the measured BUSPaS duration divided by 60 (passed as `minutes`),
# so this cell must run after the BUSPaS section; the iteration count is left
# unbounded. `max_time_minutes` from the configuration cell is not used here.
monte_carlo = MCTS4DM(df_work, 'target',
    q = q,
    root_description = [],
    n_chunks = bins,
    allow_exclusion = False,
    minutes = bottom_up.duration/60,
    max_nr_iterations = float('inf'),
    ucb_type=ucb_type, #or UCB1, UCB1-Tuned, SP-MCTS, UCT, DFS-UCT
    ucb_params=ucb_params,
    quality_params={},
    matrix = euclidean_slope_distance_matrix,
    size_correction_method = correct_for_size,
    max_desc_length = d,
    min_coverage = min_cov,
    roll_out_strategy = roll_out_strategy, #or large-freq or naive
    reward_policy = reward_policy, #or random_pick, mean_path, mean_top_k
    reward_policy_k = reward_policy_k,
    memory_policy = memory_policy, #or top_k
    memory_policy_k_value = memory_policy_k_value,
    update_policy = update_policy, #or mean_update, top_k_mean_update
    update_policy_k = update_policy_k,
    show_progress = False,
    show_result=False)
monte_carlo.run()

# Closing timestamp (the label previously read 'Start', which made the log show
# two start lines and no end line for MCTS4DM).
print('End - MCTS4DM at ', time.strftime('%Y-%m-%d %H:%M:%S'))
######### ----- ----- ##########
#Save results

# Columns shown in the LaTeX export and in the cell output.
summary_columns = ['Max Quality','Avg Quality','Max Coverage','Avg Coverage','Subgroups Checked','Time','Ratio n.q.']
# The three finished searches, in the order their rows are appended.
searches = [beam_search, bottom_up, monte_carlo]

df_results = pd.read_pickle('./results/results.pkl')
len_df = len(df_results)
# Three rows are appended per run, so this numbers the current run.
# NOTE(review): true division yields a float, hence labels such as 'Beam Search 1.0';
# kept as is so new labels match the rows already stored in results.pkl.
run_number = len_df/3+1

new_rows = pd.DataFrame(
    {
        'Max Quality': [s.max_quality for s in searches],
        'Avg Quality': [s.avg_quality for s in searches],
        'Max Coverage': [s.max_coverage for s in searches],
        'Avg Coverage': [s.avg_coverage for s in searches],
        'Subgroups Checked': [s.count_quality for s in searches],
        'Time': [s.duration for s in searches],
        # Share of the run time that was not spent in the quality computation.
        'Ratio n.q.': [(1 - (s.duration_quality/s.duration)) for s in searches],
        'configuration': [config for _ in searches],
        'result': [bs_result, bottom_up.result, monte_carlo.result],
        'Object': list(searches),
    },
    index=[
        f'Beam Search {run_number}',
        f'BUSPaS {run_number}',
        f'MCTS4DM {run_number}',
    ],
)

# Append to the stored table and write it back to the same file.
df_results = pd.concat([df_results, new_rows])
df_results.to_pickle('./results/results.pkl')

# Rounded, stringified copy for the LaTeX export; the cell output shows the unrounded table.
df_results_latex = df_results.round(3).astype(str)
print(df_results_latex[summary_columns].to_latex())
df_results[summary_columns]

Start - Beam Search at  2024-10-29 15:48:30
End - Beam Search at  2024-10-29 15:48:33
Start - BUSPaS at  2024-10-29 15:48:33
End - BUSPaS at  2024-10-29 15:48:36
Start - MCTS4DM at  2024-10-29 15:48:36
Start - MCTS4DM at  2024-10-29 15:48:38
\begin{tabular}{llllllll}
\toprule
 & Max Quality & Avg Quality & Max Coverage & Avg Coverage & Subgroups Checked & Time & Ratio n.q. \\
\midrule
Beam Search 1.0 & 2.056 & 1.686 & 0.033 & 0.014 & 142 & 3.633 & 0.955 \\
BUSPaS 1.0 & 2.107 & 1.784 & 0.029 & 0.014 & 735 & 4.835 & 0.835 \\
MCTS4DM 1.0 & 2.139 & 1.698 & 0.033 & 0.015 & 454 & 4.855 & 0.893 \\
Beam Search 2.0 & 2.056 & 1.686 & 0.033 & 0.014 & 142 & 3.0 & 0.955 \\
BUSPaS 2.0 & 2.107 & 1.784 & 0.029 & 0.014 & 735 & 4.411 & 0.856 \\
MCTS4DM 2.0 & 1.867 & 1.672 & 0.041 & 0.015 & 462 & 4.42 & 0.868 \\
Beam Search 3.0 & 2.756 & 2.539 & 0.02 & 0.012 & 199 & 1.462 & 0.961 \\
BUSPaS 3.0 & 3.551 & 2.749 & 0.017 & 0.011 & 774 & 1.835 & 0.916 \\
MCTS4DM 3.0 & 2.929 & 2.247 & 0.063 & 0.015 & 499 & 1.8

Unnamed: 0,Max Quality,Avg Quality,Max Coverage,Avg Coverage,Subgroups Checked,Time,Ratio n.q.
Beam Search 1.0,2.056,1.686,0.033,0.014,142,3.63291,0.9549
BUSPaS 1.0,2.107,1.784073,0.029,0.0135,735,4.835353,0.835464
MCTS4DM 1.0,2.139,1.698,0.033,0.015,454,4.854622,0.892693
Beam Search 2.0,2.056,1.686,0.033,0.014,142,3.000232,0.954794
BUSPaS 2.0,2.107,1.784073,0.029,0.0135,735,4.411428,0.856217
MCTS4DM 2.0,1.867,1.672,0.041,0.015,462,4.419702,0.868283
Beam Search 3.0,2.756,2.539,0.02,0.012,199,1.461621,0.960758
BUSPaS 3.0,3.551,2.749046,0.017,0.011,774,1.834792,0.916491
MCTS4DM 3.0,2.929,2.247,0.063,0.015,499,1.841228,0.930991
Beam Search 4.0,2.756,2.539,0.02,0.012,199,1.648426,0.960287


In [6]:
# TODO: if needed, show a selection of the results table instead of the full table
# sel = [1,2,3]
# df_results_latex = df_results.round(3).astype(str)
# print(df_results[['Max Quality','Avg Quality','Max Coverage','Avg Coverage','Subgroups Checked','Time','Ratio n.q.']].iloc[sel].to_latex())
# df_results[['Max Quality','Avg Quality','Max Coverage','Avg Coverage','Subgroups Checked','Time','Ratio n.q.']].iloc[sel]

\begin{tabular}{lrrrrlrr}
\toprule
 & Max Quality & Avg Quality & Max Coverage & Avg Coverage & Subgroups Checked & Time & Ratio n.q. \\
\midrule
BUSPaS 1.0 & 2.107000 & 1.784073 & 0.029000 & 0.013500 & 735 & 4.835353 & 0.835464 \\
MCTS4DM 1.0 & 2.139000 & 1.698000 & 0.033000 & 0.015000 & 454 & 4.854622 & 0.892693 \\
Beam Search 2.0 & 2.056000 & 1.686000 & 0.033000 & 0.014000 & 142 & 3.000232 & 0.954794 \\
\bottomrule
\end{tabular}



Unnamed: 0,Max Quality,Avg Quality,Max Coverage,Avg Coverage,Subgroups Checked,Time,Ratio n.q.
BUSPaS 1.0,2.107,1.784073,0.029,0.0135,735,4.835353,0.835464
MCTS4DM 1.0,2.139,1.698,0.033,0.015,454,4.854622,0.892693
Beam Search 2.0,2.056,1.686,0.033,0.014,142,3.000232,0.954794


## Swap-Randomization

In [15]:
#TODO Choose k
k = 10

#select results to do statistical testing for
#TODO Choose selection
sel = list(df_results.index[-3:]) #['BUSPaS 1.0','Beam Search 2.0']
# The stored searcher objects are re-read from disk and restricted to the selection.
df_objects = pd.read_pickle('./results/results.pkl').loc[sel]
# Time-stamped output file, so every execution of this cell writes to a new file.
path = f'statistical_tests_{time.time()}.pkl'

# One [row label, test outcome] entry per tested search.
results_with_p_values = []

# NOTE(review): `pickle` is not imported explicitly in the import cell and is presumably
# provided by one of the star imports -- verify. The save_path prefix mentions BUSPaS
# but is handed to every strategy -- confirm this is intended.
for test_number, search_strategy_configuration in enumerate(df_objects['Object'], start=1):
    print('Test ', test_number, '/', len(sel))
    label = sel[test_number - 1]
    outcome = search_strategy_configuration.run_statistical_test(
        k=k,
        show_progress_pvals=True,
        save_path='saved_results_statistical_test_BUSPaS_',
    )
    results_with_p_values.append([label, outcome])
    # Rewrite the file after every test so finished tests are already on disk.
    with open(path, 'wb') as f:
        pickle.dump(results_with_p_values, f)
results_with_p_values

Test  1 / 3
Start beam search statistical test:  0 / 10
Start beam search statistical test:  1 / 10
Start beam search statistical test:  2 / 10
Start beam search statistical test:  3 / 10
Start beam search statistical test:  4 / 10
Start beam search statistical test:  5 / 10
Start beam search statistical test:  6 / 10
Start beam search statistical test:  7 / 10
Start beam search statistical test:  8 / 10
Start beam search statistical test:  9 / 10
Test  2 / 3
Start buspas statistical test:  0 / 10
Start buspas statistical test:  1 / 10
Start buspas statistical test:  2 / 10
Start buspas statistical test:  3 / 10
Start buspas statistical test:  4 / 10
Start buspas statistical test:  5 / 10
Start buspas statistical test:  6 / 10
Start buspas statistical test:  7 / 10
Start buspas statistical test:  8 / 10
Start buspas statistical test:  9 / 10
Test  3 / 3
Start mcts4dm statistical test:  0 / 10
Start mcts4dm statistical test:  1 / 10
Start mcts4dm statistical test:  2 / 10
Start mcts4dm 

[['Beam Search 5.0',
  [(np.float32(2.0562809),
    0.01,
    110,
    ["industry == 'Gold'",
     "averageVolume10days == '20977.399999999998 <= averageVolume10days <= 245951.60000000003'"],
    {},
    np.float64(0.09090909090909091)),
   (np.float32(1.8828576),
    0.01,
    116,
    ["industry == 'Information Technology Services'", "currency == 'EUR'"],
    {},
    np.float64(0.18181818181818182)),
   (np.float32(1.8671579),
    0.017,
    103,
    ["industry == 'Information Technology Services'"],
    {},
    np.float64(0.18181818181818182)),
   (np.float32(1.8194999),
    0.01,
    55,
    ["country == 'Spain'"],
    {},
    np.float64(0.18181818181818182)),
   (np.float32(1.8098099),
    0.012,
    108,
    ["industry == 'Gold'",
     "debtToEquity == '0.01 <= debtToEquity <= 10.6624'"],
    {},
    np.float64(0.18181818181818182)),
   (np.float32(1.7727151),
    0.011,
    94,
    ["industry == ''"],
    {},
    np.float64(0.2727272727272727)),
   (np.float32(1.7727151),
    0.

In [44]:
# Print the LaTeX rendering of every statistical-test result.
# Each entry of results_with_p_values is [row label, subgroups]; the row label
# contains the search strategy name, which selects the matching converter.
for result in results_with_p_values:
    search_type, subgroups = result[0], result[1]

    if 'MCTS4DM' in search_type:
        to_latex = mcts_pval_result_to_latex
    elif 'BUSPaS' in search_type:
        to_latex = bus_pval_result_to_latex
    elif 'Beam Search' in search_type:
        to_latex = bs_pval_result_to_latex
    else:
        # Labels from an unknown strategy are skipped, as before.
        continue

    # Convert once and reuse the result (the converter was previously called twice
    # per entry). Element 1 is printed before element 0, as in the original.
    latex_parts = to_latex(subgroups)
    print(latex_parts[1])
    print(latex_parts[0])
        

\begin{tabular}{lllll}
\toprule
 & Description & Quality & Coverage & p-Value \\
\midrule
1 & (industry='Gold') Ʌ (averageVolume10days='20977.399999999998 <= averageVolume10days <= 245951.60000000003') & 2.056 & 0.01 & 0.091 \\
2 & (industry='Information Technology Services') Ʌ (currency='EUR') & 1.883 & 0.01 & 0.182 \\
3 & (industry='Information Technology Services') & 1.867 & 0.017 & 0.182 \\
4 & (country='Spain') & 1.819 & 0.01 & 0.182 \\
5 & (industry='Gold') Ʌ (debtToEquity='0.01 <= debtToEquity <= 10.6624') & 1.81 & 0.012 & 0.182 \\
6 & (industry='') & 1.773 & 0.011 & 0.273 \\
7 & (sector='') & 1.773 & 0.011 & 0.273 \\
8 & (industry='Gold') Ʌ (marketCap='20.0 <= marketCap <= 21243025.198376004') & 1.692 & 0.013 & 0.636 \\
9 & (industry='Gold') Ʌ (country='Canada') & 1.673 & 0.016 & 0.636 \\
10 & (exchange='NGM') & 1.648 & 0.019 & 0.636 \\
11 & (industry='Apparel Retail') & 1.641 & 0.01 & 0.636 \\
12 & (country='Switzerland') & 1.614 & 0.016 & 0.727 \\
13 & (industry='Gold') Ʌ (ex

In [None]:
# Overview of variables

# config
# 
# # bs_result
# 
# beam_search.qualities
# beam_search.coverages
# beam_search.avg_quality
# beam_search.avg_coverage
# beam_search.max_quality
# beam_search.max_coverage
# beam_search.descriptions
# beam_search.duration
# beam_search.duration_quality
# ratio_not_quality_bs = (1- (beam_search.duration_quality/beam_search.duration))
# beam_search.count_quality
# 
# 
# # bottom_up.result
# 
# bottom_up.quals
# bottom_up.covs
# bottom_up.avg_quality
# bottom_up.avg_coverage
# bottom_up.max_quality
# bottom_up.max_coverage
# bottom_up.descriptions
# bottom_up.duration
# bottom_up.duration_quality
# ratio_not_quality_bus = (1- (bottom_up.duration_quality/bottom_up.duration))
# bottom_up.count_quality
# 
# 
# # monte_carlo.result
# 
# monte_carlo.qualities
# monte_carlo.coverages
# monte_carlo.avg_quality
# monte_carlo.avg_coverage
# monte_carlo.max_quality
# monte_carlo.max_coverage
# monte_carlo.descriptions
# monte_carlo.duration
# monte_carlo.duration_quality
# ratio_not_quality_ms = (1- (monte_carlo.duration_quality/monte_carlo.duration))
# monte_carlo.count_quality