In [2]:
import pandas as pd

from getBenchmarkData import *

from Code.BeamSearch.BeamSearch import *
from Code.BUSPaS.BUSPaS import *
from Code.MCTS4DM.MCTS4DM import *

from Code.diversity import *

from datetime import datetime

In [5]:
def runE4():

    #Experiment parameters
    similarity_threshold = 0.7 #redundancy #TODO Check before run
    dataset_selection = 'all' #['voting'] #'benchmark_small' #'small_countries_only' # 'feasible' # 'all', 'feasible' (no sushi), 'benchmark_small' (no countries) etc.
    
    # Search Strategy parameters: General
    d = 3
    q = 20
    bins = 5
    min_cov = 0.05
    min_coverage_abs = 3
    correct_for_size = no_size_corr
    comparison_type = 'complement'
    
    # Search Strategy parameters: Beam Search
    w = 30
    min_error = 0.01 
    ensure_diversity = True #TODO double check and describe in paper
    
    # Search Strategy parameters: BUSPaS
    number_of_row_pairs = 100
    z = 3 #TODO Check if 2 or 3 is best
    
    # Search Strategy parameters: MCTS4DM
    max_time_minutes = None #is connected to BUSPaS duration
    ucb_type = 'SP-MCTS'
    ucb_params = {}
    roll_out_strategy =  'direct-freq'
    reward_policy = 'max_path'
    reward_policy_k = 3
    memory_policy = 'all'
    memory_policy_k_value =  3
    update_policy = 'max_update'
    update_policy_k = 3
    
    config = {
        'similarity_treshold': similarity_threshold,
        'dataset_selection': dataset_selection,
        'd':d,
        'q':q,
        'bins':bins,
        'min_cov':min_cov,
        'min_coverage_abs':min_coverage_abs,
        'correct_for_size':correct_for_size,
        'comparison_type':comparison_type,
        'w':w, 
        # Search Strategy parameters: Beam Search
        'min_error':min_error,
        'ensure_diversity':ensure_diversity,
        # Search Strategy parameters: BUSPaS
        'number_of_row_pairs':number_of_row_pairs,     
        'z':z,
        # Search Strategy parameters: MCTS4DM #is connected to BUSPaS duration
        'max_time_minutes':max_time_minutes,
        'ucb_type':ucb_type,
        'ucb_params':ucb_params,
        'roll_out_strategy':roll_out_strategy,
        'reward_policy':reward_policy,
        'reward_policy_k':reward_policy_k,
        'memory_policy':memory_policy,
        'memory_policy_k_value':memory_policy_k_value,
        'update_policy':update_policy,
        'update_policy_k':update_policy_k
    }
    
    #result set-up
    columns_log=['Dataset','Max Quality','Avg Quality','Max Coverage','Avg Coverage','Subgroups Checked','Time',
                                       'Ratio n.q.','Time per group','Redundancy','Diversity','Cover Redundancy','Configuration','Result','Object']
    columns_results=['Dataset','Max Quality','Avg Quality','Max Coverage','Avg Coverage','Subgroups Checked','Time',
                                       'Ratio n.q.','Time per group','Redundancy','Diversity','Cover Redundancy','Configuration']
    df_results_log = pd.DataFrame(columns=columns_log)
    df_results = pd.DataFrame(columns=columns_results)
    start_time_save = str(datetime.now().strftime('%Y%m%d%H%M'))
    df_results_log.to_pickle(f'./E4 results/results_{start_time_save}_log_{dataset_selection}.pkl')
    df_results.to_pickle(f'./E4 results/results_{start_time_save}_{dataset_selection}.pkl')
    
    #Get and convert data
    datasets = get_data(dataset_selection)
    
    keys = list(datasets.keys())
    for data_set_key in datasets.keys():
        
        data, target, matrix = datasets[data_set_key]
        data_copy = data.copy()
        
        data = convert_columns_num_to_cat(data,nr_chunks=bins)
        features = data.columns[:-1]
        
        print(f'Dataset= {data_set_key} ',keys.index(data_set_key)+1,'/',len(keys))
        
        ######### ----- ----- ##########
        # BUSPaS
        print('Start - BUSPaS at ', time.strftime('%Y-%m-%d %H:%M:%S'))
    
        bottom_up = BUSPaS(data,
            matrix,
            number_of_row_pairs=number_of_row_pairs,
            depth=d,
            q=q,
            z=z,
            nr_chunks=bins,
            min_coverage_perc=min_cov,
            min_coverage_abs=min_coverage_abs)
        # bottom_up.num_to_cat_attribute_converter()
        bottom_up.find_quality(quality_measure = cluster_based_quality_measure, comparison_type = comparison_type , size_corr = correct_for_size)
        
        time_per_group_bus = None if bottom_up.count_quality == 0 else round(bottom_up.duration/bottom_up.count_quality,3)
        
        redundancy_bus, result_local_optima_bus, _, _ = redundancy(bottom_up.result, data_copy, similarity_threshold=similarity_threshold, result_order_qdc=(0, 2, 1))
        diversity_bus = diversity(result_local_optima_bus, result_is_local_optima=True) 
        cover_redundancy_bus, _ = cover_redundancy(data_copy,bottom_up.result,result_order_qdc=(0, 2, 1))
        
        print('End - BUSPaS at ', time.strftime('%Y-%m-%d %H:%M:%S'))
        ######### ----- ----- ##########
        # Beam Search
        print('Start - Beam Search at ', time.strftime('%Y-%m-%d %H:%M:%S'))
        
        beam_search = BeamSearch(data)
        bs = beam_search.EMM( features,
             w=w,
             d=d,
             q=q,
             quality_measure=cluster_based_quality_measure,
             catch_all_description=[],
             comparison_type=comparison_type,
             target='target',
             n_chunks=bins,
             ensure_diversity=ensure_diversity,
             report_progress= False,
             allow_exclusion=False,
             min_coverage = min_cov,
             min_coverage_abs = min_coverage_abs,
             min_error = min_error,
             distance_matrix=matrix,
             correct_for_size=correct_for_size,
             show_result=False)
        bs_result = beam_search.result
        
        time_per_group_bs = None if beam_search.count_quality == 0 else round(beam_search.duration/beam_search.count_quality,3)
        
        redundancy_bs, result_local_optima_bs, _, _ = redundancy(bs_result, data_copy, similarity_threshold=similarity_threshold, result_order_qdc=(0, 3, 1))
        diversity_bs = diversity(result_local_optima_bs, result_is_local_optima=True) 
        cover_redundancy_bs, _ = cover_redundancy(data,bs_result,result_order_qdc=(0, 3, 1))
        
        print('End - Beam Search at ', time.strftime('%Y-%m-%d %H:%M:%S'))
        ######### ----- ----- ##########
        # MCTS4DM
        print('Start - MCTS4DM at ', time.strftime('%Y-%m-%d %H:%M:%S'))
        
        monte_carlo = MCTS4DM(data, 'target',
            q = q,
            root_description = [],
            n_chunks = bins,
            allow_exclusion = False,
            minutes = max_time_minutes if ((max_time_minutes is not None) and (max_time_minutes>0)) else bottom_up.duration/60, #TODO Discuss with Wouter
            max_nr_iterations = float('inf'),
            ucb_type=ucb_type, #or UCB1, UCB1-Tuned, SP-MCTS, UCT, DFS-UCT
            ucb_params=ucb_params,
            quality_params={},
            matrix = matrix,
            size_correction_method = correct_for_size,
            max_desc_length = d,
            min_coverage = min_cov,
            roll_out_strategy = roll_out_strategy, #or large-freq or naive
            reward_policy = reward_policy, #or random_pick, mean_path, mean_top_k
            reward_policy_k = reward_policy_k,
            memory_policy = memory_policy, #or top_k
            memory_policy_k_value = memory_policy_k_value,
            update_policy = update_policy, #or mean_update, top_k_mean_update
            update_policy_k = update_policy_k,
            show_progress = False,
            show_result=False)
        monte_carlo.run()
        
        time_per_group_mcts = None if monte_carlo.count_quality == 0 else round(monte_carlo.duration/monte_carlo.count_quality,3)
                     
        redundancy_mcts, result_local_optima_mcts, _, _ = redundancy(monte_carlo.result, data_copy, similarity_threshold=similarity_threshold, result_order_qdc=(0, 3, 1))
        diversity_mcts = diversity(result_local_optima_mcts, result_is_local_optima=True) 
        cover_redundancy_mcts, _ = cover_redundancy(data,monte_carlo.result,result_order_qdc=(0, 3, 1))
        
        print('End - MCTS4DM at ', time.strftime('%Y-%m-%d %H:%M:%S'))
        ######### ----- ----- ##########
        #Save results    
        df_results = pd.read_pickle(f'./E4 results/results_{start_time_save}_{dataset_selection}.pkl')
        df_results_log = pd.read_pickle(f'./E4 results/results_{start_time_save}_log_{dataset_selection}.pkl')
        len_df = len(df_results)
        new_rows_log = pd.DataFrame({
                            'Dataset':[data_set_key,data_set_key,data_set_key],
                            'Max Quality':[beam_search.max_quality,bottom_up.max_quality,monte_carlo.max_quality],
                            'Avg Quality':[beam_search.avg_quality,bottom_up.avg_quality,monte_carlo.avg_quality],
                            'Max Coverage':[beam_search.max_coverage,bottom_up.max_coverage,monte_carlo.max_coverage],
                            'Avg Coverage':[beam_search.avg_coverage,bottom_up.avg_coverage,monte_carlo.avg_coverage],
                            'Subgroups Checked':[beam_search.count_quality,bottom_up.count_quality,monte_carlo.count_quality],
                            'Time':[beam_search.duration,bottom_up.duration,monte_carlo.duration],
                            'Ratio n.q.':[(1- (beam_search.duration_quality/beam_search.duration)),(1- (bottom_up.duration_quality/bottom_up.duration)),(1- (monte_carlo.duration_quality/monte_carlo.duration))],
                            'Time per group':[time_per_group_bs,time_per_group_bus,time_per_group_mcts],
                            'Redundancy':[redundancy_bs,redundancy_bus,redundancy_mcts],
                            'Diversity':[diversity_bs,diversity_bus,diversity_mcts],
                            'Cover Redundancy':[cover_redundancy_bs,cover_redundancy_bus,cover_redundancy_mcts],
                            'Configuration':[config,config,config],
                            'Result':[bs_result,bottom_up.result,monte_carlo.result],
                            'Object':[beam_search,bottom_up,monte_carlo]},
                           index=[f'Beam Search {len_df/3+1}',f'BUSPaS {len_df/3+1}',f'MCTS4DM {len_df/3+1}'])
        df_results_log = pd.concat([df_results_log, new_rows_log])
        
        new_rows = {key: value for key, value in new_rows_log.items() if key not in ['Object']}
        df_results = pd.concat([df_results, pd.DataFrame(new_rows)])
        
        #saves the results excl. objects and results list
        df_results.to_pickle(f'./E4 results/results_{start_time_save}_{dataset_selection}.pkl')
        #saves the results incl. objects and results list
        df_results_log.to_pickle(f'./E4 results/results_{start_time_save}_log_{dataset_selection}.pkl')
    
    df_results_latex = df_results.round(3).astype(str)
    print(df_results_latex[['Dataset','Max Quality','Max Coverage','Subgroups Checked','Time','Ratio n.q.','Redundancy','Diversity','Cover Redundancy']].to_latex())
    return df_results #[['Max Quality','Avg Quality','Max Coverage','Avg Coverage','Subgroups Checked','Time','Ratio n.q.']]
    
    

In [6]:
runE4()


Dataset= wisconsin  1 / 18
Start - BUSPaS at  2025-01-28 21:45:46
End - BUSPaS at  2025-01-28 21:45:50
Start - Beam Search at  2025-01-28 21:45:50
End - Beam Search at  2025-01-28 21:45:58
Start - MCTS4DM at  2025-01-28 21:45:58
End - MCTS4DM at  2025-01-28 21:46:02
Dataset= cpu  2 / 18
Start - BUSPaS at  2025-01-28 21:46:02


  df_results_log = pd.concat([df_results_log, new_rows_log])
  df_results = pd.concat([df_results, pd.DataFrame(new_rows)])


End - BUSPaS at  2025-01-28 21:46:12
Start - Beam Search at  2025-01-28 21:46:12
End - Beam Search at  2025-01-28 21:46:38
Start - MCTS4DM at  2025-01-28 21:46:38
End - MCTS4DM at  2025-01-28 21:46:48
Dataset= voting  3 / 18
Start - BUSPaS at  2025-01-28 21:46:48
End - BUSPaS at  2025-01-28 21:51:17
Start - Beam Search at  2025-01-28 21:51:17
End - Beam Search at  2025-01-28 21:55:33
Start - MCTS4DM at  2025-01-28 21:55:33
End - MCTS4DM at  2025-01-28 22:00:02
Dataset= student_math  4 / 18
Start - BUSPaS at  2025-01-28 22:00:04
End - BUSPaS at  2025-01-28 22:00:22
Start - Beam Search at  2025-01-28 22:00:22
End - Beam Search at  2025-01-28 22:00:39
Start - MCTS4DM at  2025-01-28 22:00:39
End - MCTS4DM at  2025-01-28 22:00:58
Dataset= student_por  5 / 18
Start - BUSPaS at  2025-01-28 22:01:02
End - BUSPaS at  2025-01-28 22:01:23
Start - Beam Search at  2025-01-28 22:01:23
End - Beam Search at  2025-01-28 22:01:40
Start - MCTS4DM at  2025-01-28 22:01:40
End - MCTS4DM at  2025-01-28 22:02

Unnamed: 0,Dataset,Max Quality,Avg Quality,Max Coverage,Avg Coverage,Subgroups Checked,Time,Ratio n.q.,Time per group,Redundancy,Diversity,Cover Redundancy,Configuration,Result
Beam Search 1.0,wisconsin,0.865,0.852,0.108,0.067,814,7.817695,0.962694,0.01,0.95,0.865218,1.67272,"{'similarity_treshold': 0.7, 'dataset_selectio...","[(0.865218, 0.061855670103092786, 588, [A2 == ..."
BUSPaS 1.0,wisconsin,0.856,0.84698,0.108,0.064175,339,3.763659,0.954897,0.011,0.65,5.941373,1.684967,"{'similarity_treshold': 0.7, 'dataset_selectio...","[(0.85592955, 0.05670103092783505, [-1.42093 <..."
MCTS4DM 1.0,wisconsin,0.854,0.811,0.201,0.082,668,3.777287,0.966672,0.006,0.95,0.853701,1.361735,"{'similarity_treshold': 0.7, 'dataset_selectio...","[(0.85370123, 0.05154639175257732, 227, [A10 =..."
Beam Search 2.0,cpu,0.7,0.666,0.201,0.107,236,25.955097,0.353281,0.11,0.95,0.699613,1.558051,"{'similarity_treshold': 0.7, 'dataset_selectio...","[(0.69961315, 0.098388671875, 162, [A6 == '-0...."
BUSPaS 2.0,cpu,0.512,0.510289,0.201,0.200378,2,9.842319,0.971176,4.921,0.0,1.020578,1.329102,"{'similarity_treshold': 0.7, 'dataset_selectio...","[(0.51172745, 0.20068359375, [-0.34309 <= A5 <..."
MCTS4DM 2.0,cpu,0.684,0.632,0.203,0.117,136,9.874204,0.073676,0.073,0.95,0.684218,1.264939,"{'similarity_treshold': 0.7, 'dataset_selectio...","[(0.6842184, 0.1649169921875, 53, [A5 == '-0.8..."
Beam Search 3.0,voting,1.214,1.188,0.068,0.054,3549,255.719989,0.979277,0.072,0.95,1.213951,1.445464,"{'similarity_treshold': 0.7, 'dataset_selectio...","[(1.2139509, 0.05128205128205128, 2518, [Migra..."
BUSPaS 3.0,voting,1.214,1.192029,0.074,0.05641,16987,268.378578,0.962576,0.016,0.35,15.511245,1.429479,"{'similarity_treshold': 0.7, 'dataset_selectio...","[(1.2139509, 0.05128205128205128, [0.2909699 <..."
MCTS4DM 3.0,voting,1.198,1.162,0.068,0.058,8180,268.428211,0.955965,0.033,0.95,1.198384,1.476931,"{'similarity_treshold': 0.7, 'dataset_selectio...","[(1.1983844, 0.05128205128205128, 856, [Migrat..."
Beam Search 4.0,student_math,0.419,0.395,0.084,0.058,1892,17.017185,0.922805,0.009,0.95,0.418766,1.648676,"{'similarity_treshold': 0.7, 'dataset_selectio...","[(0.41876644, 0.053164556962025315, 1413, [fai..."


In [None]:
df_results

## Analysis

In [10]:
df_results = pd.read_pickle(f'./E4 results/results_{start_time_save}_{dataset_selection}.pkl').round(3).astype(str)
print(df_results_latex[['Dataset','Max Quality','Max Coverage','Subgroups Checked','Time','Ratio n.q.','Redundancy','Diversity','Cover Redundancy']].to_latex())
df_results

\begin{tabular}{llllllllll}
\toprule
 & Dataset & Max Quality & Max Coverage & Subgroups Checked & Time & Ratio n.q. & Redundancy & Diversity & Cover Redundancy \\
\midrule
Beam Search 1.0 & wisconsin & 0.865 & 0.108 & 567 & 7.078 & 0.965 & 0.95 & 0.865 & 1.673 \\
BUSPaS 1.0 & wisconsin & 0.865 & 0.108 & 538 & 4.17 & 0.968 & 0.65 & 5.98 & 1.646 \\
MCTS4DM 1.0 & wisconsin & 0.855 & 0.201 & 451 & 4.192 & 0.947 & 0.95 & 0.855 & 0.874 \\
Beam Search 1.0 & wisconsin & 0.865 & 0.108 & 567 & 7.078 & 0.965 & 0.95 & 0.865 & 1.673 \\
BUSPaS 1.0 & wisconsin & 0.865 & 0.108 & 538 & 4.17 & 0.968 & 0.65 & 5.98 & 1.646 \\
MCTS4DM 1.0 & wisconsin & 0.855 & 0.201 & 451 & 4.192 & 0.947 & 0.95 & 0.855 & 0.874 \\
Beam Search 2.0 & cpu & 0.7 & 0.201 & 170 & 29.661 & 0.342 & 0.95 & 0.7 & 1.558 \\
BUSPaS 2.0 & cpu & 0.517 & 0.202 & 7 & 16.042 & 0.923 & 0.0 & 3.583 & 0.741 \\
MCTS4DM 2.0 & cpu & 0.678 & 0.247 & 167 & 16.042 & 0.148 & 0.95 & 0.678 & 0.986 \\
Beam Search 1.0 & wisconsin & 0.865 & 0.108 & 567 & 

Unnamed: 0,Dataset,Max Quality,Avg Quality,Max Coverage,Avg Coverage,Subgroups Checked,Time,Ratio n.q.,Time per group,Redundancy,Diversity,Cover Redundancy,Configuration,Result
Beam Search 1.0,wisconsin,0.865,0.852,0.108,0.066,567,7.078,0.965,0.012,0.95,0.865,1.673,"{'similarity_treshold': 0.7, 'dataset_selectio...","[(np.float32(0.865218), 0.061855670103092786, ..."
BUSPaS 1.0,wisconsin,0.865,0.851,0.108,0.065,538,4.17,0.968,0.008,0.65,5.98,1.646,"{'similarity_treshold': 0.7, 'dataset_selectio...","[(np.float32(0.865218), 0.061855670103092786, ..."
MCTS4DM 1.0,wisconsin,0.855,0.776,0.201,0.114,451,4.192,0.947,0.009,0.95,0.855,0.874,"{'similarity_treshold': 0.7, 'dataset_selectio...","[(np.float32(0.85470635), 0.061855670103092786..."
Beam Search 1.0,wisconsin,0.865,0.852,0.108,0.066,567,7.078,0.965,0.012,0.95,0.865,1.673,"{'similarity_treshold': 0.7, 'dataset_selectio...","[(np.float32(0.865218), 0.061855670103092786, ..."
BUSPaS 1.0,wisconsin,0.865,0.851,0.108,0.065,538,4.17,0.968,0.008,0.65,5.98,1.646,"{'similarity_treshold': 0.7, 'dataset_selectio...","[(np.float32(0.865218), 0.061855670103092786, ..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
BUSPaS 121.0,Brazil,2.688,1.812,0.195,0.088,328,5.349,0.931,0.016,0.25,25.632,0.829,"{'similarity_treshold': 0.7, 'dataset_selectio...","[(np.float32(2.6875854), 0.05511811023622047, ..."
MCTS4DM 121.0,Brazil,2.688,1.846,0.199,0.1,160,5.35,0.972,0.033,0.75,9.761,0.691,"{'similarity_treshold': 0.7, 'dataset_selectio...","[(np.float32(2.6875854), 0.05511811023622047, ..."
Beam Search 137.0,large_economies,6.173,4.877,0.149,0.069,162,162.105,0.406,1.001,0.8,19.261,1.059,"{'similarity_treshold': 0.7, 'dataset_selectio...","[(np.float32(6.1734114), 0.05025940337224384, ..."
BUSPaS 137.0,large_economies,4.893,3.838,0.306,0.116,97,151.579,0.481,1.563,0.25,58.5,0.887,"{'similarity_treshold': 0.7, 'dataset_selectio...","[(np.float32(4.8928556), 0.10905750108084739, ..."


In [6]:
pd.read_pickle(f'./E4 results/E4 for analysis/results_{start_time_save}_{dataset_selection}.pkl')

FileNotFoundError: [Errno 2] No such file or directory: './E4 results/E4 for analysis/results_202501111827_feasible.pkl'

In [None]:
df_results = pd.read_pickle(f'./E4 results/E4 for analysis/results_{start_time_save}_log_{dataset_selection}.pkl')['Configuration']

In [None]:
df_results[5]

## Archive