In [5]:
import numpy as np
import pandas as pd
from datetime import datetime

from getBenchmarkData import *

from Code.BeamSearch.BeamSearch import *
from Code.BUSPaS.BUSPaS import *
from Code.MCTS4DM.MCTS4DM import *

from Code.diversity import *

import concurrent.futures
from concurrent.futures import ThreadPoolExecutor, TimeoutError
import time

import ast

In [6]:
def runE1u(numb=500):
    """Run experiment E1.2 (u parameter): BUSPaS on every selected dataset.

    For each dataset returned by ``get_data`` the function passes the data
    through ``convert_columns_num_to_cat``, runs a ``BUSPaS`` search, scores
    the result with ``redundancy`` and ``diversity``, appends one row to a
    results table, and re-saves the table and the run configuration to disk.

    Args:
        numb: Value used for both BUSPaS' ``number_of_row_pairs`` (the
            u-parameter) and its ``q`` argument. Defaults to 500.

    Returns:
        None. Results are only written to disk.

    Side effects:
        * Prints progress messages to stdout.
        * Creates the ``./E1 results`` directory if it does not exist.
        * After every dataset, (over)writes
          ``./E1 results/<save_title>.pkl`` (results DataFrame) and
          ``./E1 results/<save_title>_config.npy`` (config dict).
    """
    # Function-local import so this cell works without editing the import cell.
    import os

    # Search strategy parameters: general
    max_time_limit_minutes = 5
    experiment_start_time = datetime.now().strftime("%Y%m%d%H%M%S")
    save_title = f'E1.2 results u parameter {experiment_start_time}'
    d = 3
    q = numb
    bins = 5
    min_cov = 0.05
    min_coverage_abs = 3
    correct_for_size = no_size_corr
    comparison_type = 'complement'

    # Search strategy parameters: BUSPaS
    number_of_row_pairs = numb  # u-parameter

    # Experiment parameters
    z = 2
    similarity_threshold = 0.5  # redundancy
    dataset_selection = 'all'  # TODO specify # 'all', 'feasible' (no sushi), 'benchmark_small' (no countries) etc.

    # Create the output directory up front: without it the first save would
    # fail only after a complete (potentially very long) BUSPaS run.
    results_dir = './E1 results'
    os.makedirs(results_dir, exist_ok=True)
    results_path = f'{results_dir}/{save_title}.pkl'
    config_path = f'{results_dir}/{save_title}_config.npy'

    config = {
        'max_time_limit_minutes': max_time_limit_minutes,
        'save_title': save_title,
        'z': z,
        'd': d,
        'q': q,
        # The parameter under study; recorded so the config file alone
        # identifies the run.
        'u': number_of_row_pairs,
        'bins': bins,
        'min_cov': min_cov,
        'min_coverage_abs': min_coverage_abs,
        'correct_for_size': correct_for_size,
        'comparison_type': comparison_type,
        'similarity_threshold': similarity_threshold,
        'dataset_selection': dataset_selection,
    }

    # Result set-up: one row per dataset.
    results = pd.DataFrame(columns=['dataset_name',
                                    'u',
                                    'max_quality',
                                    'avg_quality',
                                    'max_coverage',
                                    'avg_coverage',
                                    'duration',
                                    'count_quality',
                                    'time_per_group',
                                    'redundancy',
                                    'diversity',
                                    'results_descriptions',
                                    'results_non_redundant',
                                    ])

    datasets = get_data(dataset_selection)
    # del datasets['voting']

    keys = list(datasets.keys())
    print(keys)

    for dataset_number, (data_set_key, dataset_parts) in enumerate(datasets.items(), start=1):

        data, _target, matrix = dataset_parts
        # Untransformed copy; this is what the redundancy computation receives.
        data_copy = data.copy()

        data = convert_columns_num_to_cat(data, nr_chunks=bins)

        print('Dataset ', dataset_number, '/', len(keys), ' ', data_set_key)
        print('Start - BUSPaS at ', time.strftime('%Y-%m-%d %H:%M:%S'))
        # Run BUSPaS
        bottom_up = BUSPaS(data,
                           matrix,
                           number_of_row_pairs=number_of_row_pairs,
                           depth=d,
                           q=q,
                           z=z,
                           nr_chunks=bins,
                           min_coverage_perc=min_cov,
                           min_coverage_abs=min_coverage_abs)
        bottom_up.num_to_cat_attribute_converter()
        print('bus initiated')

        # NOTE(review): `minutes` is presumably a time budget for the search;
        # confirm how BUSPaS enforces it, since a single dataset can run far
        # longer than `max_time_limit_minutes`.
        bottom_up.find_quality(quality_measure=cluster_based_quality_measure,
                               comparison_type=comparison_type,
                               size_corr=correct_for_size,
                               minutes=max_time_limit_minutes)
        print('End - BUSPaS at ', time.strftime('%Y-%m-%d %H:%M:%S'))

        print('bus finished')

        # Get and save evaluation metrics
        redundancy_score, result_local_optima, _, _ = redundancy(
            bottom_up.result,
            data_copy,
            similarity_threshold=similarity_threshold,
            result_order_qdc=(0, 2, 1))
        diversity_score = diversity(result_local_optima, result_is_local_optima=True)

        # Element 1 of each local-optimum entry is stored as its description.
        result_local_optima_descriptions = [entry[1] for entry in result_local_optima]

        # Guard against division by zero when no subgroup quality was counted.
        if bottom_up.count_quality == 0:
            time_per_group = None
        else:
            time_per_group = round(bottom_up.duration / bottom_up.count_quality, 3)

        results.loc[len(results.index)] = [
            data_set_key,
            number_of_row_pairs,
            bottom_up.max_quality,
            bottom_up.avg_quality,
            bottom_up.max_coverage,
            bottom_up.avg_coverage,
            bottom_up.duration,
            bottom_up.count_quality,
            time_per_group,
            redundancy_score,
            diversity_score,
            bottom_up.descriptions,
            result_local_optima_descriptions]

        print('bus analyzed')

        # Saving inside the loop keeps the rows of finished datasets on disk
        # even if a later dataset fails or the run is interrupted.
        results.to_pickle(results_path)
        np.save(config_path, config)

        print('bus saved')

In [7]:
# Launch the E1.2 u-parameter experiment with the default budget spelled out
# (numb=500 is used for both u and q inside runE1u).
runE1u(numb=500)

['wisconsin', 'cpu', 'voting', 'student_math', 'student_por', 'elevators', 'ecoli', 'wine', 'sushi', 'Netherlands', 'France', 'Germany', 'India', 'Australia', 'South Korea', 'Indonesia', 'Brazil', 'large_economies']
Dataset  1 / 18   wisconsin
Start - BUSPaS at  2025-01-27 16:05:06
bus initiated
End - BUSPaS at  2025-01-27 16:05:31
bus finished
bus analyzed
bus saved
Dataset  2 / 18   cpu
Start - BUSPaS at  2025-01-27 16:05:32
bus initiated
End - BUSPaS at  2025-01-27 16:05:51
bus finished
bus analyzed
bus saved
Dataset  3 / 18   voting
Start - BUSPaS at  2025-01-27 16:05:51
bus initiated
End - BUSPaS at  2025-01-27 17:40:44
bus finished
bus analyzed
bus saved
Dataset  4 / 18   student_math
Start - BUSPaS at  2025-01-27 17:40:48
bus initiated
End - BUSPaS at  2025-01-27 17:43:04
bus finished
bus analyzed
bus saved
Dataset  5 / 18   student_por
Start - BUSPaS at  2025-01-27 17:43:06
bus initiated
End - BUSPaS at  2025-01-27 17:45:06
bus finished
bus analyzed
bus saved
Dataset  6 / 18   