In [1]:
import numpy as np
import pandas as pd

from getBenchmarkData import *

from Code.BeamSearch.BeamSearch import *
from Code.BUSPaS.BUSPaS import *
from Code.MCTS4DM.MCTS4DM import *

from Code.diversity import *

import concurrent.futures
from concurrent.futures import ThreadPoolExecutor, TimeoutError
import time

import ast

In [2]:
def runE1z(theta=None):
    """Run experiment E1 (varying the BUSPaS parameter z) and persist the results.

    For every dataset returned by ``get_data(dataset_selection)`` BUSPaS is
    executed once with z=3 as an unrecorded first run, and then once for every
    z in ``2..max_z``.  After each recorded run the redundancy and diversity of
    the result set are computed, one row is appended to the results table, and
    both the table and the configuration dict are (re)written to
    ``./E1 results/``.

    Parameters
    ----------
    theta : float or None, default None
        Forwarded to ``redundancy`` as ``similarity_threshold``.  It is also
        embedded in the output file names, so each theta writes its own files.

    Returns
    -------
    None
        All output goes to disk: ``./E1 results/<save_title>.pkl`` (results
        DataFrame) and ``./E1 results/<save_title>_config.npy`` (config dict).

    Notes
    -----
    ``save_title`` contains a fixed timestamp, so re-running with the same
    ``theta`` overwrites the previously saved files.
    """
    # Local import: `os` is not imported explicitly in the notebook's import cell.
    import os

    # Search strategy parameters: general
    max_time_limit_minutes = 5
    save_title = f'E1 results z parameter 20250111 1525 theta {theta} sushi only'
    d = 3
    q = 20
    bins = 5
    min_cov = 0.05
    min_coverage_abs = 3
    correct_for_size = no_size_corr
    comparison_type = 'complement'

    # Search strategy parameters: BUSPaS
    number_of_row_pairs = 100  # u-parameter

    # Experiment parameters
    max_z = 8  # z is varied from 2 up to and including max_z
    similarity_threshold = theta  # redundancy
    dataset_selection = 'sushi'  # TODO specify # 'all', 'feasible' (no sushi), 'benchmark_small' (no countries) etc.

    # Snapshot of the settings; stored next to the results for traceability.
    config = {
        'max_time_limit_minutes': max_time_limit_minutes,
        'save_title': save_title,
        'd': d,
        'q': q,
        'bins': bins,
        'min_cov': min_cov,
        'min_coverage_abs': min_coverage_abs,
        'correct_for_size': correct_for_size,
        'comparison_type': comparison_type,
        'number_of_row_pairs': number_of_row_pairs,
        'similarity_threshold': similarity_threshold,
        'dataset_selection': dataset_selection,
    }

    output_dir = './E1 results'
    results_path = f'./E1 results/{save_title}.pkl'
    config_path = f'./E1 results/{save_title}_config.npy'

    # Result set-up: one row per (dataset, z) combination.
    results = pd.DataFrame(columns=[
        'dataset_name',
        'z',
        'max_quality',
        'avg_quality',
        'max_coverage',
        'avg_coverage',
        'duration',
        'count_quality',
        'time_per_group',
        'redundancy',
        'diversity',
    ])

    datasets = get_data(dataset_selection)

    keys = list(datasets.keys())
    print(keys)

    # Make sure the output directory exists *before* starting the searches, so
    # a missing folder cannot make the first save fail after a long run.
    os.makedirs(output_dir, exist_ok=True)

    # The leading z=3 run is not saved as a result (original author's note:
    # "to improve fairness of experiments"); the recorded runs are z=2..max_z.
    z_s = [3] + list(range(2, max_z + 1))

    for dataset_nr, data_set_key in enumerate(keys, start=1):

        data, _target, matrix = datasets[data_set_key]
        # Copy taken before the numeric-to-categorical conversion; this is the
        # version handed to `redundancy` below.
        data_copy = data.copy()

        data = convert_columns_num_to_cat(data, nr_chunks=bins)

        for run_idx, z in enumerate(z_s):

            print('Dataset ', dataset_nr, '/', len(keys), ' z=', z)

            # run BUSPaS
            bottom_up = BUSPaS(data,
                               matrix,
                               number_of_row_pairs=number_of_row_pairs,
                               depth=d,
                               q=q,
                               z=z,
                               nr_chunks=bins,
                               min_coverage_perc=min_cov,
                               min_coverage_abs=min_coverage_abs)
            bottom_up.num_to_cat_attribute_converter()
            print('bus initiated')

            bottom_up.find_quality(quality_measure=cluster_based_quality_measure,
                                   comparison_type=comparison_type,
                                   size_corr=correct_for_size,
                                   minutes=max_time_limit_minutes)

            print('bus finished')

            if run_idx == 0:
                # First run per dataset is executed but deliberately not recorded.
                continue

            # get and save evaluation metrics
            redundancy_score, result_local_optima, _, _ = redundancy(
                bottom_up.result,
                data_copy,
                similarity_threshold=similarity_threshold,
                result_order_qdc=(0, 2, 1))
            diversity_score = diversity(result_local_optima, result_is_local_optima=True)

            # Guard against division by zero when no subgroup was evaluated.
            if bottom_up.count_quality == 0:
                time_per_group = None
            else:
                time_per_group = round(bottom_up.duration / bottom_up.count_quality, 3)

            results.loc[len(results.index)] = [
                data_set_key,
                z,
                bottom_up.max_quality,
                bottom_up.avg_quality,
                bottom_up.max_coverage,
                bottom_up.avg_coverage,
                bottom_up.duration,
                bottom_up.count_quality,
                time_per_group,
                redundancy_score,
                diversity_score]

            print('bus analyzed')

            # Saved after every recorded run so partial results survive an
            # interrupted experiment.  np.save stores the dict as a pickled
            # object array; reading it back needs np.load(..., allow_pickle=True).
            results.to_pickle(results_path)
            np.save(config_path, config)

            print('bus saved')

In [5]:
# Run the z-parameter experiment once per redundancy threshold theta.
for _theta in (0.5, 0.7):
    runE1z(theta=_theta)

['sushi']
Dataset  1 / 1  z= 3
bus initiated
bus finished
Dataset  1 / 1  z= 2
bus initiated
bus finished
bus analyzed
bus saved
Dataset  1 / 1  z= 3
bus initiated
bus finished
bus analyzed
bus saved
Dataset  1 / 1  z= 4
bus initiated
bus finished
bus analyzed
bus saved
Dataset  1 / 1  z= 5
bus initiated
bus finished
bus analyzed
bus saved
Dataset  1 / 1  z= 6
bus initiated
bus finished
bus analyzed
bus saved
Dataset  1 / 1  z= 7
bus initiated
bus finished
bus analyzed
bus saved
Dataset  1 / 1  z= 8
bus initiated
bus finished
bus analyzed
bus saved
['sushi']
Dataset  1 / 1  z= 3
bus initiated
bus finished
Dataset  1 / 1  z= 2
bus initiated
bus finished
bus analyzed
bus saved
Dataset  1 / 1  z= 3
bus initiated
bus finished
bus analyzed
bus saved
Dataset  1 / 1  z= 4
bus initiated
bus finished
bus analyzed
bus saved
Dataset  1 / 1  z= 5
bus initiated
bus finished
bus analyzed
bus saved
Dataset  1 / 1  z= 6
bus initiated
bus finished
bus analyzed
bus saved
Dataset  1 / 1  z= 7
bus initiated

### Archive

In [51]:
# Archived cell (kept commented out): prints the name, column count and row
# count of every dataset returned by get_data.
# NOTE(review): `dataset_selection` and `bins` are only assigned as locals
# inside runE1z in the visible notebook; define them at notebook scope before
# un-commenting this cell.
# datasets = get_data(dataset_selection)
# 
# keys = list(datasets.keys())
# for data_set_key in datasets.keys():
#     data, target, matrix = datasets[data_set_key]
#     data_copy = data.copy()
#     
#     data = convert_columns_num_to_cat(data,nr_chunks=bins)
#     features = data.columns[:-1]
#     
#     print('dataset ',data_set_key)
#     print('columns ',len(data.columns))
#     print('rows ',len(data))
#     print(' ')

dataset  wisconsin
columns  17
rows  194
 
dataset  cpu
columns  7
rows  8192
 
dataset  voting
columns  84
rows  351
 
dataset  student_math
columns  31
rows  395
 
dataset  student_por
columns  31
rows  649
 
dataset  elevators
columns  10
rows  16599
 
dataset  ecoli
columns  8
rows  336
 
dataset  wine
columns  14
rows  178
 
dataset  Netherlands
columns  12
rows  213
 
dataset  France
columns  12
rows  907
 
dataset  Germany
columns  12
rows  1114
 
dataset  India
columns  12
rows  1274
 
dataset  Australia
columns  12
rows  1601
 
dataset  South Korea
columns  12
rows  662
 
dataset  Indonesia
columns  12
rows  626
 
dataset  Brazil
columns  12
rows  508
 
dataset  large_economies
columns  12
rows  18504
 


In [52]:
# Archived cell (kept commented out): reloads a saved results table for inspection.
# NOTE(review): `save_title` is only assigned inside runE1z in the visible
# notebook; set it at notebook scope before un-commenting. Only unpickle files
# you created yourself.
# pd.read_pickle(f'./E1 results/{save_title}.pkl')

Unnamed: 0,dataset_name,z,max_quality,avg_quality,max_coverage,avg_coverage,duration,count_quality,time_per_group,redundancy,diversity
0,wisconsin,2,0.865,0.850509,0.108,0.065464,7.684881,870,0.009,0.7500,4.278607
1,wisconsin,3,0.856,0.846980,0.108,0.064175,1.674861,339,0.005,0.7500,4.239572
2,wisconsin,4,0.856,0.822749,0.201,0.117526,1.106489,167,0.007,0.8000,3.335898
3,wisconsin,5,0.819,0.797354,0.201,0.118814,1.317301,114,0.012,0.7000,4.769419
4,wisconsin,6,0.815,0.786767,0.201,0.118041,2.405181,81,0.030,0.7500,3.96024
...,...,...,...,...,...,...,...,...,...,...,...
114,large_economies,4,3.921,1.886842,0.438,0.274116,66.771146,16,4.173,0.5625,18.883657
115,large_economies,5,3.921,1.544665,0.438,0.319423,65.131963,10,6.513,0.6000,10.873236
116,large_economies,6,3.663,1.572967,0.438,0.401232,79.558416,5,15.912,0.4000,6.654968
117,large_economies,7,3.663,1.372720,0.438,0.404521,183.856156,4,45.964,0.5000,4.281013
