# User Story 1
- The user initializes several algorithm–dataset combinations.
- The respective objects are created, and the substeps of the experiment pipeline are then executed one after another.
- The user's algorithm choices are MR-Hydra, Weasel-V2 and QUANT (the best-performing and most time-efficient algorithms of their respective categories).
- The datasets involved are ElectricDevices (10%) and LargeKitchenAppliances.
- Sometimes the user is interested in visualizing the data before applying the DCA. Some performance metrics are visualized as well.

In [3]:
import os
import re
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib import gridspec
import pandas as pd
import warnings
import time
from tsml_eval.publications.y2023.tsc_bakeoff.run_experiments import _set_bakeoff_classifier

# Suppress FutureWarning and UserWarning messages for the rest of the session.
for _ignored_category in (FutureWarning, UserWarning):
    warnings.filterwarnings("ignore", category=_ignored_category)

In [4]:
from src.basic_func import dataset_provider,dataset_overview, overview_of_bakeoff_cl
from src.apply_dca import apply_label_errors
from src.visualizations import visualize_acc_decr, visualize_trace_M


In [5]:
# --- Experiment configuration: Quant on Beef (reduction factor 1) ---
DATASET_NAME = "Beef"               # must be one of the names in DS_list
CLASSIFIER_NAME = "Quant"           # must be one of the cl_ names
REDUCTION_F = 1                     # optional; only relevant for large datasets
RANDOM_S = 0                        # random seed for everything except the DCA
DCA = "LabelErrors"                 # DCA strategy category; determines the DoE_PARAM dict
# Design-of-experiment parameters.
# stop: at most 90 (% of test_set_size); step: between 1 and 10.
DoE_PARAM = {
    "le_strategy": "leV1",
    "random_seed": 2,
    "start": 0,
    "stop": 90,
    "step": 7,
    "p_vec": None,
}
EXP_FOLD = "simulation_results/"    # respect the folder structure
SAVE_FILES = True
DATA_VIS = False


In [6]:
# Classifier Quant, dataset Beef 100% (CLASSIFIER_NAME / DATASET_NAME as set in the configuration cell).
# The seed is taken from RANDOM_S instead of a hard-coded 0, so the configuration
# cell stays the single place where the non-DCA seed is defined.
current_ds, current_meta = dataset_provider(
    name=DATASET_NAME,
    reduction_factor=REDUCTION_F,
    test_set_ratio="default_benchmark",
    random_state=RANDOM_S,
)
# Optional dataset overview (disabled):
#x_t, y_t = dataset_overview(train_test_dct=current_ds["y_train_small"] , dataset_name=DATASET_NAME)
current_cl = _set_bakeoff_classifier(CLASSIFIER_NAME, random_state=RANDOM_S, n_jobs=1)
# apply_label_errors takes the classifiers as a {name: estimator} dict.
cl_dict = {CLASSIFIER_NAME: current_cl}
df_, trace_M_ = apply_label_errors(
    train_test_df=current_ds,
    cl_dict=cl_dict,
    ds_=DATASET_NAME,
    doe_param=DoE_PARAM,
)

X_train             : (30, 1, 470)
y_train             : (30,)
X_test              : (30, 1, 470)
y_test              : (30,)
X_train_small       : (30, 1, 470)
y_train_small       : (30,)
X_test_small        : (30, 1, 470)
y_test_small        : (30,)


[2025-05-31 13:07:29] INFO - requested_instance_step = 2.1 will be transformed into 2
[2025-05-31 13:07:29] INFO - label_names: ['1' '2' '3' '4' '5']
[2025-05-31 13:07:29] INFO - Current Label Error Strategy: DEFAULT: leV1
[2025-05-31 13:07:29] INFO - The p_vector for the current_experiment: [0.2, 0.2, 0.2, 0.2, 0.2]
[2025-05-31 13:07:29] INFO - Searching inside simulation_results/Quant/Beef for results
[2025-05-31 13:07:29] INFO - 📁 Found directories: ['leV1_1_0_27_1', 'leV1_2_0_16_2', 'leV1_2_0_28_2', 'leV1_2_0_28_4', 'leV1_1_0_3_1', 'leV1_0_0_27_1', 'leV1_0_0_3_1']
[2025-05-31 13:07:29] INFO - 🟡 Partial Match found: leV1_2_0_16_2
[2025-05-31 13:07:29] INFO - ✅ Exact match found: leV1_2_0_28_2


In [7]:
# --- Experiment configuration: Quant on ElectricDevices (reduction factor 10) ---
DATASET_NAME = "ElectricDevices"    # must be one of the names in DS_list
CLASSIFIER_NAME = "Quant"           # must be one of the cl_ names
REDUCTION_F = 10                    # optional; only relevant for large datasets
RANDOM_S = 0                        # random seed for everything except the DCA
DCA = "LabelErrors"                 # DCA strategy category; determines the DoE_PARAM dict
# Design-of-experiment parameters.
# stop: at most 90 (% of test_set_size); step: between 1 and 10.
# NOTE(review): the logged instance step (8.92 at step=1, with 892 reduced
# training instances) suggests the percentages refer to the training set rather
# than the test set - confirm.
DoE_PARAM = {
    "le_strategy": "leV1",
    "random_seed": 2,
    "start": 0,
    "stop": 25,
    "step": 1,
    "p_vec": None,
}
EXP_FOLD = "simulation_results/"    # respect the folder structure
SAVE_FILES = True
DATA_VIS = False

In [None]:
# Classifier Quant, dataset ED 10% (CLASSIFIER_NAME / DATASET_NAME as set in the configuration cell).
# The seed is taken from RANDOM_S instead of a hard-coded 0, so the configuration
# cell stays the single place where the non-DCA seed is defined.
current_ds, current_meta = dataset_provider(
    name=DATASET_NAME,
    reduction_factor=REDUCTION_F,
    test_set_ratio="default_benchmark",
    random_state=RANDOM_S,
)
# Optional dataset overview (disabled):
#x_t, y_t = dataset_overview(train_test_dct=current_ds["y_train_small"] , dataset_name=DATASET_NAME)
current_cl = _set_bakeoff_classifier(CLASSIFIER_NAME, random_state=RANDOM_S, n_jobs=1)
# apply_label_errors takes the classifiers as a {name: estimator} dict.
cl_dict = {CLASSIFIER_NAME: current_cl}
df_, trace_M_ = apply_label_errors(
    train_test_df=current_ds,
    cl_dict=cl_dict,
    ds_=DATASET_NAME,
    doe_param=DoE_PARAM,
)

# Assumption (open question):
# If a stored result file is an exact match for the requested parameters, it is
# evidently not trimmed. If that file nevertheless contains results beyond the
# requested range (e.g. current stop = 25 % while the file goes up to 90 % LE),
# errors may be possible - to be verified.

[2025-05-31 13:13:27] INFO - requested_instance_step = 8.92 will be transformed into 9
[2025-05-31 13:13:27] INFO - label_names: ['1' '2' '3' '4' '5' '6' '7']
[2025-05-31 13:13:27] INFO - Current Label Error Strategy: DEFAULT: leV1
[2025-05-31 13:13:27] INFO - The p_vector for the current_experiment: [0.1429, 0.1429, 0.1429, 0.1429, 0.1429, 0.1429, 0.1429]
[2025-05-31 13:13:27] INFO - Searching inside simulation_results/Quant/ElectricDevices for results
[2025-05-31 13:13:27] INFO - 📁 Found directories: ['leV1_2_0_225_9', 'leV1_1_0_810_45', 'leV1_0_0_810_45', 'leV1_2_0_630_45', 'leV1_0_0_90_9']
[2025-05-31 13:13:27] INFO - ✅ Exact match found: leV1_2_0_225_9


X_train             : (8926, 1, 96)
y_train             : (8926,)
X_test              : (7711, 1, 96)
y_test              : (7711,)
X_train_small       : (892, 1, 96)
y_train_small       : (892,)
X_test_small        : (771, 1, 96)
y_test_small        : (771,)


In [11]:
# Bare expression: renders the df_ result table as this cell's output.
df_

Unnamed: 0,step,LE_instances,LE_relative,accuracy,y_train_history,y_pred,y_pred_prob
0,0,0,0.0,0.7069,"[2, 7, 5, 4, 4, 7, 4, 2, 3, 2, 3, 5, 6, 4, 2, ...","[2, 5, 3, 2, 4, 4, 2, 2, 5, 5, 2, 4, 3, 5, 2, ...","[[0.0, 0.97, 0.0, 0.005, 0.025, 0.0, 0.0], [0...."
1,15,9,0.0101,0.7237,"[2, 7, 5, 4, 4, 7, 4, 2, 3, 2, 3, 5, 6, 4, 2, ...","[2, 5, 3, 2, 4, 4, 2, 2, 5, 5, 2, 4, 3, 5, 2, ...","[[0.0, 0.99, 0.0, 0.0, 0.01, 0.0, 0.0], [0.005..."
2,16,18,0.0202,0.7134,"[2, 7, 5, 4, 4, 7, 4, 2, 3, 2, 3, 5, 6, 4, 2, ...","[2, 5, 3, 2, 4, 4, 2, 2, 5, 5, 2, 5, 7, 5, 2, ...","[[0.0, 0.98, 0.0, 0.0, 0.02, 0.0, 0.0], [0.0, ..."
3,17,27,0.0303,0.7134,"[2, 7, 5, 4, 4, 7, 4, 2, 3, 2, 3, 5, 6, 4, 2, ...","[2, 5, 3, 2, 4, 4, 2, 2, 5, 5, 2, 4, 3, 5, 2, ...","[[0.0, 0.985, 0.0, 0.005, 0.01, 0.0, 0.0], [0...."
4,18,36,0.0404,0.7147,"[2, 7, 5, 5, 4, 7, 4, 2, 3, 2, 3, 5, 6, 2, 2, ...","[2, 5, 3, 2, 4, 4, 2, 2, 5, 5, 2, 4, 3, 5, 2, ...","[[0.0, 0.98, 0.0, 0.01, 0.01, 0.0, 0.0], [0.00..."
5,1,45,0.0495,0.7147,"[2, 7, 5, 4, 4, 7, 4, 2, 3, 2, 3, 5, 6, 4, 2, ...","[2, 5, 3, 2, 4, 4, 2, 2, 5, 5, 2, 5, 3, 5, 2, ...","[[0.0, 0.97, 0.0, 0.0, 0.025, 0.005, 0.0], [0...."
6,19,54,0.0605,0.7095,"[2, 7, 5, 4, 4, 7, 4, 2, 3, 2, 3, 5, 6, 4, 2, ...","[2, 5, 3, 2, 4, 4, 2, 2, 5, 5, 2, 4, 3, 5, 2, ...","[[0.0, 0.96, 0.0, 0.005, 0.03, 0.005, 0.0], [0..."
7,20,63,0.0706,0.6952,"[2, 7, 5, 4, 4, 7, 4, 2, 3, 2, 3, 5, 6, 4, 2, ...","[2, 5, 3, 2, 4, 4, 2, 2, 5, 5, 2, 5, 3, 5, 2, ...","[[0.0, 0.96, 0.0, 0.01, 0.03, 0.0, 0.0], [0.00..."
8,21,72,0.0807,0.7069,"[2, 7, 5, 4, 4, 7, 4, 2, 3, 2, 3, 5, 6, 4, 2, ...","[2, 5, 3, 2, 4, 4, 2, 2, 5, 5, 2, 4, 3, 5, 2, ...","[[0.005, 0.935, 0.0, 0.015, 0.03, 0.0, 0.015],..."
9,22,81,0.0908,0.7043,"[2, 7, 5, 4, 4, 7, 4, 2, 3, 2, 3, 5, 6, 4, 2, ...","[2, 5, 3, 2, 4, 4, 2, 2, 5, 5, 2, 4, 3, 5, 2, ...","[[0.0, 0.975, 0.0, 0.005, 0.02, 0.0, 0.0], [0...."


In [None]:
# Classifier Quant, dataset ED
ds_ED, meta_ED = dataset_provider(
    name="ElectricDevices",
    reduction_factor=10,
    test_set_ratio="default_benchmark",
    random_state=0,
)
x_t, y_t = dataset_overview(train_test_dct=ds_ED["y_train_small"], dataset_name="ElectricDevices0")
QUANT = _set_bakeoff_classifier("quant", random_state=0, n_jobs=1)
# NOTE(review): other cells key this classifier as "Quant", and the logged
# results path contains the classifier name - confirm whether the "QUANT" key
# makes this run look for / store results in a different folder.
cl_dict = {"QUANT": QUANT}
DoE_PARAM = {"le_strategy": "leV1", "random_seed": 0, "start": 0, "stop": 26, "step": 1, "p_vec": None}
# Run on the split loaded in this cell (ds_ED) and name the dataset explicitly.
# Previously this call used the globals current_ds / DATASET_NAME, i.e. whatever
# another cell had loaded last, so the result depended on cell execution order.
df_, trace_M_ = apply_label_errors(
    train_test_df=ds_ED,
    cl_dict=cl_dict,
    ds_="ElectricDevices",
    doe_param=DoE_PARAM,
)

In [None]:
# Classifier Quant, dataset ED
# NOTE(review): this call passes stop / step / stop_percentage as separate
# keywords, whereas other cells in this notebook pass a doe_param dict -
# confirm that apply_label_errors accepts this form.
ds_ED, meta_ED = dataset_provider(
    name="ElectricDevices",
    reduction_factor=10,
    test_set_ratio="default_benchmark",
    random_state=0,
)
x_t, y_t = dataset_overview(
    train_test_dct=ds_ED["y_train_small"],
    dataset_name="ElectricDevices0",
)
QUANT = _set_bakeoff_classifier("quant", random_state=0, n_jobs=1)
cl_dict = {"QUANT": QUANT}
df_ED_QUANT, trace_m_ED_QUANT = apply_label_errors(
    train_test_df=ds_ED,
    cl_dict=cl_dict,
    ds_="ElectricDevices",
    stop=400,
    step=5,
    stop_percentage=0.8,
)

In [None]:
# Pass the QUANT / ElectricDevices result table to visualize_acc_decr.
visualize_acc_decr(
    df_acc_inst_rel=df_ED_QUANT,
    dpi_=150,
    first="relative",
    second=None,
    w_=4.5,
    h_=3,
    cl_="QUANT",
    ds_="ElectricDevices",
    save_fig=True,
)

In [None]:
# Pass the trace_M result of the Quant / ED run to visualize_trace_M.
visualize_trace_M(
    trace_M=trace_m_ED_QUANT,
    cl_="Quant",
    ds_="ED",
    dpi=200,
    filename_="trace_M",
    save_fig=False,
    exp_folder=None,
)

In [None]:
# Classifier Weasel-D, dataset ED
# NOTE(review): this call passes stop / step / stop_percentage as separate
# keywords, whereas other cells in this notebook pass a doe_param dict -
# confirm that apply_label_errors accepts this form.
ds_ED, meta_ED = dataset_provider(
    name="ElectricDevices",
    reduction_factor=10,
    test_set_ratio="default_benchmark",
    random_state=0,
)
x_t, y_t = dataset_overview(
    train_test_dct=ds_ED["y_train_small"],
    dataset_name="ElectricDevices0",
)
Weasel_D = _set_bakeoff_classifier("Weasel-D", random_state=0, n_jobs=1)
cl_dict = {"Weasel-D": Weasel_D}
df_ED_W2, trace_m_ED_W2 = apply_label_errors(
    train_test_df=ds_ED,
    cl_dict=cl_dict,
    ds_="ElectricDevices",
    stop=400,
    step=5,
    stop_percentage=0.8,
)

In [None]:
# Pass the Weasel-D / ElectricDevices result table to visualize_acc_decr.
visualize_acc_decr(
    df_acc_inst_rel=df_ED_W2,
    dpi_=150,
    first="relative",
    second=None,
    w_=4.5,
    h_=3,
    cl_="WEASEL-D",
    ds_="ElectricDevices",
    save_fig=True,
)

In [None]:
# Classifier Weasel-D, dataset LKA
ds_LKA, meta_LKA = dataset_provider(
    name="LargeKitchenAppliances",
    reduction_factor=1,
    test_set_ratio="default_benchmark",
    random_state=0,
)
x_t, y_t = dataset_overview(train_test_dct=ds_LKA["y_train_small"], dataset_name="LargeKitchenAppliances0")
Weasel_D = _set_bakeoff_classifier("Weasel-D", random_state=0, n_jobs=1)
cl_dict2 = {"Weasel-D": Weasel_D}
# Every other apply_label_errors call in this notebook (including the executed
# ones) unpacks exactly two return values, so unpack two here as well; the
# previous three-name unpacking (with an extra res_LKA_W2) did not match.
# NOTE(review): stop / step / stop_percentage are passed as separate keywords,
# whereas other cells pass a doe_param dict - confirm the accepted signature.
df_LKA_W2, trace_m_LKA_W2 = apply_label_errors(
    train_test_df=ds_LKA,
    cl_dict=cl_dict2,
    ds_="LKA",
    stop=180,
    stop_percentage=0.7,
    step=5,
)

In [None]:
# Pass the Weasel-D / LargeKitchenAppliances result table to visualize_acc_decr.
# ds_ now carries the dataset's actual name ("LargeKitchenAppliances", as used
# when loading it) instead of the misspelled "LargeKitchenApplications".
# NOTE(review): ds_ presumably ends up in the figure title / file name, so
# figures saved under the old spelling will not be overwritten - confirm.
visualize_acc_decr(
    df_acc_inst_rel=df_LKA_W2,
    dpi_=150,
    first="relative",
    second=None,
    w_=4.5,
    h_=3,
    cl_="Weasel-D",
    ds_="LargeKitchenAppliances",
    save_fig=True,
)

In [None]:
# Pass the trace_M result of the Weasel-D / LKA run to visualize_trace_M.
visualize_trace_M(
    trace_M=trace_m_LKA_W2,
    cl_="Weasel-D",
    ds_="LKA",
    dpi=200,
    filename_="trace_M",
    save_fig=False,
    exp_folder=None,
)