# Compiling the hyperparameters search data

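The objective is to generate two CSV files:
- hs_max.csv: the best (maximum-score) result of each hyperparameter search
- hs_all.csv: every result from the hyperparameter searches

Additional tables (no-reducer baselines, scores re-read from the TVT runs, pydrm metrics and PCA results) are also written along the way.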

In [184]:
import yaml
import pandas as pd
import os

## Data without reducer

In [185]:
# Dataset identifiers; each one is interpolated into the score / experiment
# paths read by the cells below.
datasets = [
    'kuhar',
    'motionsense',
    'uci',
    'wisdm',
    'realworld_thigh',
    'realworld_waist',
]

In [186]:
# Collect the no-reducer baseline score of every dataset from the
# TV_sb_no_reducer score files. A dataset whose file is missing or malformed is
# reported and recorded with score 0, so the table keeps one row per dataset.
data = []
for dataset in datasets:
    no_reducer_experiment = f'../execute_once_experiments/_previous/TV_sb_no_reducer/scores/no_reducer_{dataset}.yaml'
    try:
        with open(no_reducer_experiment) as f:
            # NOTE(review): yaml.load with FullLoader is kept as-is; if these files
            # can come from an untrusted source, switch to yaml.safe_load.
            no_reducer_score = yaml.load(f, Loader=yaml.FullLoader)['score']
    except (OSError, yaml.YAMLError, KeyError, TypeError):
        # Tolerate only "file absent / unreadable / no 'score' entry". A bare
        # `except:` would also hide KeyboardInterrupt and real programming errors.
        no_reducer_score = 0
        print(f'No reducer score not found for {dataset}')
    new_val = {'dataset': dataset, 'model': 'no_reducer', 'percent': 100, 'dim': 360, 'score': no_reducer_score}
    data.append(new_val)

In [187]:
# Tabulate the collected baseline rows (one per dataset) and display the table.
df = pd.DataFrame(data)
df

Unnamed: 0,dataset,model,percent,dim,score
0,kuhar,no_reducer,100,360,0.671127
1,motionsense,no_reducer,100,360,0.785952
2,uci,no_reducer,100,360,0.847059
3,wisdm,no_reducer,100,360,0.747373
4,realworld_thigh,no_reducer,100,360,0.671521
5,realworld_waist,no_reducer,100,360,0.787217


In [188]:
# Persist the TV no-reducer baseline table; the row index is not written.
df.to_csv('TV_no_reducer_scores.csv', index=False)

In [189]:
# Collect the no-reducer baseline score of every dataset from the
# TVT_sb_tdom_no_reducer score files. A dataset whose file is missing or
# malformed is reported and recorded with score 0, so the table keeps one row
# per dataset.
data = []
for dataset in datasets:
    no_reducer_experiment = f'../execute_once_experiments/TVT_sb_tdom_no_reducer/scores/TVT_sb_no_reducer_{dataset}.yaml'
    try:
        with open(no_reducer_experiment) as f:
            # NOTE(review): yaml.load with FullLoader is kept as-is; if these files
            # can come from an untrusted source, switch to yaml.safe_load.
            no_reducer_score = yaml.load(f, Loader=yaml.FullLoader)['score']
    except (OSError, yaml.YAMLError, KeyError, TypeError):
        # Tolerate only "file absent / unreadable / no 'score' entry". A bare
        # `except:` would also hide KeyboardInterrupt and real programming errors.
        no_reducer_score = 0
        print(f'No reducer score not found for {dataset}')
    new_val = {'dataset': dataset, 'model': 'no_reducer', 'percent': 100, 'dim': 360, 'score': no_reducer_score}
    data.append(new_val)

In [190]:
# Tabulate the TVT baseline rows (one per dataset) and display the table.
# Rebinds `df`, replacing the TV baseline table built earlier.
df = pd.DataFrame(data)
df

Unnamed: 0,dataset,model,percent,dim,score
0,kuhar,no_reducer,100,360,0.796528
1,motionsense,no_reducer,100,360,0.887288
2,uci,no_reducer,100,360,0.885507
3,wisdm,no_reducer,100,360,0.769399
4,realworld_thigh,no_reducer,100,360,0.695997
5,realworld_waist,no_reducer,100,360,0.663233


In [191]:
# Persist the TVT no-reducer baseline table; the row index is not written.
df.to_csv('TVT_no_reducer_scores.csv', index=False)

## Max values per hyperparameter search

In [192]:
# Fresh accumulator for the best-trial rows. This rebinds `data`, dropping the
# baseline rows (already tabulated into `df` and written to CSV above).
data = []

In [193]:
# Reducer models covered by the hyperparameter search.
models = [
    'umap',
    'ae',
    'tae',
    'convae',
    'convtae',
]
# Percent values used in the experiment names. Whole values are deliberately
# ints: they are interpolated into folder names as 'p5', 'p100', ... while the
# single fractional value renders as 'p2.5'.
percentages = [
    2.5,
    5,
    25,
    50,
    75,
    100,
    200,
]

In [194]:
# For every (dataset, model, percent) combination keep the best trial of its
# hyperparameter search: the data.csv row with the highest score, together with
# the dimensionality recorded for that trial.
for dataset in datasets:
    for model in models:
        # UMAP trials store their dimensionality under a different config column.
        dim_col_name = 'config/umap_ncomp' if model == 'umap' else 'config/latent_dim'
        for percentage in percentages:
            experiment_name = f'P10_{model}_{dataset}_p{percentage}'
            if model == 'umap':
                # UMAP experiment folders are named without the 'P10_' prefix.
                experiment_name = experiment_name[4:]
            try:
                experiment_data = pd.read_csv(f'../experiments/{experiment_name}/data.csv')
                max_row = experiment_data.loc[experiment_data['score'].idxmax()]
                data_max_value = max_row['score']
                data_dim = max_row[dim_col_name]
            except (OSError, KeyError, ValueError, TypeError):
                # Missing, empty or malformed search results: record a zeroed row.
                # `data_dim` must be reset here too. Previously it kept the value
                # of the preceding experiment (or was undefined on the very first
                # iteration), so missing rows carried a bogus dimensionality.
                data_max_value = 0
                data_dim = 0
                print(f'No data for {experiment_name}')
            data.append({
                'dataset': dataset,
                'model': model,
                'percent': percentage,
                'dim': data_dim,
                'score': data_max_value,
            })

In [195]:
# Preview the collected best-trial rows; `data` itself stays a list here.
pd.DataFrame(data)

Unnamed: 0,dataset,model,percent,dim,score
0,kuhar,umap,2.5,7,0.424883
1,kuhar,umap,5.0,18,0.424883
2,kuhar,umap,25.0,89,0.429577
3,kuhar,umap,50.0,53,0.420188
4,kuhar,umap,75.0,51,0.422535
...,...,...,...,...,...
205,realworld_waist,convtae,25.0,21,0.834951
206,realworld_waist,convtae,50.0,32,0.840345
207,realworld_waist,convtae,75.0,12,0.821467
208,realworld_waist,convtae,100.0,136,0.831499


In [196]:
# Turn the best-trial rows into a DataFrame (rebinding `data` from list to
# frame) and persist them without the row index.
data = pd.DataFrame(data)
data.to_csv('hs_max.csv', index=False)

### Now, how do they behave with test data?

In [197]:
# Index by (dataset, model, percent) and sort, so a single configuration can be
# addressed by its key tuple.
# NOTE(review): this cell rebinds `data` and cannot be re-run on its own. A
# second run fails because the three columns already live in the index.
data = data.set_index(['dataset', 'model', 'percent']).sort_index()
data

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,dim,score
dataset,model,percent,Unnamed: 3_level_1,Unnamed: 4_level_1
kuhar,ae,2.5,9,0.600235
kuhar,ae,5.0,16,0.646244
kuhar,ae,25.0,74,0.654695
kuhar,ae,50.0,72,0.663615
kuhar,ae,75.0,198,0.665258
...,...,...,...,...
wisdm,umap,25.0,55,0.627458
wisdm,umap,50.0,144,0.638220
wisdm,umap,75.0,210,0.629068
wisdm,umap,100.0,291,0.642627


In [198]:
# NOTE(review): looks like a scratch check. It writes a sentinel score (99)
# into one row, apparently to make it visible that the following cell replaces
# the scores. The sentinel is never reverted explicitly; it only disappears
# because that cell rewrites the score of every row.
data.loc[('kuhar', 'ae', 2.5),'score'] = 99
data

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,dim,score
dataset,model,percent,Unnamed: 3_level_1,Unnamed: 4_level_1
kuhar,ae,2.5,9,99.000000
kuhar,ae,5.0,16,0.646244
kuhar,ae,25.0,74,0.654695
kuhar,ae,50.0,72,0.663615
kuhar,ae,75.0,198,0.665258
...,...,...,...,...
wisdm,umap,25.0,55,0.627458
wisdm,umap,50.0,144,0.638220
wisdm,umap,75.0,210,0.629068
wisdm,umap,100.0,291,0.642627


In [199]:
# For every best configuration, replace the hyperparameter-search score with
# the score stored in the matching TVT_sb_best_found_2024 score file.
# Nothing is written to disk here. A missing file raises, which is intended:
# every row is expected to have a TVT result.
for index in data.index:
    d_val, m_val, p_val = index
    # File names spell whole percentages without a decimal part ('P5', 'P100')
    # and fractional ones as-is ('P2.5'). Testing for "is a whole number",
    # rather than special-casing 2.5, keeps other fractional values from being
    # silently truncated.
    if float(p_val).is_integer():
        p_val = int(p_val)
    experiment_name = f'../execute_once_experiments/TVT_sb_best_found_2024/scores/TVT_sb_{m_val}_{d_val}_P{p_val}.yaml'
    with open(experiment_name, 'r') as f:
        # NOTE(review): FullLoader kept as-is; prefer yaml.safe_load if these
        # files could come from an untrusted source.
        experiment_data = yaml.load(f, Loader=yaml.FullLoader)
    data.loc[index, 'score'] = experiment_data['score']

In [200]:
# Show the table after the scores were replaced from the TVT score files.
data

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,dim,score
dataset,model,percent,Unnamed: 3_level_1,Unnamed: 4_level_1
kuhar,ae,2.5,9,0.612500
kuhar,ae,5.0,16,0.673611
kuhar,ae,25.0,74,0.638889
kuhar,ae,50.0,72,0.715278
kuhar,ae,75.0,198,0.666667
...,...,...,...,...
wisdm,umap,25.0,55,0.600924
wisdm,umap,50.0,144,0.610324
wisdm,umap,75.0,210,0.608351
wisdm,umap,100.0,291,0.594515


In [201]:
# Persist the table; the (dataset, model, percent) index is written as the
# leading columns because no `index=False` is passed.
data.to_csv('hs_max_TVT.csv')

### Now, how do they behave with test data? (with pydrm)

In [202]:
# For every best configuration, read the score and the pydrm quality metrics
# (continuity, trustworthiness, local continuity meta criterion) of the two
# pydrm runs and store them as columns of `data`:
#   run 1 -> 'score', 'CONT', 'TRUST', 'LCMC'      (overwrites the existing 'score')
#   run 2 -> 'score2', 'CONT2', 'TRUST2', 'LCMC2'
# If either file of a run is missing or malformed, all four of that run's
# columns are set to 0 for that row and the configuration is reported.


def _read_pydrm_run(run_dir, stem):
    """Return ``(score, continuity, trustworthiness, lcmc)`` for one run.

    Reads ``{run_dir}/scores/{stem}.yaml`` for the score and
    ``{run_dir}/results/{stem}.yaml`` for the ``additional.pydrm_report`` entries.

    Raises:
        OSError: a file is missing or unreadable.
        yaml.YAMLError: a file is not valid YAML.
        KeyError, TypeError: an expected entry is absent from the parsed document.
    """
    # NOTE(review): FullLoader kept as-is; prefer yaml.safe_load if these files
    # could come from an untrusted source.
    with open(f'{run_dir}/scores/{stem}.yaml', 'r') as f:
        score = yaml.load(f, Loader=yaml.FullLoader)['score']
    with open(f'{run_dir}/results/{stem}.yaml', 'r') as f:
        report = yaml.load(f, Loader=yaml.FullLoader)['additional']['pydrm_report']
    return (
        score,
        report['continuity'],
        report['trustworthiness'],
        report['local continuity meta criterion'],
    )


# (run directory, column-name suffix) for each pydrm run.
_pydrm_runs = (
    ('../execute_once_experiments/TVT_sb_best_found_2024_pydrm', ''),
    ('../execute_once_experiments/TVT_sb_best_found_2024_pydrm_2', '2'),
)
_pydrm_columns = ('score', 'CONT', 'TRUST', 'LCMC')

for index in data.index:
    d_val, m_val, p_val = index
    # Whole percentages appear without a decimal part in the file names.
    if float(p_val).is_integer():
        p_val = int(p_val)
    stem = f'TVT_sb_{m_val}_{d_val}_P{p_val}'
    for run_dir, suffix in _pydrm_runs:
        try:
            run_values = _read_pydrm_run(run_dir, stem)
        except (OSError, yaml.YAMLError, KeyError, TypeError):
            # Only the expected "no / incomplete result" failures are tolerated;
            # a bare `except:` would also hide interrupts and programming errors.
            print(f'No data for {run_dir}/scores/{stem}.yaml')
            run_values = (0, 0, 0, 0)
        for column, run_value in zip(_pydrm_columns, run_values):
            data.loc[index, column + suffix] = run_value

data.to_csv('hs_max_TVT_pydrm.csv')

## All values from hyperparameter search

In [203]:
# Fresh accumulator for the per-experiment trial tables. This rebinds `data`,
# which until now held the indexed best-trial DataFrame.
data = []

In [204]:
# Gather every trial of every hyperparameter search into `data` as one
# DataFrame per experiment, with columns dataset, model, percent, dim, score.
for dataset in datasets:
    for model in models:
        # UMAP trials store their dimensionality under a different config column.
        dim_col_name = 'config/umap_ncomp' if model == 'umap' else 'config/latent_dim'
        for percentage in percentages:
            experiment_name = f'P10_{model}_{dataset}_p{percentage}'
            if model == 'umap':
                # UMAP experiment folders are named without the 'P10_' prefix.
                experiment_name = experiment_name[4:]
            try:
                experiment_data = pd.read_csv(f'../experiments/{experiment_name}/data.csv')
                # Drop rows whose score equals -0.1.
                # NOTE(review): presumably the marker for failed trials; confirm,
                # since the pydrm search inspected later in this notebook shows
                # -0.001 as its most frequent score instead.
                experiment_data = (
                    experiment_data
                    .loc[experiment_data['score'] != -0.1, [dim_col_name, 'score']]
                    .rename(columns={dim_col_name: 'dim'})
                    .reset_index(drop=True)
                    .assign(dataset=dataset, model=model, percent=percentage)
                )
                experiment_data = experiment_data[['dataset', 'model', 'percent', 'dim', 'score']]
            except (OSError, KeyError, ValueError):
                # Missing, empty or malformed search results: report and move on.
                # A bare `except:` would also hide interrupts and programming errors.
                print(f'No data for {experiment_name}')
                continue
            data.append(experiment_data)

In [205]:
# Stack the per-experiment tables into one frame with a fresh 0..n-1 index and
# display it.
df = pd.concat(data).reset_index(drop=True)
df

Unnamed: 0,dataset,model,percent,dim,score
0,kuhar,umap,2.5,3,0.197887
1,kuhar,umap,2.5,8,0.335681
2,kuhar,umap,2.5,4,0.230751
3,kuhar,umap,2.5,8,0.272300
4,kuhar,umap,2.5,4,0.208920
...,...,...,...,...,...
211578,realworld_waist,convtae,200.0,509,0.757821
211579,realworld_waist,convtae,200.0,540,0.730798
211580,realworld_waist,convtae,200.0,585,0.778263
211581,realworld_waist,convtae,200.0,560,0.698706


In [206]:
# Persist all trials; the row index is not written.
df.to_csv('hs_all.csv', index=False)

## Collecting the PCA results

In [207]:
# Collect the PCA scores. Each score file is named after the number of
# components it was computed with; its row is emitted once for every percent
# value whose dimension cap (percent % of `full_dim`) is at least that number.
# 360 matches the 'dim' recorded for the no-reducer baseline rows above.
# NOTE(review): presumably 360 is the original feature dimensionality; confirm.
full_dim = 360
data = []
for dataset in datasets:
    experiment_name = f'../execute_once_experiments/TV_sb_pca_{dataset}/scores'
    # os.listdir returns entries in arbitrary order; sorting makes the row order
    # (and therefore later tie-breaking between equal scores) reproducible.
    for file in sorted(os.listdir(experiment_name)):
        stem = os.path.splitext(file)[0]
        if not stem.isdigit():
            # Skip stray entries (checkpoint folders, OS metadata files, ...)
            # instead of crashing on int().
            continue
        dim = int(stem)
        with open(f'{experiment_name}/{file}') as f:
            # NOTE(review): FullLoader kept as-is; prefer yaml.safe_load if
            # these files could come from an untrusted source.
            experiment_data = yaml.load(f, Loader=yaml.FullLoader)
        score = experiment_data['score']
        for percent in percentages:
            if dim <= percent * full_dim / 100:
                # A fresh dict per row avoids sharing one mutable template.
                data.append({'dataset': dataset, 'model': 'pca', 'percent': percent, 'dim': dim, 'score': score})

In [208]:
# Turn the PCA rows into a DataFrame (rebinding `data` from list to frame) and
# persist them without the row index.
data = pd.DataFrame(data)
data.to_csv('hs_pca_all.csv', index=False)

In [209]:
# Best PCA row for every (dataset, model, percent) group.
# idxmax() returns index *labels*, so the rows must be fetched with .loc. The
# earlier .iloc lookup was only correct because `data` has a default RangeIndex.
# Selecting all rows at once also keeps the column dtypes; the per-row
# DataFrame(...).T round trip turned every column into object dtype.
best_labels = data.groupby(['dataset', 'model', 'percent'])['score'].idxmax()
data_max = data.loc[best_labels]

In [210]:
# Display the best PCA rows (`data_max` is already a DataFrame at this point).
pd.DataFrame(data_max)

Unnamed: 0,dataset,model,percent,dim,score
860,kuhar,pca,2.5,7,0.506338
296,kuhar,pca,5.0,18,0.617136
711,kuhar,pca,25.0,39,0.630986
712,kuhar,pca,50.0,39,0.630986
713,kuhar,pca,75.0,39,0.630986
714,kuhar,pca,100.0,39,0.630986
715,kuhar,pca,200.0,39,0.630986
2187,motionsense,pca,2.5,9,0.572857
1323,motionsense,pca,5.0,18,0.657619
2068,motionsense,pca,25.0,69,0.692143


## Collecting topology information

In [4]:
# Load the trial table of the P10_convae_kuhar_p25_pydrm experiment.
experiment_name = '../experiments/P10_convae_kuhar_p25_pydrm/data.csv'
data_df = pd.read_csv(experiment_name)

In [6]:
# Number of errors
# value_counts() lists how often each distinct score occurs.
# NOTE(review): presumably failed trials are the ones recorded with the
# negative score; confirm against the experiment runner.
data_df['score'].value_counts()

score
-0.001000    167
 0.612441      5
 0.597418      5
 0.586385      5
 0.594836      5
            ... 
 0.650704      1
 0.584507      1
 0.582629      1
 0.575352      1
 0.537559      1
Name: count, Length: 748, dtype: int64

In [13]:
# Keep only the trials with a strictly positive score, renumber the rows, and
# show the largest 'local continuity meta criterion' among them.
data_df = data_df.loc[data_df['score'] > 0].reset_index(drop=True)
data_df['local continuity meta criterion'].max()

0.3472641509433962

In [14]:
# List the columns available in the trial table.
data_df.columns

Index(['Unnamed: 0', 'score', 'randomforest-100-accuracy (mean)',
       'randomforest-100-accuracy (std)',
       'randomforest-100-f1-score macro (mean)',
       'randomforest-100-f1-score macro (std)',
       'randomforest-100-f1-score weighted (mean)',
       'randomforest-100-f1-score weighted (std)', 'KNN-5-accuracy (mean)',
       'KNN-5-accuracy (std)', 'KNN-5-f1-score macro (mean)',
       'KNN-5-f1-score macro (std)', 'KNN-5-f1-score weighted (mean)',
       'KNN-5-f1-score weighted (std)', 'SVM-rbf-C1.0-accuracy (mean)',
       'SVM-rbf-C1.0-accuracy (std)', 'SVM-rbf-C1.0-f1-score macro (mean)',
       'SVM-rbf-C1.0-f1-score macro (std)',
       'SVM-rbf-C1.0-f1-score weighted (mean)',
       'SVM-rbf-C1.0-f1-score weighted (std)', 'num_params',
       'num_trainable_params', 'residual variance (pearson)',
       'residual variance (spearman)', 'trustworthiness', 'continuity',
       'co k nearest neighbor size', 'local continuity meta criterion',
       'local property', 'g

In [None]:
# NOTE(review): this cell repeats the PCA collection cell from the
# "Collecting the PCA results" section verbatim and has no execution count.
# Running it rebinds `data` to a fresh list of PCA rows. Confirm whether it is
# an unfinished draft for the topology section or can be removed.
data = []
# Row template that is mutated in place; a copy is appended for every match.
value = {'dataset': 'dataset', 'model': 'pca', 'percent': 100, 'dim': 2, 'score': 0}
for dataset in datasets:
    value['dataset'] = dataset
    experiment_name = f'../execute_once_experiments/TV_sb_pca_{dataset}/scores'
    for file in os.listdir(experiment_name):
        # The file name minus its last five characters is parsed as the dimension.
        dim = int(file[:-5])
        value['dim'] = dim
        with open(f'{experiment_name}/{file}') as f:
            experiment_data = yaml.load(f, Loader=yaml.FullLoader)
            value['score'] = experiment_data['score']
            # Emit the row for every percent whose cap (percent % of 360) is >= dim.
            for percent in percentages:
                if dim <= percent*360/100:
                    value['percent'] = percent
                    data.append(value.copy())