# Importing

In [1]:
from pathlib import Path
from scipy.stats.mstats import gmean
import pandas as pd
from scipy.stats import wilcoxon, ttest_ind
from scipy.stats import shapiro
from scipy.stats import friedmanchisquare
import numpy as np

## Implementation

In [2]:
def caclculate_average_for_ds(dataset_name, path):
    TF_AL_ds_av = []
    TF_non_AL_ds_av = []
    DS_ad_av = []
    
    for directory in Path(path).iterdir():
        if str(directory).split('/')[-1] != 'figures':
            if str(directory).split('/')[-1].split('_')[1] in ['SF', 'SP2']:
                print(str(directory).split('/')[-1])
                TF_AL = pd.read_csv(directory / 'TF_ML_AL.csv').iloc[:, 2:].mean()
                TF_non_AL = pd.read_csv(directory / 'TF_ML_non_AL.csv').iloc[:, 2:].mean()
                DS = pd.read_csv(directory / 'DeepSCAMs.csv').iloc[:, 2:].mean()
                TF_AL_ds_av.append(TF_AL.tolist())
                TF_non_AL_ds_av.append(TF_non_AL.tolist())
                DS_ad_av.append(DS.tolist())
    res = [np.array(it) for it in [TF_AL_ds_av, TF_non_AL_ds_av, DS_ad_av]]
    metrics = TF_AL.index
    return res, metrics
            

## SF

In [3]:
[TF_AL_ds_av_1, TF_non_AL_ds_av_1, DS_ad_av_1], met = caclculate_average_for_ds('SF', 
                                                                         '/home/khali/scams/Results/Study_2')

ADASYN_SF_TTS
SMOTE_SP2_TTS
IHT_SF_TTS
CNN_SP2_TTS
CNN_SF_TTS
SMOTE_SF_TTS
ADASYN_SP2_TTS
IHT_SP2_TTS


In [4]:
p_values_study_2 = []
for dim, m in zip(range(TF_AL_ds_av_1.shape[1]), met):
    _, p_al_non_al = wilcoxon(TF_AL_ds_av_1[:, dim], TF_non_AL_ds_av_1[:, dim])
    _, p_al_DS = wilcoxon(TF_AL_ds_av_1[:, dim], DS_ad_av_1[:, dim])
    _, p_non_al_DS = wilcoxon(TF_non_AL_ds_av_1[:, dim], DS_ad_av_1[:, dim])
    p_values_study_2.append([p_al_non_al, p_al_DS, p_non_al_DS, m])
p_values = pd.DataFrame(p_values_study_2, columns=['non-AL TF ML vs. AL TF ML', 'AL TF ML vs. DeepSCAMs', 
                                        'non-AL TF ML vs. DeepSCAMs', 'metric'])
    

In [5]:
p_values.to_csv('p_values.csv')

In [6]:
p_values

Unnamed: 0,non-AL TF ML vs. AL TF ML,AL TF ML vs. DeepSCAMs,non-AL TF ML vs. DeepSCAMs,metric
0,0.007812,0.007812,0.945312,AUC_LB_test
1,0.007812,0.007812,1.0,AUC_test
2,0.007812,0.007812,0.84375,AUC_UB_test
3,0.007812,0.007812,0.382812,Accuracy_test
4,0.007812,0.007812,0.195312,F1_test
5,0.007812,0.007812,0.546875,MCC_test
6,0.007812,0.007812,0.007812,AUC_LB_validation
7,0.007812,0.007812,0.007812,AUC_validation
8,0.007812,0.007812,0.007812,AUC_UB_validation
9,0.007812,0.007812,0.007812,Accuracy_validation


# Study design 2

In [7]:
def caclculate_p_each_study(path):
    TF_AL_TF_non_AL = []
    TF_AL_ds = []
    study_names = []
    TF_AL_codes = []
    columns = []
    for directory in Path(path).iterdir():
        if directory.is_dir():
            try:
                if str(directory).split('/')[-1].split('_')[0] in ['CNN', 'IHT', 'SMOTE', 'ADASYN']:
                    study_names.append(str(directory).split('/')[-1])
                    TF_AL = pd.read_csv(directory / 'TF_ML_AL.csv').iloc[:, 2:]
                    TF_non_AL = pd.read_csv(directory / 'TF_ML_non_AL.csv').iloc[:, 2:]
                    DS = pd.read_csv(directory / 'DeepSCAMs.csv').iloc[:, 2:]
                    run_results_1 = []
                    run_results_2 = []
                    code_for_TF_AL = []
                    for col in TF_AL.columns:
                        _, p_v_al_non_al = wilcoxon(TF_AL[col], TF_non_AL[col])
                        run_results_1.append(p_v_al_non_al*12)
                        _, p_v_al_ds = wilcoxon(TF_AL[col], DS[col])
                        run_results_2.append(p_v_al_ds*12)
                        if p_v_al_non_al*12 < 0.05 and p_v_al_ds*12 < 0.05 and TF_non_AL[col].mean() < TF_AL[col].mean() > DS[col].mean():
                            code_for_TF_AL.append('A')
                        elif TF_non_AL[col].mean() < TF_AL[col].mean() > DS[col].mean() and ((p_v_al_non_al*12 > 0.05 or p_v_al_ds*12 > 0.05) or (p_v_al_non_al*12 > 0.05 and p_v_al_ds*12 > 0.05)):
                            code_for_TF_AL.append('B')
                        else:
                            code_for_TF_AL.append('C')

                    TF_AL_TF_non_AL.append(run_results_1)
                    TF_AL_ds.append(run_results_2)
                    TF_AL_codes.append(code_for_TF_AL)
            except:
                print('not a directory')
                
    columns.append(TF_AL.columns.tolist())
    return TF_AL_TF_non_AL, TF_AL_ds, TF_AL_codes, study_names, columns

### Study 2

In [8]:
TF_AL_TF_non_AL_2, TF_AL_ds_2, TF_AL_codes_2, directories_2, columns = caclculate_p_each_study('/home/khali/scams/Results/Study_2')



In [9]:
directories_2

['ADASYN_SF_TTS',
 'SMOTE_SP2_TTS',
 'IHT_SF_TTS',
 'CNN_SP2_TTS',
 'CNN_SF_TTS',
 'SMOTE_SF_TTS',
 'ADASYN_SP2_TTS',
 'IHT_SP2_TTS']

In [10]:
TF_AL_codes_2 = pd.DataFrame(TF_AL_codes_2, index=[directories_2], columns=columns)

In [11]:
TF_AL_codes_2.to_csv('TF_AL_codes_2.csv')

In [12]:
TF_AL_codes_2.isin(['C']).sum(axis=0).sum()

0

In [13]:
TF_AL_codes_2.isin(['B']).sum(axis=0).sum()

12

In [14]:
TF_AL_codes_2.isin(['A']).sum(axis=0).sum()

84

In [15]:
12/(12+84)

0.125

In [16]:
84/(12+84)

0.875

In [17]:
TF_AL_TF_non_AL_2 = pd.DataFrame(TF_AL_TF_non_AL_2, index=[directories_2], columns=columns)

In [18]:
TF_AL_TF_non_AL_2.to_csv('TF_AL_TF_non_AL_1.csv')

In [19]:
TF_AL_ds_2 = pd.DataFrame(TF_AL_ds_2, index=[directories_2], columns=columns)

In [20]:
TF_AL_ds_2.to_csv('TF_AL_ds_2.csv')