In [1]:
from pathlib import Path
import pandas as pd
csv_files = list((Path().glob('*.csv')))
dfs = []
for file in csv_files:
    df = pd.read_csv(file)
    split = str(file).split('_')
    architecture = split[0]
    tokenizer = split[-2]
    random_weights = False
    if('RANDOM' in split):
        random_weights = True
        
    df.insert(0, 'random_weights',random_weights)
    df.insert(0, 'tokenizer',tokenizer)
    df.insert(0, 'architecture',architecture)
    
    df['file_name'] = file
    dfs.append(df)
    # print(df.head())

main_df = pd.concat(dfs, ignore_index=True)

In [2]:
dsets = list(main_df.dataset.unique())

In [3]:
# main_df.head()

In [4]:
dfs = []
for dataset_index in range(7):
    df = main_df[
        # (main_df['random_weights']==True) & 
        # (main_df['architecture']=='DEBERTA') & 
        (main_df['dataset']==dsets[dataset_index])
    ][['architecture','dataset','tokenizer','random_weights','test_acc', 'test_f1', 'test_auroc_weighted','test_pr_auc']]\
    .sort_values('tokenizer')\
    .style\
    .hide(axis='index')\
    .format({"dataset": lambda x:x.lower()})\
    .background_gradient(subset=['test_acc', 'test_f1', 'test_auroc_weighted','test_pr_auc'], cmap='Blues')
    # .highlight_max(subset=['test_auroc_weighted'],color='lightgreen')\
    # .highlight_min(subset=['test_auroc_weighted'],color='#cd4f39')\
    dfs.append(df)
    # main_df.iloc[main_df.groupby(['dataset','tokenizer'])['test_auroc_weighted'].idxmax()]
    # main_df.iloc[main_df.groupby('dataset')['test_acc'].idxmax()]
    


#### The whole table to csv

In [None]:
# main_df.to_csv('all_outputs.csv')

#### Exploration

In [13]:
from genomic_benchmarks.data_check.info import list_datasets

  from tqdm.autonotebook import tqdm


In [14]:
main_df.keys()

Index(['architecture', 'tokenizer', 'random_weights', 'dataset', 'test_acc',
       'test_f1', 'test_loss', 'test_precision', 'test_recall',
       'test_auroc_macro', 'test_auroc_weighted', 'test_pr_auc',
       'min_valid_loss_epoch', 'min_valid_loss_log', 'file_name'],
      dtype='object')

In [15]:
all_datasets = list_datasets()
all_datasets.remove('dummy_mouse_enhancers_ensembl')
all_datasets

['human_nontata_promoters',
 'demo_human_or_worm',
 'drosophila_enhancers_stark',
 'human_ocr_ensembl',
 'human_enhancers_cohn',
 'human_enhancers_ensembl',
 'demo_coding_vs_intergenomic_seqs',
 'human_ensembl_regulatory']

In [16]:
categories = ['dataset', 'architecture', 'tokenizer', 'random_weights']
exp_options = {}
for category in categories:
    exp_options[category] = []

for category in categories:
    exp_options[category].extend(main_df[category].unique())
exp_options

{'dataset': ['demo_coding_vs_intergenomic_seqs',
  'demo_human_or_worm',
  'human_enhancers_cohn',
  'human_enhancers_ensembl',
  'human_nontata_promoters',
  'human_ocr_ensembl',
  'drosophila_enhancers_stark'],
 'architecture': ['BERT', 'DEBERTA', 'PATIENCE-DEBERTA'],
 'tokenizer': ['DNABERTtokenizer',
  'Sentencepiece10tokenizer',
  'Sentencepiece30tokenizer',
  'Kmer8tokenizer',
  'Kmer7tokenizer'],
 'random_weights': [False, True]}

#### Visualise

In [30]:
def style_dataframe(
    df, 
    df_keys = ['architecture','dataset','tokenizer','random_weights','test_acc', 'test_f1', 'test_auroc_weighted','test_pr_auc'], 
    keys_to_style = ['test_acc', 'test_f1', 'test_auroc_weighted','test_pr_auc'],
    sort_by = 'tokenizer'
):
    df = df[df_keys]\
    .sort_values(sort_by)\
    .style\
    .hide(axis='index')\
    .format({"dataset": lambda x:x.lower()})\
    .background_gradient(subset=keys_to_style, cmap='Blues', axis=0)\
    
    
    return df

In [31]:
CHOOSEN_METRIC = 'test_acc'

CHOOSEN_METRIC_PRETRAINED = CHOOSEN_METRIC + '_pretrained'
CHOOSEN_METRIC_DIFF = CHOOSEN_METRIC + '_diff'
choosen_columns = ['architecture', 'tokenizer', 'dataset', CHOOSEN_METRIC]

metrics_df = main_df.copy(deep = True)
metrics_df[choosen_columns]
compared = pd.DataFrame(columns = [*choosen_columns, CHOOSEN_METRIC_PRETRAINED, CHOOSEN_METRIC_DIFF])

for dataset in exp_options['dataset']:
    for architecture in exp_options['architecture']:
        for tokenizer in exp_options['tokenizer']:
            row = metrics_df[(metrics_df['dataset'] == dataset) & (metrics_df['architecture'] == architecture) & (metrics_df['tokenizer'] == tokenizer) & (metrics_df['random_weights'] == True)][choosen_columns].copy(deep=True)
            metrics_random = row[CHOOSEN_METRIC]
            metrics_pretrained = metrics_df[(metrics_df['dataset'] == dataset) & (metrics_df['architecture'] == architecture) & (metrics_df['tokenizer'] == tokenizer) & (metrics_df['random_weights'] == False)][CHOOSEN_METRIC]

            if((metrics_pretrained.size != 0) and (metrics_random.values[0] != 0)):
                metrics_diff = metrics_pretrained.values[0] - metrics_random.values[0] 
                
                row[CHOOSEN_METRIC_PRETRAINED] = metrics_pretrained.values[0]
                row[CHOOSEN_METRIC_DIFF] = metrics_diff
            else:
                row[CHOOSEN_METRIC_PRETRAINED] = 'NaN'
                row[CHOOSEN_METRIC_DIFF] = 'NaN'
        
            compared = pd.concat([compared, row])
            
            


In [32]:
compared[(compared['test_acc_pretrained'] != 'NaN') & (compared['architecture'] == 'DEBERTA')].columns

Index(['architecture', 'tokenizer', 'dataset', 'test_acc',
       'test_acc_pretrained', 'test_acc_diff'],
      dtype='object')

In [33]:
compared.round(decimals=4)
style_dataframe(
    compared[(compared[CHOOSEN_METRIC_PRETRAINED] != 'NaN') & (compared['architecture'] == 'DEBERTA')],
    [*choosen_columns, CHOOSEN_METRIC_PRETRAINED, CHOOSEN_METRIC_DIFF],
    [CHOOSEN_METRIC_DIFF]
)

architecture,tokenizer,dataset,test_acc,test_acc_pretrained,test_acc_diff
DEBERTA,DNABERTtokenizer,demo_coding_vs_intergenomic_seqs,0.9012,0.91832,0.01712
DEBERTA,DNABERTtokenizer,drosophila_enhancers_stark,0.501734,0.686705,0.184971
DEBERTA,DNABERTtokenizer,human_ocr_ensembl,0.718156,0.7706,0.052443
DEBERTA,DNABERTtokenizer,demo_human_or_worm,0.9546,0.95932,0.00472
DEBERTA,DNABERTtokenizer,human_nontata_promoters,0.83551,0.901041,0.06553
DEBERTA,DNABERTtokenizer,human_enhancers_cohn,0.730282,0.731434,0.001151
DEBERTA,DNABERTtokenizer,human_enhancers_ensembl,0.812851,0.862577,0.049726
DEBERTA,Kmer8tokenizer,human_ocr_ensembl,0.719072,0.755579,0.036507
DEBERTA,Kmer8tokenizer,human_nontata_promoters,0.892517,0.912553,0.020035
DEBERTA,Kmer8tokenizer,human_enhancers_ensembl,0.793736,0.83442,0.040685


In [35]:
style_dataframe(
    compared[(compared[CHOOSEN_METRIC_PRETRAINED] != 'NaN') & (compared['architecture'] == 'BERT')],
    [*choosen_columns, CHOOSEN_METRIC_PRETRAINED, CHOOSEN_METRIC_DIFF],
    [CHOOSEN_METRIC_DIFF],
    sort_by = 'dataset'
)

architecture,tokenizer,dataset,test_acc,test_acc_pretrained,test_acc_diff
BERT,DNABERTtokenizer,demo_coding_vs_intergenomic_seqs,0.89956,0.9246,0.02504
BERT,DNABERTtokenizer,demo_coding_vs_intergenomic_seqs,0.90092,0.9246,0.02504
BERT,DNABERTtokenizer,demo_coding_vs_intergenomic_seqs,0.90092,0.9246,0.02504
BERT,DNABERTtokenizer,demo_human_or_worm,0.95408,0.9658,0.01172
BERT,DNABERTtokenizer,demo_human_or_worm,0.95348,0.9658,0.01172
BERT,DNABERTtokenizer,demo_human_or_worm,0.95348,0.9658,0.01172
BERT,DNABERTtokenizer,drosophila_enhancers_stark,0.626012,0.703468,0.077457
BERT,DNABERTtokenizer,human_enhancers_cohn,0.72654,0.74194,0.013529
BERT,DNABERTtokenizer,human_enhancers_cohn,0.72654,0.74194,0.013529
BERT,DNABERTtokenizer,human_enhancers_cohn,0.728411,0.74194,0.013529
