In [1]:
!nvidia-smi

Sun Aug 14 23:29:02 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 515.48.07    Driver Version: 515.48.07    CUDA Version: 11.7     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  NVIDIA A40          On   | 00000000:A3:00.0 Off |                    0 |
|  0%   45C    P8    36W / 300W |      0MiB / 46068MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [2]:
### Parameters
from pathlib import Path  # imported here so this cell runs on a fresh kernel

# --- Model initialisation ---------------------------------------------------
# True: the training loop re-creates the model from its config instead of
# keeping the pretrained weights (and warm-up is switched off).
RANDOMIZE_WEIGHTS = False
# Only needed when the tokenizer's vocabulary size differs from the original
# model's. The training loop consults it only when RANDOMIZE_WEIGHTS is True.
RESIZE_EMBEDDINGS = False

# CSV file the per-dataset test metrics are written to at the end of the notebook.
OUTPUT_PATH = './DEBERTA_Kmer8tokenizer_metrics.csv'

# --- Model / tokenizer ------------------------------------------------------
MODEL_NAME = "davidcechak/DNADebertaK8b"
TOKENIZER_NAME = "armheb/DNA_bert_6"
# k-mer length; None means the raw sequence is handed to the tokenizer unsplit.
K = 8
# 1 -> overlapping k-mers (sliding window); any other value -> non-overlapping k-mers.
STRIDE = 1

# Alternative configuration (SentencePiece tokenizer, no k-mer splitting):
# MODEL_NAME = "Vlasta/DNADebertaSentencepiece30k"
# TOKENIZER_NAME = "Vlasta/DNADebertaSentencepiece30k"
# K = None
# STRIDE = None

# --- Datasets: list of (dataset name, dataset version) -----------------------
# All datasets
# DATASETS = [('demo_coding_vs_intergenomic_seqs', 0),
#  ('demo_human_or_worm', 0), ('human_enhancers_cohn', 0), ('human_enhancers_ensembl', 0),
#  ('human_ensembl_regulatory', 0), ('human_nontata_promoters', 0), ('human_ocr_ensembl', 0), ('drosophila_enhancers_stark', 0)]

# Quick check dataset
# DATASETS = [('demo_human_or_worm', 0)]

# Binary classification datasets (without human_ensembl_regulatory)
DATASETS = [('demo_coding_vs_intergenomic_seqs', 0),
 ('demo_human_or_worm', 0), ('human_enhancers_cohn', 0), ('human_enhancers_ensembl', 0),
  ('human_nontata_promoters', 0), ('human_ocr_ensembl', 0), ('drosophila_enhancers_stark', 0)]

# Directory the training loop reads the downloaded benchmarks from. The download
# cell does not pass a destination, so this has to be the library's default
# cache directory. It is derived from the current user's home instead of a
# hardcoded '/home/jovyan/...' so the notebook also works under other accounts.
# NOTE(review): the default is assumed to be ~/.genomic_benchmarks - confirm
# against the installed genomic_benchmarks version.
BENCHMARKS_FOLDER = str(Path.home() / '.genomic_benchmarks')
# If Ensembl refuses the connection ("[Errno 104] Connection reset by peer"),
# keep this True so that download_dataset is called with use_cloud_cache=True.
USE_CLOUD_CACHE = True
# If strictly between 0 and 1, each sequence is kept independently with this
# probability; 1 (or a falsy value such as 0/None) keeps every sequence.
DATASET_THINING = 1

# --- Training hyper-parameters ----------------------------------------------
BATCH_SIZE = 32        # per-device batch size for both training and evaluation
ACCUMULATION = 2       # gradient accumulation steps
LEARNING_RATE = 1e-5
EPOCHS = 100           # upper bound; early stopping may end training sooner
RUNS = 1               # number of fine-tuning runs per dataset

print(DATASETS)

[('demo_coding_vs_intergenomic_seqs', 0), ('demo_human_or_worm', 0), ('human_enhancers_cohn', 0), ('human_enhancers_ensembl', 0), ('human_nontata_promoters', 0), ('human_ocr_ensembl', 0), ('drosophila_enhancers_stark', 0)]


In [3]:
from transformers import TrainingArguments
from transformers import EarlyStoppingCallback
# Share of the training steps used for learning-rate warm-up
# (0.05 of a 100-epoch schedule corresponds to 5 epochs).
warmup_ratio = 0.05
if RANDOMIZE_WEIGHTS:
    # No warm-up when the model starts from randomly initialised weights.
    warmup_ratio = 0


def get_trainargs():
    """Build a fresh ``TrainingArguments`` object for one fine-tuning run.

    Checkpoints go to the ``'outputs'`` directory. Evaluation and checkpointing
    both happen once per epoch, and the best checkpoint is reloaded when
    training ends (``load_best_model_at_end=True``). The seed is drawn at
    random on every call, so repeated runs use different seeds.

    Returns
    -------
    TrainingArguments
        Arguments built from the notebook-level constants ``LEARNING_RATE``,
        ``BATCH_SIZE``, ``ACCUMULATION``, ``EPOCHS`` and ``warmup_ratio``.
    """
    # Imported locally: the notebook's only other import of randrange lives in
    # a later cell, so calling this function before that cell has run would
    # otherwise raise NameError.
    from random import randrange

    return TrainingArguments(
        'outputs',
        learning_rate=LEARNING_RATE,
        warmup_ratio=warmup_ratio,
        lr_scheduler_type='linear',
        fp16=True,
        evaluation_strategy="epoch",
        per_device_train_batch_size=BATCH_SIZE,
        per_device_eval_batch_size=BATCH_SIZE,
        gradient_accumulation_steps=ACCUMULATION,
        num_train_epochs=EPOCHS,
        weight_decay=0.01,
        save_strategy='epoch',
        seed=randrange(1, 10001),  # random integer in [1, 10000]
        report_to='none',
        load_best_model_at_end=True,
    )
# Early stopping with a patience of 5 evaluations (the original note calls
# this "5 epochs"); a threshold of 0.0 means any improvement counts.
early_stopping = EarlyStoppingCallback(
    early_stopping_patience=5,
    early_stopping_threshold=0.0,
)
callbacks = [early_stopping]

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
from itertools import product
from transformers import AutoTokenizer

# Start from the pretrained tokenizer named by TOKENIZER_NAME.
tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_NAME)

# For K > 6, every possible K-mer over the DNA alphabet is added as an extra token.
# NOTE(review): the letter order fixes the order in which tokens are added.
# Presumably the resulting ids must match the vocabulary MODEL_NAME was trained
# with, so do not reorder without confirming.
needs_extra_kmers = K is not None and K > 6
if needs_extra_kmers:
    alphabet = ('A', 'C', 'T', 'G')
    vocab = [''.join(letters) for letters in product(alphabet, repeat=K)]
    tokenizer.add_tokens(vocab)

In [5]:
def kmers_strideK(s, k=K):
    """Split ``s`` into consecutive, non-overlapping chunks of length ``k``.

    A trailing remainder shorter than ``k`` is dropped.
    """
    last_start = len(s) - k
    return [s[start:start + k] for start in range(0, last_start + 1, k)]

def kmers_stride1(s, k=K):
    """Return every length-``k`` window of ``s``, moving one position at a time."""
    window_count = len(s) - k + 1
    return [s[start:start + k] for start in range(window_count)]

# Choose the k-mer splitter once: sliding window for STRIDE == 1,
# non-overlapping chunks for any other value.
kmers = kmers_stride1 if STRIDE == 1 else kmers_strideK

# Function used for the actual tokenization. Which variant exists is decided
# here, at definition time, from K.
if K is None:
    def tok_func(x):
        """Tokenize the raw sequence stored under ``x["seq"]`` (with truncation)."""
        return tokenizer(x["seq"], truncation=True)
else:
    def tok_func(x):
        """Split ``x["seq"]`` into k-mers and tokenize them as one space-separated string."""
        kmer_text = " ".join(kmers(x["seq"]))
        return tokenizer(kmer_text, truncation=True)

# Sanity check: tokenize a short sequence, print the encoding, and show the
# decoded tokens as the cell's result.
example_seq = 'ATGGAAAGAGGCACCATTCT'
example = tok_func({'seq': example_seq})
print(example)
tokenizer.decode(example['input_ids'])

{'input_ids': [2, 16136, 52241, 65592, 53460, 4930, 7417, 17366, 57162, 19737, 66647, 57679, 21806, 9387, 3], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}


'[CLS] ATGGAAAG TGGAAAGA GGAAAGAG GAAAGAGG AAAGAGGC AAGAGGCA AGAGGCAC GAGGCACC AGGCACCA GGCACCAT GCACCATT CACCATTC ACCATTCT [SEP]'

## Download benchmark datasets

In [6]:
from genomic_benchmarks.loc2seq import download_dataset
from genomic_benchmarks.data_check.info import is_downloaded
from pathlib import Path
from tqdm.autonotebook import tqdm

# Directory the training loop reads the sequence files from.
benchmark_root = Path(BENCHMARKS_FOLDER)

# Fetch every configured benchmark that is not available locally yet.
# NOTE(review): neither call below is given a destination, so both rely on the
# library's default location. BENCHMARKS_FOLDER is assumed to point at that
# same directory - confirm.
for dataset_name, dataset_version in tqdm(DATASETS):
    if is_downloaded(dataset_name):
        continue
    download_dataset(dataset_name, version=dataset_version, use_cloud_cache=USE_CLOUD_CACHE)

100%|██████████| 7/7 [00:00<00:00, 21290.88it/s]


## Function to extract dataframe metrics row from training logs

In [7]:
def get_log_from_history(history, dataset_name):
    """Condense a training log history into one metrics row for the results table.

    Parameters
    ----------
    history : list of dict
        Log entries (``trainer.state.log_history`` in this notebook). Entries
        containing an ``'eval_loss'`` key are treated as validation logs, and
        entries containing a ``'test_loss'`` key as test-set logs.
    dataset_name : str
        Stored unchanged in the ``'dataset'`` column.

    Returns
    -------
    dict
        ``'dataset'``, the test metrics taken from the first test entry, the
        epoch with the lowest validation loss (``'min_valid_loss_epoch'``) and
        that complete validation entry (``'min_valid_loss_log'``). A test
        metric that is absent from the log is reported as ``None``. This is
        the case on the non-binary path, which only logs accuracy.

    Raises
    ------
    IndexError
        If ``history`` contains no test entry.
    ValueError
        If ``history`` contains no validation entry.
    """
    eval_dicts = [entry for entry in history if 'eval_loss' in entry]
    test_dicts = [entry for entry in history if 'test_loss' in entry]

    if not test_dicts:
        raise IndexError(
            f"no test metrics (entry with 'test_loss') logged for dataset {dataset_name!r}"
        )
    if not eval_dicts:
        raise ValueError(
            f"no validation metrics (entry with 'eval_loss') logged for dataset {dataset_name!r}"
        )

    test_log = test_dicts[0]
    min_loss_dict = min(eval_dicts, key=lambda entry: entry['eval_loss'])

    # (result column, key in the test log). The combined metric reports the two
    # roc_auc variants as rocauc_0 (macro) and rocauc_1 (weighted), in the
    # order they were combined.
    metric_keys = (
        ('test_acc', 'test_accuracy'),
        ('test_f1', 'test_f1'),
        ('test_loss', 'test_loss'),
        ('test_precision', 'test_precision'),
        ('test_recall', 'test_recall'),
        ('test_auroc_macro', 'test_rocauc_0_roc_auc'),
        ('test_auroc_weighted', 'test_rocauc_1_roc_auc'),
        ('test_pr_auc', 'test_pr_auc'),
    )

    row = {'dataset': dataset_name}
    # .get() instead of indexing: metrics missing from the log become None
    # rather than aborting with KeyError after a full training run.
    row.update((column, test_log.get(log_key)) for column, log_key in metric_keys)
    row['min_valid_loss_epoch'] = min_loss_dict['epoch']
    row['min_valid_loss_log'] = min_loss_dict
    return row

## Looping through datasets, fine-tuning the model for each of them, logging metrics

In [8]:
import evaluate
# The order of the two roc_auc metrics matters for logging: macro first, then
# weighted. get_log_from_history reads them back by position, as
# 'test_rocauc_0_roc_auc' (macro) and 'test_rocauc_1_roc_auc' (weighted).
roc_auc_macro = evaluate.load('roc_auc', average='macro')
roc_auc_weighted = evaluate.load('roc_auc', average='weighted')
pr_auc = evaluate.load("Vlasta/pr_auc")

binary_metrics = evaluate.combine(
    ['accuracy', 'f1', 'recall', 'precision', roc_auc_macro, roc_auc_weighted, pr_auc]
)
# Usage example:
# binary_metrics.compute(references=[0,1,1,1], predictions=[0,0,1,1], prediction_scores=[0.4,0.3,0.6,0.9])


In [None]:
import pandas as pd
import numpy as np
from random import random, randrange
from transformers import AutoModelForSequenceClassification
from transformers import TrainingArguments, Trainer
from datasets import Dataset, DatasetDict, load_metric
import torch

def compute_metrics_binary(eval_preds):
    """Compute the combined binary-classification metrics for one evaluation pass.

    ``eval_preds`` unpacks into ``(logits, labels)``. The logits are turned
    into class probabilities with a softmax over the last axis, computed in
    float64. The predicted class is the arg-max of those probabilities, and
    the probability in column 1 (label 1) is passed on as the prediction score.
    """
    logits, labels = eval_preds
    logits_f64 = torch.from_numpy(logits).double()
    class_probs = torch.softmax(logits_f64, dim=-1).numpy()
    # Taking the arg-max of the raw logits would give the same classes.
    predicted_classes = class_probs.argmax(axis=-1)
    positive_scores = class_probs[:, 1]
    return binary_metrics.compute(
        predictions=predicted_classes,
        references=labels,
        prediction_scores=positive_scores,
    )
    
#TODO human_ensembl_regulatory dataset multilabel metrics
def compute_metrics_multi(eval_preds):
    """Compute plain accuracy for datasets with more than two classes.

    ``eval_preds`` unpacks into ``(logits, labels)``, and the predicted class
    is the arg-max of the logits over the last axis. Returns the dict produced
    by the accuracy metric's ``compute``.
    """
    # Uses `evaluate` (already used for the binary metrics) instead of the
    # deprecated `datasets.load_metric`.
    metric = evaluate.load("accuracy")
    logits, labels = eval_preds
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

# One metrics row (dict) per dataset and run; turned into a DataFrame in the "Outputs" section.
outputs = []

for dataset_name, dataset_version in tqdm(DATASETS):
    # Class names are the entries of the train directory; sorting makes the
    # label ids (their positions in this list) deterministic.
    labels = sorted([x.stem for x in (benchmark_root / dataset_name / 'train').iterdir()])

    # key -> (split, label id, sequence text); one entry per sequence file.
    tmp_dict = {}

    for split in ['train', 'test']:
        for nlabel, label in enumerate(labels):
            for f in (benchmark_root / dataset_name / split / label).glob('*.txt'):
                # Thinning: a falsy value or exactly 1 keeps everything. Otherwise
                # each sequence is kept with probability DATASET_THINING. Dropped
                # files are skipped before they are read.
                if DATASET_THINING and DATASET_THINING != 1 and random() >= DATASET_THINING:
                    continue
                # The split is part of the key so that a train file and a test file
                # with the same label and file name cannot overwrite each other.
                tmp_dict[f"{split} {label} {f.stem}"] = (split, nlabel, f.read_text())

    df = pd.DataFrame.from_dict(tmp_dict).T.rename(columns = {0: "dset", 1: "cat", 2: "seq"})

    ds = Dataset.from_pandas(df)

    # Tokenize one example at a time. The raw sequence and the index column that
    # from_pandas created from the dict keys are dropped afterwards.
    tok_ds = ds.map(tok_func, batched=False, remove_columns=['__index_level_0__', 'seq'])
    tok_ds = tok_ds.rename_columns({'cat':'labels'})

    dds = DatasetDict({
        'train': tok_ds.filter(lambda x: x["dset"] == "train").remove_columns('dset'),
        'test':  tok_ds.filter(lambda x: x["dset"] == "test").remove_columns('dset')
    })
    # Hold out 20% of the training split for validation; the fixed seed keeps
    # the split identical between executions.
    train_valid_split = dds['train'].train_test_split(test_size=0.2, shuffle=True, seed=42)
    dds['train']=train_valid_split['train']
    dds['valid']=train_valid_split['test']

    # Two classes -> full binary metric set; anything else -> accuracy only.
    compute_metrics = compute_metrics_binary if len(labels) == 2 else compute_metrics_multi

    for _ in range(RUNS):
        model_cls = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=len(labels))
        if(RANDOMIZE_WEIGHTS):
            # Re-create the model from its configuration only.
            # model_cls.init_weights() #Alternative
            model_cls = AutoModelForSequenceClassification.from_config(model_cls.config)
            # NOTE(review): the embedding resize only happens on the
            # randomized-weights path - confirm that this nesting is intended.
            if(RESIZE_EMBEDDINGS):
                model_cls.resize_token_embeddings(len(tokenizer))

        args = get_trainargs()

        trainer = Trainer(model_cls, args, train_dataset=dds['train'], eval_dataset=dds['valid'],
                          tokenizer=tokenizer, compute_metrics=compute_metrics,
                          callbacks=callbacks)
        trainer.train()
        # The 'test' prefix makes the metrics appear as test_* in the log history,
        # which is what get_log_from_history looks for.
        # NOTE(review): the "did not find eval_loss so early stopping is disabled"
        # message in the logs appears to come from this call, because the early
        # stopping callback sees only test_* metrics here - confirm.
        trainer.evaluate(dds['test'], metric_key_prefix='test')
        training_log = get_log_from_history(trainer.state.log_history, dataset_name=dataset_name)
        outputs.append(training_log)
  

  0%|          | 0/7 [00:00<?, ?it/s]
  0%|          | 0/100000 [00:00<?, ?ex/s][A
  0%|          | 180/100000 [00:00<00:55, 1794.40ex/s][A
  0%|          | 379/100000 [00:00<00:52, 1906.14ex/s][A
  1%|          | 583/100000 [00:00<00:50, 1967.15ex/s][A
  1%|          | 787/100000 [00:00<00:49, 1993.22ex/s][A
  1%|          | 987/100000 [00:00<00:50, 1970.58ex/s][A
  1%|          | 1185/100000 [00:00<00:55, 1765.30ex/s][A
  1%|▏         | 1391/100000 [00:00<00:53, 1852.83ex/s][A
  2%|▏         | 1588/100000 [00:00<00:52, 1888.00ex/s][A
  2%|▏         | 1789/100000 [00:00<00:51, 1923.07ex/s][A
  2%|▏         | 1996/100000 [00:01<00:49, 1964.87ex/s][A
  2%|▏         | 2194/100000 [00:01<00:54, 1792.15ex/s][A
  2%|▏         | 2395/100000 [00:01<00:52, 1851.20ex/s][A
  3%|▎         | 2598/100000 [00:01<00:51, 1901.19ex/s][A
  3%|▎         | 2805/100000 [00:01<00:49, 1947.30ex/s][A
  3%|▎         | 3002/100000 [00:01<00:54, 1786.22ex/s][A
  3%|▎         | 3201/100000 [00:01<

Epoch,Training Loss,Validation Loss,Accuracy,F1,Recall,Precision,Rocauc 0 Roc Auc,Rocauc 1 Roc Auc,Pr Auc
0,0.6366,0.372094,0.8814,0.876037,0.838245,0.917396,0.951186,0.951186,0.949966
1,0.3292,0.27616,0.899,0.8943,0.854647,0.937811,0.965774,0.965774,0.965687
2,0.2243,0.238542,0.911933,0.909897,0.889452,0.931304,0.97096,0.97096,0.970544
3,0.161,0.237422,0.91,0.9066,0.873716,0.942056,0.973529,0.973529,0.973457
4,0.1182,0.252106,0.910867,0.908831,0.888652,0.929947,0.97103,0.97103,0.972356
5,0.0916,0.336601,0.911933,0.909922,0.889719,0.931063,0.9699,0.9699,0.971637
6,0.0742,0.349274,0.909467,0.907126,0.884385,0.931068,0.968681,0.968681,0.970442
7,0.0579,0.465542,0.903067,0.898307,0.856381,0.944551,0.965833,0.965833,0.967671
8,0.0427,0.464185,0.9092,0.907195,0.887718,0.927546,0.961924,0.961924,0.963262


***** Running Evaluation *****
  Num examples = 15000
  Batch size = 32
Saving model checkpoint to outputs/checkpoint-937
Configuration saved in outputs/checkpoint-937/config.json
Model weights saved in outputs/checkpoint-937/pytorch_model.bin
tokenizer config file saved in outputs/checkpoint-937/tokenizer_config.json
Special tokens file saved in outputs/checkpoint-937/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 15000
  Batch size = 32
Saving model checkpoint to outputs/checkpoint-1874
Configuration saved in outputs/checkpoint-1874/config.json
Model weights saved in outputs/checkpoint-1874/pytorch_model.bin
tokenizer config file saved in outputs/checkpoint-1874/tokenizer_config.json
Special tokens file saved in outputs/checkpoint-1874/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 15000
  Batch size = 32
Saving model checkpoint to outputs/checkpoint-2811
Configuration saved in outputs/checkpoint-2811/config.json
Model weights saved i

early stopping required metric_for_best_model, but did not find eval_loss so early stopping is disabled
 14%|█▍        | 1/7 [36:10<3:37:00, 2170.02s/it]
  0%|          | 0/100000 [00:00<?, ?ex/s][A
  0%|          | 198/100000 [00:00<00:50, 1978.38ex/s][A
  0%|          | 405/100000 [00:00<00:49, 2032.17ex/s][A
  1%|          | 617/100000 [00:00<00:48, 2068.79ex/s][A
  1%|          | 829/100000 [00:00<00:47, 2087.59ex/s][A
  1%|          | 1038/100000 [00:00<00:53, 1851.96ex/s][A
  1%|          | 1228/100000 [00:00<01:02, 1588.50ex/s][A
  1%|▏         | 1431/100000 [00:00<00:57, 1708.79ex/s][A
  2%|▏         | 1627/100000 [00:00<00:55, 1777.84ex/s][A
  2%|▏         | 1835/100000 [00:00<00:52, 1864.34ex/s][A
  2%|▏         | 2027/100000 [00:01<00:56, 1741.58ex/s][A
  2%|▏         | 2230/100000 [00:01<00:53, 1821.58ex/s][A
  2%|▏         | 2438/100000 [00:01<00:51, 1894.55ex/s][A
  3%|▎         | 2647/100000 [00:01<00:49, 1949.66ex/s][A
  3%|▎         | 2855/100000 [00:01<0

Epoch,Training Loss,Validation Loss,Accuracy,F1,Recall,Precision,Rocauc 0 Roc Auc,Rocauc 1 Roc Auc,Pr Auc
0,0.6458,0.321309,0.925067,0.927013,0.95186,0.90343,0.978778,0.978778,0.979016
1,0.2577,0.215284,0.930733,0.934019,0.980664,0.89161,0.988806,0.988806,0.988744
2,0.1299,0.163547,0.943067,0.945284,0.983731,0.90973,0.992407,0.992407,0.99238
3,0.075,0.1418,0.956733,0.957551,0.97613,0.939666,0.993166,0.993166,0.993197
4,0.0483,0.199772,0.944267,0.946547,0.987065,0.909225,0.993386,0.993386,0.993408
5,0.0286,0.233461,0.954267,0.955455,0.981064,0.931148,0.991605,0.991605,0.989454
6,0.016,0.454115,0.924533,0.929374,0.993199,0.873256,0.991995,0.991995,0.99017
7,0.0111,0.382237,0.944867,0.947119,0.987598,0.909828,0.98664,0.98664,0.977326
8,0.01,0.282819,0.959333,0.959528,0.964262,0.95484,0.991332,0.991332,0.990518


***** Running Evaluation *****
  Num examples = 15000
  Batch size = 32
Saving model checkpoint to outputs/checkpoint-937
Configuration saved in outputs/checkpoint-937/config.json
Model weights saved in outputs/checkpoint-937/pytorch_model.bin
tokenizer config file saved in outputs/checkpoint-937/tokenizer_config.json
Special tokens file saved in outputs/checkpoint-937/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 15000
  Batch size = 32
Saving model checkpoint to outputs/checkpoint-1874
Configuration saved in outputs/checkpoint-1874/config.json
Model weights saved in outputs/checkpoint-1874/pytorch_model.bin
tokenizer config file saved in outputs/checkpoint-1874/tokenizer_config.json
Special tokens file saved in outputs/checkpoint-1874/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 15000
  Batch size = 32
Saving model checkpoint to outputs/checkpoint-2811
Configuration saved in outputs/checkpoint-2811/config.json
Model weights saved i

early stopping required metric_for_best_model, but did not find eval_loss so early stopping is disabled
 29%|██▊       | 2/7 [1:12:13<3:00:31, 2166.31s/it]
  0%|          | 0/27791 [00:00<?, ?ex/s][A
  0%|          | 85/27791 [00:00<00:32, 843.97ex/s][A
  1%|          | 177/27791 [00:00<00:31, 887.10ex/s][A
  1%|          | 270/27791 [00:00<00:30, 902.30ex/s][A
  1%|▏         | 363/27791 [00:00<00:30, 912.42ex/s][A
  2%|▏         | 455/27791 [00:00<00:29, 912.07ex/s][A
  2%|▏         | 549/27791 [00:00<00:29, 919.46ex/s][A
  2%|▏         | 642/27791 [00:00<00:29, 920.07ex/s][A
  3%|▎         | 735/27791 [00:00<00:29, 920.62ex/s][A
  3%|▎         | 828/27791 [00:00<00:29, 918.26ex/s][A
  3%|▎         | 921/27791 [00:01<00:29, 919.56ex/s][A
  4%|▎         | 1013/27791 [00:01<00:37, 712.00ex/s][A
  4%|▍         | 1103/27791 [00:01<00:35, 758.20ex/s][A
  4%|▍         | 1196/27791 [00:01<00:33, 801.37ex/s][A
  5%|▍         | 1289/27791 [00:01<00:31, 835.06ex/s][A
  5%|▍      

Epoch,Training Loss,Validation Loss,Accuracy,F1,Recall,Precision,Rocauc 0 Roc Auc,Rocauc 1 Roc Auc,Pr Auc
1,No log,0.625388,0.692492,0.684392,0.661905,0.708461,0.777092,0.777092,0.775182
2,0.634400,0.566003,0.721036,0.747668,0.820476,0.686728,0.79566,0.79566,0.769869
3,0.634400,0.553373,0.724634,0.725621,0.722857,0.728407,0.810672,0.810672,0.806341
4,0.495800,0.573053,0.726313,0.752656,0.826667,0.690808,0.810328,0.810328,0.804055
5,0.495800,0.6244,0.721516,0.742057,0.795238,0.695544,0.800658,0.800658,0.791315
6,0.369400,0.680423,0.725114,0.741426,0.782381,0.704545,0.785697,0.785697,0.766954
7,0.369400,0.755668,0.719357,0.745431,0.815714,0.686298,0.783844,0.783844,0.748591
8,0.270800,0.902083,0.693212,0.746683,0.897619,0.6392,0.721362,0.721362,0.634465


***** Running Evaluation *****
  Num examples = 4169
  Batch size = 32
Saving model checkpoint to outputs/checkpoint-261
Configuration saved in outputs/checkpoint-261/config.json
Model weights saved in outputs/checkpoint-261/pytorch_model.bin
tokenizer config file saved in outputs/checkpoint-261/tokenizer_config.json
Special tokens file saved in outputs/checkpoint-261/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 4169
  Batch size = 32
Saving model checkpoint to outputs/checkpoint-522
Configuration saved in outputs/checkpoint-522/config.json
Model weights saved in outputs/checkpoint-522/pytorch_model.bin
tokenizer config file saved in outputs/checkpoint-522/tokenizer_config.json
Special tokens file saved in outputs/checkpoint-522/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 4169
  Batch size = 32
Saving model checkpoint to outputs/checkpoint-783
Configuration saved in outputs/checkpoint-783/config.json
Model weights saved in outputs/

early stopping required metric_for_best_model, but did not find eval_loss so early stopping is disabled
 43%|████▎     | 3/7 [1:38:56<2:07:15, 1908.80s/it]
  0%|          | 0/123872 [00:00<?, ?ex/s][A
  0%|          | 146/123872 [00:00<01:24, 1459.69ex/s][A
  0%|          | 303/123872 [00:00<01:21, 1522.85ex/s][A
  0%|          | 463/123872 [00:00<01:19, 1556.62ex/s][A
  1%|          | 627/123872 [00:00<01:17, 1586.67ex/s][A
  1%|          | 788/123872 [00:00<01:17, 1591.56ex/s][A
  1%|          | 948/123872 [00:00<01:18, 1574.30ex/s][A
  1%|          | 1106/123872 [00:00<01:32, 1327.39ex/s][A
  1%|          | 1251/123872 [00:00<01:30, 1360.07ex/s][A
  1%|          | 1402/123872 [00:00<01:27, 1400.37ex/s][A
  1%|▏         | 1560/123872 [00:01<01:24, 1451.94ex/s][A
  1%|▏         | 1708/123872 [00:01<01:23, 1459.85ex/s][A
  2%|▏         | 1866/123872 [00:01<01:21, 1494.84ex/s][A
  2%|▏         | 2017/123872 [00:01<01:33, 1304.10ex/s][A
  2%|▏         | 2172/123872 [00:01<0

Epoch,Training Loss,Validation Loss,Accuracy,F1,Recall,Precision,Rocauc 0 Roc Auc,Rocauc 1 Roc Auc,Pr Auc
0,0.6006,0.54669,0.741295,0.747915,0.763,0.733416,0.810933,0.810933,0.785388
1,0.497,0.440221,0.802164,0.805482,0.814359,0.796796,0.884742,0.884742,0.881778
2,0.4061,0.386642,0.828104,0.829744,0.832763,0.826747,0.909656,0.909656,0.907823
3,0.299,0.401136,0.833432,0.84644,0.91269,0.789157,0.924077,0.924077,0.922131
4,0.2086,0.408826,0.849416,0.860351,0.922213,0.806268,0.936851,0.936851,0.936061
5,0.1371,0.424703,0.858242,0.866118,0.91162,0.824942,0.937763,0.937763,0.937415


***** Running Evaluation *****
  Num examples = 18581
  Batch size = 32
Saving model checkpoint to outputs/checkpoint-1161
Configuration saved in outputs/checkpoint-1161/config.json
Model weights saved in outputs/checkpoint-1161/pytorch_model.bin
tokenizer config file saved in outputs/checkpoint-1161/tokenizer_config.json
Special tokens file saved in outputs/checkpoint-1161/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 18581
  Batch size = 32
Saving model checkpoint to outputs/checkpoint-2322
Configuration saved in outputs/checkpoint-2322/config.json
Model weights saved in outputs/checkpoint-2322/pytorch_model.bin
tokenizer config file saved in outputs/checkpoint-2322/tokenizer_config.json
Special tokens file saved in outputs/checkpoint-2322/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 18581
  Batch size = 32
Saving model checkpoint to outputs/checkpoint-3483
Configuration saved in outputs/checkpoint-3483/config.json
Model weights sa

## Outputs

In [None]:
# Results table built from the rows collected in `outputs` by the training loop.
outputs_df = pd.DataFrame(outputs)
outputs_df

In [None]:
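# Aggregate over repeated runs (useful when RUNS > 1). Column names follow get_log_from_history;
# 'train_runtime' is not collected there, so it cannot be aggregated without extending that function.
# outputs_df.groupby('dataset').agg({'test_acc' : ['mean', 'sem'], 'test_f1' : ['mean','sem']})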

In [None]:
# Save the metrics table as a CSV file at OUTPUT_PATH, without the row index.
outputs_df.to_csv(OUTPUT_PATH, index=False)