In [1]:
### Parameters
# When True, warm-up is disabled and the training loop rebuilds the model from its
# config instead of using the weights loaded from MODEL_NAME.
RANDOMIZE_WEIGHTS = True 
# CSV file the final metrics table is written to in the last cell.
OUTPUT_PATH = './2_BERT_RANDOM_DNABERTtokenizer_metrics.csv'

# Identifiers passed to `from_pretrained` for the model and the tokenizer.
MODEL_NAME = "armheb/DNA_bert_6"
TOKENIZER_NAME = "armheb/DNA_bert_6"
# k-mer length used to split each sequence before tokenization;
# None means the raw sequence is passed to the tokenizer unchanged.
K = 6
# STRIDE == 1 selects overlapping k-mers; any other value selects
# non-overlapping k-mers (step of K).
STRIDE = 1


# Alternative configuration (no k-mer splitting), kept for reference:
# MODEL_NAME = "Vlasta/DNADebertaSentencepiece30k"
# TOKENIZER_NAME = "Vlasta/DNADebertaSentencepiece30k"
# K = None
# STRIDE = None



# Each DATASETS entry is a (dataset_name, dataset_version) pair.
# All datasets
# DATASETS = [('demo_coding_vs_intergenomic_seqs', 0),
#  ('demo_human_or_worm', 0), ('human_enhancers_cohn', 0), ('human_enhancers_ensembl', 0),
#  ('human_ensembl_regulatory', 0), ('human_nontata_promoters', 0), ('human_ocr_ensembl', 0)]

# Quick check dataset
# DATASETS = [('demo_human_or_worm', 0)]


# Binary classification datasets (without human_ensembl_regulatory)
DATASETS = [('demo_coding_vs_intergenomic_seqs', 0),
 ('demo_human_or_worm', 0), ('human_enhancers_cohn', 0), ('human_enhancers_ensembl', 0),
  ('human_nontata_promoters', 0), ('human_ocr_ensembl', 0)]


# Folder the training loop reads the downloaded datasets from.
# NOTE(review): absolute, user-specific path - confirm it matches the folder
# download_dataset actually writes to on the machine running this notebook.
# If the remote server (presumably Ensembl) refuses the connection with
# "[Errno 104] Connection reset by peer", download with use_cloud_cache=True.
BENCHMARKS_FOLDER = '/home/jovyan/.genomic_benchmarks'
USE_CLOUD_CACHE = True
# If less than 1, only (approximately) this fraction of each dataset is used:
# every file is kept with this probability. A falsy value or exactly 1 keeps everything.
DATASET_THINING = 1 

# Per-device batch size for both training and evaluation.
BATCH_SIZE = 32
# Number of gradient accumulation steps.
ACCUMULATION = 2
LEARNING_RATE = 1e-5
# Upper bound on training epochs; early stopping may end a run sooner.
EPOCHS = 100 
# Number of independent training runs per dataset.
RUNS = 1

print(DATASETS)

[('demo_coding_vs_intergenomic_seqs', 0), ('demo_human_or_worm', 0), ('human_enhancers_cohn', 0), ('human_enhancers_ensembl', 0), ('human_nontata_promoters', 0), ('human_ocr_ensembl', 0)]


In [2]:
from transformers import TrainingArguments
from transformers import EarlyStoppingCallback
# Fraction of the training steps spent on learning-rate warm-up:
# 0.05 (5 epochs out of a 100-epoch budget) normally, no warm-up at all
# when the model weights are randomized.
warmup_ratio = 0 if RANDOMIZE_WEIGHTS else 0.05
def get_trainargs():
    """Build a fresh ``TrainingArguments`` object for a single training run.

    The values come from the notebook-level configuration (``LEARNING_RATE``,
    ``warmup_ratio``, ``BATCH_SIZE``, ``ACCUMULATION``, ``EPOCHS``). Evaluation
    and checkpointing both happen once per epoch, and the best checkpoint is
    reloaded when training ends. Every call draws a new random seed, so
    repeated runs are intentionally not identical.

    Returns
    -------
    TrainingArguments
        Arguments writing checkpoints to the ``'outputs'`` directory.
    """
    # Imported here because the notebook otherwise only imports `randrange` in a
    # later cell; without this the function silently depends on that cell
    # having been executed before the first call.
    from random import randrange

    return TrainingArguments(
        'outputs',
        learning_rate=LEARNING_RATE,
        warmup_ratio=warmup_ratio,
        lr_scheduler_type='linear',
        fp16=True,
        evaluation_strategy="epoch",
        per_device_train_batch_size=BATCH_SIZE,
        per_device_eval_batch_size=BATCH_SIZE,
        gradient_accumulation_steps=ACCUMULATION,
        num_train_epochs=EPOCHS,
        weight_decay=0.01,
        save_strategy='epoch',
        # New seed in [1, 10000] on every call so that each run differs.
        seed=randrange(1, 10001),
        report_to='none',
        load_best_model_at_end=True,
    )
# Early stopping: end training once the monitored validation metric has failed
# to improve for 5 consecutive evaluations.
callbacks = [EarlyStoppingCallback(early_stopping_threshold=0.0, early_stopping_patience=5)]



In [3]:
from itertools import product
from transformers import AutoTokenizer

# Load the tokenizer named in the configuration cell.
tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_NAME)

# For k-mers longer than 6, add every possible K-mer over the four-letter
# DNA alphabet to the tokenizer as an extra token.
if K is not None and K > 6:
    alphabet = ('A', 'C', 'T', 'G')
    vocab = [''.join(letters) for letters in product(alphabet, repeat=K)]
    tokenizer.add_tokens(vocab)

In [4]:
def kmers_strideK(s, k=K):
    """Split ``s`` into consecutive, non-overlapping k-mers.

    Windows start at positions 0, k, 2k, ...; a trailing fragment shorter
    than ``k`` is dropped.
    """
    last_start = len(s) - k
    return [s[start:start + k] for start in range(0, last_start + 1, k)]

def kmers_stride1(s, k=K):
    """Return every overlapping k-mer of ``s`` (window length ``k``, step 1)."""
    window_count = len(s) - k + 1
    return [s[start:start + k] for start in range(window_count)]

# Pick the k-mer splitter: overlapping windows for a stride of 1,
# non-overlapping chunks for any other stride value.
kmers = kmers_stride1 if STRIDE == 1 else kmers_strideK

# Tokenization function applied to each record (a mapping with a "seq" entry).
# Which variant is defined depends on whether k-mer splitting is enabled.
if K is None:
    def tok_func(x):
        """Tokenize the raw sequence as-is, with truncation enabled."""
        return tokenizer(x["seq"], truncation=True)
else:
    def tok_func(x):
        """Split the sequence into k-mers, join them with spaces and tokenize."""
        spaced_kmers = " ".join(kmers(x["seq"]))
        return tokenizer(spaced_kmers, truncation=True)

# Sanity check: tokenize one short sequence, print the resulting encoding and
# decode the ids back to text so the k-mer split can be inspected by eye.
example = tok_func({'seq': 'ATGGAAAGAGGCACCATTCT'})    
print(example)
tokenizer.decode(example['input_ids'])

{'input_ids': [2, 501, 1989, 3848, 3089, 56, 212, 835, 3325, 999, 3983, 3629, 2214, 650, 2587, 2142, 3], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}


'[CLS] ATGGAA TGGAAA GGAAAG GAAAGA AAAGAG AAGAGG AGAGGC GAGGCA AGGCAC GGCACC GCACCA CACCAT ACCATT CCATTC CATTCT [SEP]'

## Download benchmark datasets and tokenizer

In [5]:
from genomic_benchmarks.loc2seq import download_dataset
from genomic_benchmarks.data_check.info import is_downloaded
from pathlib import Path
from tqdm.autonotebook import tqdm

# Download every configured dataset that is not available locally yet.
# NOTE(review): neither is_downloaded nor download_dataset is given a target
# folder here, so they presumably use the library's default cache location -
# confirm that it is the same folder as BENCHMARKS_FOLDER below.
for dataset_name, dataset_version in tqdm(DATASETS):
    if not is_downloaded(dataset_name):
        download_dataset(dataset_name, version=dataset_version, use_cloud_cache=USE_CLOUD_CACHE)

# Root folder from which the training loop reads <dataset>/<split>/<label>/*.txt.
benchmark_root = Path(BENCHMARKS_FOLDER)

  0%|          | 0/6 [00:00<?, ?it/s]

## Function to extract dataframe metrics row from training logs

In [6]:
def get_log_from_history(history, dataset_name):
    """Condense a training log history into one row of summary metrics.

    Parameters
    ----------
    history : list of dict
        Log entries. Entries containing ``'eval_loss'`` are treated as
        validation logs; the first entry containing ``'test_loss'`` is used
        as the test log.
    dataset_name : str
        Value stored in the ``'dataset'`` field of the row.

    Returns
    -------
    dict
        The dataset name, the test metrics, the epoch of the validation log
        with the lowest ``'eval_loss'`` and that validation log itself.

    Raises
    ------
    IndexError
        If no entry contains ``'test_loss'``.
    KeyError
        If the test log lacks one of the expected metrics.
    ValueError
        If no entry contains ``'eval_loss'``.
    """
    validation_logs = list(filter(lambda entry: 'eval_loss' in entry, history))
    test_logs = list(filter(lambda entry: 'test_loss' in entry, history))
    first_test_log = test_logs[0]

    # (row field, key in the test log); the order defines the field order of the row.
    test_fields = (
        ('test_acc', 'test_accuracy'),
        ('test_f1', 'test_f1'),
        ('test_loss', 'test_loss'),
        ('test_precision', 'test_precision'),
        ('test_recall', 'test_recall'),
        ('test_auroc_macro', 'test_rocauc_0_roc_auc'),
        ('test_auroc_weighted', 'test_rocauc_1_roc_auc'),
        ('test_pr_auc', 'test_pr_auc'),
    )

    row = {'dataset': dataset_name}
    for field, log_key in test_fields:
        row[field] = first_test_log[log_key]

    # Validation log with the smallest loss (the earliest one wins on ties).
    best_validation_log = min(validation_logs, key=lambda log: log['eval_loss'])
    row['min_valid_loss_epoch'] = best_validation_log['epoch']
    row['min_valid_loss_log'] = best_validation_log
    return row

## Looping through datasets, fine-tuning the model for each of them, logging metrics

In [7]:
import evaluate
# Combined metric object used by compute_metrics_binary for two-class datasets.
# Its compute() call receives `predictions`, `references` and `prediction_scores`.
binary_metrics = evaluate.combine([
    'accuracy',
    'f1',
    'recall',
    'precision',
    # The order of the two roc_auc entries matters for logging: macro first,
    # then weighted. They are read back as 'test_rocauc_0_roc_auc' (macro) and
    # 'test_rocauc_1_roc_auc' (weighted) by get_log_from_history.
    # NOTE(review): in the logged runs both columns are identical in every
    # epoch - confirm that `average` given to evaluate.load has any effect.
    evaluate.load('roc_auc', average='macro'),
    evaluate.load('roc_auc', average='weighted'),
    evaluate.load("Vlasta/pr_auc"),
])
# Manual check of the combined metric:
# binary_metrics.compute(references=[0,1,1,1], predictions=[0,0,1,1], prediction_scores=[0.4,0.3,0.6,0.9])


In [8]:
import pandas as pd
import numpy as np
from random import random, randrange
from transformers import AutoModelForSequenceClassification
from transformers import TrainingArguments, Trainer
from datasets import Dataset, DatasetDict, load_metric
import torch

def compute_metrics_binary(eval_preds):
    """Compute the combined binary-classification metrics for one evaluation.

    Parameters
    ----------
    eval_preds : pair
        ``(logits, labels)``; ``logits`` is a NumPy array whose last axis
        indexes the classes.

    Returns
    -------
    The result of ``binary_metrics.compute``, called with the arg-max class
    as prediction and the softmax probability of class 1 as score.
    """
    raw_logits, true_labels = eval_preds

    # Softmax over the class axis, carried out in float64.
    logits_tensor = torch.from_numpy(raw_logits).to(torch.float64)
    class_probabilities = torch.softmax(logits_tensor, dim=-1).numpy()

    # Arg-max over probabilities (same result as arg-max over the logits).
    predicted_classes = class_probabilities.argmax(axis=-1)
    # Only the probability assigned to label 1 is used as the ranking score.
    positive_class_scores = class_probabilities[:, 1]

    return binary_metrics.compute(
        references=true_labels,
        prediction_scores=positive_class_scores,
        predictions=predicted_classes,
    )
    
#TODO human_ensembl_regulatory dataset multilabel metrics
def compute_metrics_multi(eval_preds):
    """Compute plain accuracy for datasets with more than two classes.

    Accuracy is computed directly with NumPy. The previous implementation
    re-loaded the metric through ``datasets.load_metric`` on every single
    evaluation call, which is both wasteful and tied to a deprecated API.

    Parameters
    ----------
    eval_preds : pair
        ``(logits, labels)``; the last axis of ``logits`` indexes the classes.

    Returns
    -------
    dict
        ``{"accuracy": float}`` - the fraction of samples whose arg-max class
        equals the reference label.

    Raises
    ------
    ValueError
        If predictions and labels do not have the same shape.
    """
    logits, labels = eval_preds
    predictions = np.argmax(logits, axis=-1)
    references = np.asarray(labels)
    # Guard against silent broadcasting when the two arrays do not line up.
    if predictions.shape != references.shape:
        raise ValueError(
            f"Mismatch between predictions {predictions.shape} and references {references.shape}"
        )
    return {"accuracy": float(np.mean(predictions == references))}

# One metrics row (see get_log_from_history) per dataset and run.
outputs = []

for dataset_name, dataset_version in tqdm(DATASETS):
    # Class names are the entries of the train folder; sorting fixes the
    # mapping from class name to integer label.
    labels = sorted([x.stem for x in (benchmark_root / dataset_name / 'train').iterdir()])

    # key -> (split, integer label, sequence text)
    tmp_dict = {}

    for split in ['train', 'test']:
        for nlabel, label in enumerate(labels):
            for f in (benchmark_root / dataset_name / split / label).glob('*.txt'):
                # Thinning disabled (falsy or exactly 1): keep every file.
                # Otherwise keep each file with probability DATASET_THINING.
                if not DATASET_THINING or DATASET_THINING == 1 or random() < DATASET_THINING:
                    # The split is part of the key so that a train file and a
                    # test file sharing label and file name cannot silently
                    # overwrite each other.
                    tmp_dict[f"{split} {label} {f.stem}"] = (split, nlabel, f.read_text())

    # Build the frame row-wise (one row per file) instead of creating a
    # three-row frame with one column per file and transposing it.
    df = pd.DataFrame.from_dict(tmp_dict, orient='index', columns=["dset", "cat", "seq"])

    ds = Dataset.from_pandas(df)

    # The string index arrives as '__index_level_0__'; it and the raw sequence
    # are dropped once the sequence has been tokenized.
    tok_ds = ds.map(tok_func, batched=False, remove_columns=['__index_level_0__', 'seq'])
    tok_ds = tok_ds.rename_columns({'cat':'labels'})

    dds = DatasetDict({
        'train': tok_ds.filter(lambda x: x["dset"] == "train").remove_columns('dset'),
        'test':  tok_ds.filter(lambda x: x["dset"] == "test").remove_columns('dset')
    })
    # Hold out 20% of the training split for validation (fixed seed, so the
    # split is the same in every run).
    train_valid_split = dds['train'].train_test_split(test_size=0.2, shuffle=True, seed=42)
    dds['train']=train_valid_split['train']
    dds['valid']=train_valid_split['test']

    compute_metrics = compute_metrics_binary if len(labels) == 2 else compute_metrics_multi

    for _ in range(RUNS):
        model_cls = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=len(labels))
        if(RANDOMIZE_WEIGHTS):
            # model_cls.init_weights() #Alternative
            # Re-create the model from its config only, i.e. without the
            # weights that were just loaded.
            model_cls = AutoModelForSequenceClassification.from_config(model_cls.config)

        # If tokens were added to the tokenizer (K > 6), the embedding matrix
        # must grow accordingly, otherwise the new token ids fall outside it.
        if len(tokenizer) > model_cls.config.vocab_size:
            model_cls.resize_token_embeddings(len(tokenizer))

        args = get_trainargs()

        trainer = Trainer(model_cls, args, train_dataset=dds['train'], eval_dataset=dds['valid'],
                          tokenizer=tokenizer, compute_metrics=compute_metrics,
                          callbacks=callbacks)
        trainer.train()
        # The test metrics are logged with a 'test_' prefix, which is what
        # get_log_from_history looks for.
        # NOTE(review): the "did not find eval_loss" early-stopping warning in
        # the output presumably comes from this call, since no 'eval_'-prefixed
        # metrics are produced here - confirm.
        trainer.evaluate(dds['test'], metric_key_prefix='test')
        training_log = get_log_from_history(trainer.state.log_history, dataset_name=dataset_name)
        outputs.append(training_log)
  

  0%|          | 0/6 [00:00<?, ?it/s]

  0%|          | 0/100000 [00:00<?, ?ex/s]

  0%|          | 0/100 [00:00<?, ?ba/s]

  0%|          | 0/100 [00:00<?, ?ba/s]

Some weights of the model checkpoint at armheb/DNA_bert_6 were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.predictions.decoder.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at armheb/DNA_bert_6 and are n

Epoch,Training Loss,Validation Loss,Accuracy,F1,Recall,Precision,Rocauc 0 Roc Auc,Rocauc 1 Roc Auc,Pr Auc
0,0.4756,0.375014,0.839867,0.858656,0.97293,0.768404,0.960598,0.960598,0.958698
1,0.3211,0.247556,0.901533,0.89979,0.884251,0.915884,0.964068,0.964068,0.962223
2,0.2772,0.249646,0.8976,0.902078,0.943459,0.864175,0.965583,0.965583,0.96364
3,0.2665,0.282832,0.892333,0.886308,0.839445,0.938712,0.965513,0.965513,0.963414
4,0.2545,0.267603,0.893733,0.899736,0.953727,0.85153,0.965821,0.965821,0.96344
5,0.2504,0.239493,0.904067,0.903791,0.90132,0.906275,0.966268,0.966268,0.964151
6,0.2494,0.278162,0.892067,0.8984,0.954527,0.848506,0.96578,0.96578,0.963759
7,0.234,0.245128,0.902067,0.90303,0.912122,0.894118,0.965776,0.965776,0.963882
8,0.2324,0.253581,0.899067,0.895759,0.867449,0.925979,0.965663,0.965663,0.963618
9,0.2233,0.258079,0.901533,0.901118,0.897453,0.904813,0.965199,0.965199,0.963038


***** Running Evaluation *****
  Num examples = 15000
  Batch size = 32
Saving model checkpoint to outputs/checkpoint-937
Configuration saved in outputs/checkpoint-937/config.json
Model weights saved in outputs/checkpoint-937/pytorch_model.bin
tokenizer config file saved in outputs/checkpoint-937/tokenizer_config.json
Special tokens file saved in outputs/checkpoint-937/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 15000
  Batch size = 32
Saving model checkpoint to outputs/checkpoint-1874
Configuration saved in outputs/checkpoint-1874/config.json
Model weights saved in outputs/checkpoint-1874/pytorch_model.bin
tokenizer config file saved in outputs/checkpoint-1874/tokenizer_config.json
Special tokens file saved in outputs/checkpoint-1874/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 15000
  Batch size = 32
Saving model checkpoint to outputs/checkpoint-2811
Configuration saved in outputs/checkpoint-2811/config.json
Model weights saved i

early stopping required metric_for_best_model, but did not find eval_loss so early stopping is disabled


  0%|          | 0/100000 [00:00<?, ?ex/s]

  0%|          | 0/100 [00:00<?, ?ba/s]

  0%|          | 0/100 [00:00<?, ?ba/s]

loading configuration file https://huggingface.co/armheb/DNA_bert_6/resolve/main/config.json from cache at /home/jovyan/.cache/huggingface/transformers/2697389de18c4fe8c3497cea35aaf65130fdd59c3ab64cb6b1c2e0632fefdaf0.3a7e1ca237211e6405270f85616f49989aeee994db35f6593a40c7b5081a50d0
Model config BertConfig {
  "_name_or_path": "armheb/DNA_bert_6",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_ids": 0,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "num_rnn_layer": 1,
  "output_past": true,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "rnn": "lstm",
  "rnn_dropout": 0.0,
  "rnn_hidden": 768,
  "split": 10,
  "transformers_version": "4.19.2",

Epoch,Training Loss,Validation Loss,Accuracy,F1,Recall,Precision,Rocauc 0 Roc Auc,Rocauc 1 Roc Auc,Pr Auc
0,0.4006,0.150023,0.944067,0.94436,0.94946,0.939314,0.98618,0.98618,0.987185
1,0.2093,0.251255,0.9006,0.890408,0.807708,0.991975,0.988207,0.988207,0.988808
2,0.1805,0.195736,0.9188,0.92378,0.984265,0.870298,0.989485,0.989485,0.989961
3,0.1534,0.290661,0.877067,0.889818,0.992932,0.806106,0.989845,0.989845,0.990257
4,0.1389,0.131009,0.951,0.950207,0.935191,0.965712,0.990117,0.990117,0.99052
5,0.1309,0.140177,0.948,0.948946,0.966662,0.931868,0.990251,0.990251,0.990631
6,0.127,0.132896,0.954267,0.954328,0.955727,0.952932,0.99034,0.99034,0.990763
7,0.1232,0.207253,0.920333,0.925139,0.984665,0.872401,0.990242,0.990242,0.990553
8,0.1169,0.130319,0.9508,0.94981,0.931191,0.969188,0.990162,0.990162,0.990677
9,0.1091,0.1586,0.950533,0.951068,0.961595,0.94077,0.990048,0.990048,0.990573


***** Running Evaluation *****
  Num examples = 15000
  Batch size = 32
Saving model checkpoint to outputs/checkpoint-937
Configuration saved in outputs/checkpoint-937/config.json
Model weights saved in outputs/checkpoint-937/pytorch_model.bin
tokenizer config file saved in outputs/checkpoint-937/tokenizer_config.json
Special tokens file saved in outputs/checkpoint-937/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 15000
  Batch size = 32
Saving model checkpoint to outputs/checkpoint-1874
Configuration saved in outputs/checkpoint-1874/config.json
Model weights saved in outputs/checkpoint-1874/pytorch_model.bin
tokenizer config file saved in outputs/checkpoint-1874/tokenizer_config.json
Special tokens file saved in outputs/checkpoint-1874/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 15000
  Batch size = 32
Saving model checkpoint to outputs/checkpoint-2811
Configuration saved in outputs/checkpoint-2811/config.json
Model weights saved i

early stopping required metric_for_best_model, but did not find eval_loss so early stopping is disabled


  0%|          | 0/27791 [00:00<?, ?ex/s]

  0%|          | 0/28 [00:00<?, ?ba/s]

  0%|          | 0/28 [00:00<?, ?ba/s]

loading configuration file https://huggingface.co/armheb/DNA_bert_6/resolve/main/config.json from cache at /home/jovyan/.cache/huggingface/transformers/2697389de18c4fe8c3497cea35aaf65130fdd59c3ab64cb6b1c2e0632fefdaf0.3a7e1ca237211e6405270f85616f49989aeee994db35f6593a40c7b5081a50d0
Model config BertConfig {
  "_name_or_path": "armheb/DNA_bert_6",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_ids": 0,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "num_rnn_layer": 1,
  "output_past": true,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "rnn": "lstm",
  "rnn_dropout": 0.0,
  "rnn_hidden": 768,
  "split": 10,
  "transformers_version": "4.19.2",

Epoch,Training Loss,Validation Loss,Accuracy,F1,Recall,Precision,Rocauc 0 Roc Auc,Rocauc 1 Roc Auc,Pr Auc
1,No log,0.591932,0.675222,0.617081,0.519524,0.759749,0.764849,0.764849,0.76242
2,0.641800,0.631084,0.648357,0.728015,0.934286,0.596353,0.785184,0.785184,0.781044
3,0.641800,0.601782,0.669945,0.737805,0.921905,0.614994,0.796742,0.796742,0.791766
4,0.580100,0.534566,0.723675,0.730337,0.742857,0.718232,0.808474,0.808474,0.80427
5,0.580100,0.542783,0.722236,0.752564,0.838571,0.682558,0.81341,0.81341,0.810408
6,0.541200,0.568523,0.729911,0.710689,0.658571,0.771763,0.814495,0.814495,0.812637
7,0.541200,0.556883,0.710722,0.754579,0.882857,0.658849,0.815661,0.815661,0.813927
8,0.543100,0.581294,0.702327,0.751153,0.891905,0.64877,0.814589,0.814589,0.814041
9,0.543100,0.54206,0.730391,0.753725,0.819048,0.698052,0.8155,0.8155,0.81444


***** Running Evaluation *****
  Num examples = 4169
  Batch size = 32
Saving model checkpoint to outputs/checkpoint-261
Configuration saved in outputs/checkpoint-261/config.json
Model weights saved in outputs/checkpoint-261/pytorch_model.bin
tokenizer config file saved in outputs/checkpoint-261/tokenizer_config.json
Special tokens file saved in outputs/checkpoint-261/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 4169
  Batch size = 32
Saving model checkpoint to outputs/checkpoint-522
Configuration saved in outputs/checkpoint-522/config.json
Model weights saved in outputs/checkpoint-522/pytorch_model.bin
tokenizer config file saved in outputs/checkpoint-522/tokenizer_config.json
Special tokens file saved in outputs/checkpoint-522/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 4169
  Batch size = 32
Saving model checkpoint to outputs/checkpoint-783
Configuration saved in outputs/checkpoint-783/config.json
Model weights saved in outputs/

early stopping required metric_for_best_model, but did not find eval_loss so early stopping is disabled


  0%|          | 0/123872 [00:00<?, ?ex/s]

  0%|          | 0/124 [00:00<?, ?ba/s]

  0%|          | 0/124 [00:00<?, ?ba/s]

loading configuration file https://huggingface.co/armheb/DNA_bert_6/resolve/main/config.json from cache at /home/jovyan/.cache/huggingface/transformers/2697389de18c4fe8c3497cea35aaf65130fdd59c3ab64cb6b1c2e0632fefdaf0.3a7e1ca237211e6405270f85616f49989aeee994db35f6593a40c7b5081a50d0
Model config BertConfig {
  "_name_or_path": "armheb/DNA_bert_6",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_ids": 0,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "num_rnn_layer": 1,
  "output_past": true,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "rnn": "lstm",
  "rnn_dropout": 0.0,
  "rnn_hidden": 768,
  "split": 10,
  "transformers_version": "4.19.2",

Epoch,Training Loss,Validation Loss,Accuracy,F1,Recall,Precision,Rocauc 0 Roc Auc,Rocauc 1 Roc Auc,Pr Auc
0,0.5816,0.529094,0.743125,0.728853,0.68639,0.776917,0.821179,0.821179,0.808734
1,0.5364,0.770597,0.598407,0.355168,0.21988,0.923181,0.842455,0.842455,0.835671
2,0.5246,0.540472,0.735913,0.774069,0.899422,0.679383,0.851604,0.851604,0.848344
3,0.4903,0.47309,0.779936,0.796981,0.858763,0.743492,0.86851,0.86851,0.868454
4,0.462,0.46294,0.789516,0.806069,0.869677,0.751132,0.880862,0.880862,0.884013
5,0.4252,0.445178,0.802271,0.813805,0.859084,0.77306,0.888822,0.888822,0.891637
6,0.4075,0.436226,0.805823,0.817298,0.863471,0.775812,0.891862,0.891862,0.895077
7,0.392,0.502817,0.781713,0.808553,0.916435,0.723395,0.893357,0.893357,0.895799
8,0.3817,0.463654,0.797858,0.815593,0.888722,0.753584,0.894746,0.894746,0.897865
9,0.3644,0.44437,0.80916,0.823792,0.886903,0.769067,0.89821,0.89821,0.899471


***** Running Evaluation *****
  Num examples = 18581
  Batch size = 32
Saving model checkpoint to outputs/checkpoint-1161
Configuration saved in outputs/checkpoint-1161/config.json
Model weights saved in outputs/checkpoint-1161/pytorch_model.bin
tokenizer config file saved in outputs/checkpoint-1161/tokenizer_config.json
Special tokens file saved in outputs/checkpoint-1161/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 18581
  Batch size = 32
Saving model checkpoint to outputs/checkpoint-2322
Configuration saved in outputs/checkpoint-2322/config.json
Model weights saved in outputs/checkpoint-2322/pytorch_model.bin
tokenizer config file saved in outputs/checkpoint-2322/tokenizer_config.json
Special tokens file saved in outputs/checkpoint-2322/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 18581
  Batch size = 32
Saving model checkpoint to outputs/checkpoint-3483
Configuration saved in outputs/checkpoint-3483/config.json
Model weights sa

early stopping required metric_for_best_model, but did not find eval_loss so early stopping is disabled


  0%|          | 0/25284 [00:00<?, ?ex/s]

  0%|          | 0/26 [00:00<?, ?ba/s]

  0%|          | 0/26 [00:00<?, ?ba/s]

loading configuration file https://huggingface.co/armheb/DNA_bert_6/resolve/main/config.json from cache at /home/jovyan/.cache/huggingface/transformers/2697389de18c4fe8c3497cea35aaf65130fdd59c3ab64cb6b1c2e0632fefdaf0.3a7e1ca237211e6405270f85616f49989aeee994db35f6593a40c7b5081a50d0
Model config BertConfig {
  "_name_or_path": "armheb/DNA_bert_6",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_ids": 0,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "num_rnn_layer": 1,
  "output_past": true,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "rnn": "lstm",
  "rnn_dropout": 0.0,
  "rnn_hidden": 768,
  "split": 10,
  "transformers_version": "4.19.2",

Epoch,Training Loss,Validation Loss,Accuracy,F1,Recall,Precision,Rocauc 0 Roc Auc,Rocauc 1 Roc Auc,Pr Auc
0,No log,0.362216,0.877231,0.658683,0.517473,0.905882,0.875906,0.875906,0.794112
1,No log,0.277632,0.907077,0.760317,0.643817,0.928295,0.897646,0.897646,0.838101
2,0.407500,0.281966,0.913846,0.781931,0.674731,0.92963,0.903393,0.903393,0.854167
3,0.407500,0.249453,0.921231,0.802164,0.697581,0.943636,0.909788,0.909788,0.869171
4,0.304300,0.275132,0.897846,0.781866,0.799731,0.764781,0.914722,0.914722,0.878826
5,0.304300,0.274246,0.896,0.780234,0.806452,0.755668,0.918787,0.918787,0.884332
6,0.304300,0.219542,0.931077,0.828746,0.728495,0.960993,0.920056,0.920056,0.886932
7,0.257800,0.239042,0.917538,0.814404,0.790323,0.84,0.921659,0.921659,0.888975
8,0.257800,0.270258,0.908615,0.801603,0.806452,0.796813,0.925008,0.925008,0.892064
9,0.252300,0.300083,0.896,0.784163,0.825269,0.746959,0.926146,0.926146,0.893587


***** Running Evaluation *****
  Num examples = 3250
  Batch size = 32
Saving model checkpoint to outputs/checkpoint-203
Configuration saved in outputs/checkpoint-203/config.json
Model weights saved in outputs/checkpoint-203/pytorch_model.bin
tokenizer config file saved in outputs/checkpoint-203/tokenizer_config.json
Special tokens file saved in outputs/checkpoint-203/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 3250
  Batch size = 32
Saving model checkpoint to outputs/checkpoint-406
Configuration saved in outputs/checkpoint-406/config.json
Model weights saved in outputs/checkpoint-406/pytorch_model.bin
tokenizer config file saved in outputs/checkpoint-406/tokenizer_config.json
Special tokens file saved in outputs/checkpoint-406/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 3250
  Batch size = 32
Saving model checkpoint to outputs/checkpoint-609
Configuration saved in outputs/checkpoint-609/config.json
Model weights saved in outputs/

early stopping required metric_for_best_model, but did not find eval_loss so early stopping is disabled


  0%|          | 0/139804 [00:00<?, ?ex/s]

  0%|          | 0/140 [00:00<?, ?ba/s]

  0%|          | 0/140 [00:00<?, ?ba/s]

loading configuration file https://huggingface.co/armheb/DNA_bert_6/resolve/main/config.json from cache at /home/jovyan/.cache/huggingface/transformers/2697389de18c4fe8c3497cea35aaf65130fdd59c3ab64cb6b1c2e0632fefdaf0.3a7e1ca237211e6405270f85616f49989aeee994db35f6593a40c7b5081a50d0
Model config BertConfig {
  "_name_or_path": "armheb/DNA_bert_6",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_ids": 0,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "num_rnn_layer": 1,
  "output_past": true,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "rnn": "lstm",
  "rnn_dropout": 0.0,
  "rnn_hidden": 768,
  "split": 10,
  "transformers_version": "4.19.2",

Epoch,Training Loss,Validation Loss,Accuracy,F1,Recall,Precision,Rocauc 0 Roc Auc,Rocauc 1 Roc Auc,Pr Auc
1,0.6653,0.614681,0.662009,0.624696,0.555985,0.712784,0.741403,0.741403,0.724699
2,0.6046,0.592385,0.69043,0.731291,0.832611,0.651956,0.768636,0.768636,0.754169
3,0.593,0.599074,0.687902,0.64135,0.551555,0.766069,0.778933,0.778933,0.768045
4,0.5747,0.611774,0.659434,0.731927,0.918944,0.608159,0.782421,0.782421,0.77323
5,0.5629,0.576635,0.708598,0.735489,0.800754,0.680061,0.78566,0.78566,0.772971
6,0.5534,0.56316,0.711506,0.732632,0.781244,0.689715,0.787967,0.787967,0.780099
7,0.5441,0.556632,0.713509,0.722955,0.738831,0.707746,0.790431,0.790431,0.780741
8,0.5324,0.567656,0.712317,0.72576,0.752403,0.70094,0.788403,0.788403,0.779806
9,0.5228,0.572157,0.71041,0.724393,0.752215,0.698556,0.785751,0.785751,0.777869
10,0.5182,0.609391,0.688665,0.740346,0.877286,0.640385,0.786269,0.786269,0.777144


***** Running Evaluation *****
  Num examples = 20971
  Batch size = 32
Saving model checkpoint to outputs/checkpoint-1311
Configuration saved in outputs/checkpoint-1311/config.json
Model weights saved in outputs/checkpoint-1311/pytorch_model.bin
tokenizer config file saved in outputs/checkpoint-1311/tokenizer_config.json
Special tokens file saved in outputs/checkpoint-1311/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 20971
  Batch size = 32
Saving model checkpoint to outputs/checkpoint-2622
Configuration saved in outputs/checkpoint-2622/config.json
Model weights saved in outputs/checkpoint-2622/pytorch_model.bin
tokenizer config file saved in outputs/checkpoint-2622/tokenizer_config.json
Special tokens file saved in outputs/checkpoint-2622/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 20971
  Batch size = 32
Saving model checkpoint to outputs/checkpoint-3933
Configuration saved in outputs/checkpoint-3933/config.json
Model weights sa

early stopping required metric_for_best_model, but did not find eval_loss so early stopping is disabled


## Outputs

In [9]:
# Collect the metrics rows gathered in `outputs` by the training loop into a
# table and display it.
outputs_df = pd.DataFrame(outputs)
outputs_df

Unnamed: 0,dataset,test_acc,test_f1,test_loss,test_precision,test_recall,test_auroc_macro,test_auroc_weighted,test_pr_auc,min_valid_loss_epoch,min_valid_loss_log
0,demo_coding_vs_intergenomic_seqs,0.90092,0.900606,0.251063,0.90347,0.89776,0.963096,0.963096,0.960968,6.0,"{'eval_loss': 0.2394927442073822, 'eval_accura..."
1,demo_human_or_worm,0.95348,0.95271,0.124923,0.968742,0.9372,0.991099,0.991099,0.991605,9.0,"{'eval_loss': 0.13031931221485138, 'eval_accur..."
2,human_enhancers_cohn,0.72654,0.733819,0.548673,0.714793,0.753886,0.800259,0.800259,0.789622,4.0,"{'eval_loss': 0.5345657467842102, 'eval_accura..."
3,human_enhancers_ensembl,0.799871,0.811748,0.443561,0.766271,0.862964,0.889409,0.889409,0.892675,7.0,"{'eval_loss': 0.4362263083457947, 'eval_accura..."
4,human_nontata_promoters,0.823998,0.809398,0.466183,0.985118,0.686877,0.920357,0.920357,0.94752,7.0,"{'eval_loss': 0.219541996717453, 'eval_accurac..."
5,human_ocr_ensembl,0.714952,0.721322,0.559873,0.705554,0.737812,0.788795,0.788795,0.775356,7.0,"{'eval_loss': 0.556632399559021, 'eval_accurac..."


In [10]:
# outputs_df.groupby('dataset').agg({'accuracy' : ['mean', 'sem'], 'f1' : ['mean','sem'], 'train_runtime': ['mean', 'sem']})

In [11]:
# Save the metrics table to OUTPUT_PATH as CSV, without the DataFrame index.
outputs_df.to_csv(OUTPUT_PATH, index=False)