In [1]:
### Parameters
import os

# When True, the model is re-created from its config (random weights) instead of
# being fine-tuned from the pretrained checkpoint.
RANDOMIZE_WEIGHTS = True
# Resize the embedding matrix to the tokenizer's vocabulary size. Only needed when the
# tokenizer's vocab size differs from the original model's; the training loop only
# applies it when RANDOMIZE_WEIGHTS is True.
RESIZE_EMBEDDINGS = True

# CSV file the final metrics table is written to.
OUTPUT_PATH = './DNABERT_RANDOM_Sentencepiece10tokenizer_metrics.csv'

MODEL_NAME = "armheb/DNA_bert_6"
TOKENIZER_NAME = "Vlasta/DNA_Sentencepiece_vocab_10000_max_tokenlen_45"
# K: k-mer length used to pre-split sequences before tokenization (None = feed raw sequences).
# STRIDE: 1 selects overlapping k-mers; any other value selects non-overlapping k-mers.
K = None
STRIDE = None


# MODEL_NAME = "Vlasta/DNADebertaSentencepiece30k"
# TOKENIZER_NAME = "Vlasta/DNADebertaSentencepiece30k"
# K = None
# STRIDE = None



# Each entry is (dataset name, dataset version).
# All datasets
# DATASETS = [('demo_coding_vs_intergenomic_seqs', 0),
#  ('demo_human_or_worm', 0), ('human_enhancers_cohn', 0), ('human_enhancers_ensembl', 0),
#  ('human_ensembl_regulatory', 0), ('human_nontata_promoters', 0), ('human_ocr_ensembl', 0)]

# Quick check dataset
# DATASETS = [('demo_human_or_worm', 0)]


# Binary classification datasets (without human_ensembl_regulatory)
DATASETS = [('demo_coding_vs_intergenomic_seqs', 0),
 ('demo_human_or_worm', 0), ('human_enhancers_cohn', 0), ('human_enhancers_ensembl', 0),
  ('human_nontata_promoters', 0), ('human_ocr_ensembl', 0)]


# Folder the training loop reads the downloaded datasets from. The download cell does not
# pass a destination, so this has to match the library's default cache; that is assumed to
# be `~/.genomic_benchmarks` (TODO confirm). Derived from the current user's home instead of
# a hard-coded `/home/jovyan/...` path so the notebook also runs on other machines.
BENCHMARKS_FOLDER = os.path.expanduser('~/.genomic_benchmarks')
# If Ensembl refuses the connection ("[Errno 104] Connection reset by peer"),
# download with use_cloud_cache=True.
USE_CLOUD_CACHE = True
# if less than 1, only this fraction of each dataset is used
# (name keeps its original spelling because other cells reference it)
DATASET_THINING = 1

BATCH_SIZE = 32
ACCUMULATION = 2
LEARNING_RATE = 1e-5
EPOCHS = 100
RUNS = 1

print(DATASETS)

[('demo_coding_vs_intergenomic_seqs', 0), ('demo_human_or_worm', 0), ('human_enhancers_cohn', 0), ('human_enhancers_ensembl', 0), ('human_nontata_promoters', 0), ('human_ocr_ensembl', 0)]


In [2]:
from transformers import TrainingArguments
from transformers import EarlyStoppingCallback
# Warm up over 5% of training (5 epochs of a 100-epoch run); no warmup at all
# when the model starts from randomized weights.
warmup_ratio = 0 if RANDOMIZE_WEIGHTS else 0.05
def get_trainargs():
    """Build a fresh ``TrainingArguments`` object for one fine-tuning run.

    Hyper-parameters come from the notebook-level constants (``LEARNING_RATE``,
    ``BATCH_SIZE``, ``ACCUMULATION``, ``EPOCHS``) and ``warmup_ratio``. Every call
    draws a new random seed in ``[1, 10000]``, so repeated runs differ.

    Returns:
        TrainingArguments: evaluates and saves once per epoch into ``'outputs'``
        and reloads the best checkpoint when training ends.
    """
    # Imported here so the function does not rely on an import made by a
    # different notebook cell having been executed first.
    from random import randrange

    return TrainingArguments(
        'outputs',
        learning_rate=LEARNING_RATE,
        warmup_ratio=warmup_ratio,
        lr_scheduler_type='linear',
        fp16=True,
        evaluation_strategy="epoch",
        per_device_train_batch_size=BATCH_SIZE,
        per_device_eval_batch_size=BATCH_SIZE,
        gradient_accumulation_steps=ACCUMULATION,
        num_train_epochs=EPOCHS,
        weight_decay=0.01,
        # must match evaluation_strategy for load_best_model_at_end
        save_strategy='epoch',
        seed=randrange(1, 10001),
        report_to='none',
        load_best_model_at_end=True,
    )
# Early stopping with a patience of 5 epochs (evaluation runs once per epoch).
callbacks = [EarlyStoppingCallback(early_stopping_threshold=0.0, early_stopping_patience=5)]



In [3]:
from itertools import product
from transformers import AutoTokenizer

# Load the tokenizer selected in the parameters cell.
tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_NAME)

# For k-mer lengths above 6, add every possible DNA k-mer to the tokenizer vocabulary.
if K is not None and K > 6:
    alphabet = ('A', 'C', 'T', 'G')
    vocab = [''.join(kmer) for kmer in product(alphabet, repeat=K)]
    tokenizer.add_tokens(vocab)

In [4]:
def kmers_strideK(s, k=None):
    """Split ``s`` into consecutive, non-overlapping k-mers (stride ``k``).

    A trailing fragment shorter than ``k`` is dropped.

    Args:
        s: sequence to split (anything with ``len`` and slicing, e.g. ``str``).
        k: k-mer length. When omitted, the notebook-level ``K`` is read at call
            time, so re-running the parameters cell takes effect without
            redefining this function.

    Returns:
        list: the length-``k`` slices of ``s`` in order; empty if ``s`` is
        shorter than ``k``.

    Raises:
        TypeError: if no k-mer length is available (``k`` omitted and ``K`` is None).
    """
    if k is None:
        k = K
    if k is None:
        raise TypeError("k-mer length is not set: pass k explicitly or set K")
    return [s[start:start + k] for start in range(0, len(s) - k + 1, k)]

def kmers_stride1(s, k=None):
    """Split ``s`` into overlapping k-mers, advancing one position at a time.

    Args:
        s: sequence to split (anything with ``len`` and slicing, e.g. ``str``).
        k: k-mer length. When omitted, the notebook-level ``K`` is read at call
            time, so re-running the parameters cell takes effect without
            redefining this function.

    Returns:
        list: every length-``k`` window of ``s`` from left to right; empty if
        ``s`` is shorter than ``k``.

    Raises:
        TypeError: if no k-mer length is available (``k`` omitted and ``K`` is None).
    """
    if k is None:
        k = K
    if k is None:
        raise TypeError("k-mer length is not set: pass k explicitly or set K")
    return [s[start:start + k] for start in range(len(s) - k + 1)]

# Overlapping k-mers only when STRIDE is exactly 1; every other setting uses non-overlapping k-mers.
kmers = kmers_stride1 if STRIDE == 1 else kmers_strideK

# Tokenization function applied to each dataset row (a mapping with a "seq" entry).
# Which variant is defined depends on K at the time this cell runs.
if K is None:
    def tok_func(x):
        """Tokenize the raw sequence as-is."""
        return tokenizer(x["seq"], truncation=True)
else:
    def tok_func(x):
        """Split the sequence into space-separated k-mers, then tokenize."""
        kmer_text = " ".join(kmers(x["seq"]))
        return tokenizer(kmer_text, truncation=True)

# example: tokenize one short sequence, print the raw encoding,
# and display the text decoded back from the produced input ids
example = tok_func({'seq': 'ATGGAAAGAGGCACCATTCT'})    
print(example)
tokenizer.decode(example['input_ids'])

{'input_ids': [2, 33, 1246, 2031, 6, 3], 'token_type_ids': [0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1]}


'[CLS]ATGGAAAGAGGCACCATTCT[SEP]'

## Download benchmark datasets

In [5]:
from genomic_benchmarks.loc2seq import download_dataset
from genomic_benchmarks.data_check.info import is_downloaded
from pathlib import Path
from tqdm.autonotebook import tqdm

# Download every configured benchmark that is not already available locally.
for dataset_name, dataset_version in tqdm(DATASETS):
    if is_downloaded(dataset_name):
        continue
    download_dataset(
        dataset_name,
        version=dataset_version,
        use_cloud_cache=USE_CLOUD_CACHE,
    )

# Root folder the training loop reads the dataset files from.
benchmark_root = Path(BENCHMARKS_FOLDER)

  0%|          | 0/6 [00:00<?, ?it/s]

## Function to extract dataframe metrics row from training logs

In [6]:
def get_log_from_history(history, dataset_name):
    """Condense a training log history into one flat metrics row.

    Args:
        history: list of log dicts. Validation entries are recognised by an
            ``'eval_loss'`` key, test entries by a ``'test_loss'`` key.
        dataset_name: value stored under ``'dataset'`` in the returned row.

    Returns:
        dict: the test metrics of the first test entry, the epoch with the
        lowest validation loss and that validation entry itself. Test metrics
        other than the loss are optional: a metric missing from the test entry
        (e.g. when only accuracy was computed) is reported as ``None`` instead
        of raising ``KeyError``.

    Raises:
        IndexError: if ``history`` contains no test entry.
        ValueError: if ``history`` contains no validation entry.
    """
    eval_dicts = [x for x in history if 'eval_loss' in x]
    test_dicts = [x for x in history if 'test_loss' in x]
    test_log = test_dicts[0]

    # Validation entry with the lowest loss.
    min_loss_dict = min(eval_dicts, key=lambda x: x['eval_loss'])

    # Key order defines the column order of the resulting table / CSV.
    row = {
        'dataset': dataset_name,
        'test_acc': test_log.get('test_accuracy'),
        'test_f1': test_log.get('test_f1'),
        'test_loss': test_log['test_loss'],
        'test_precision': test_log.get('test_precision'),
        'test_recall': test_log.get('test_recall'),
        # The two roc_auc modules are told apart by their position in the
        # combined metric: index 0 is macro, index 1 is weighted.
        'test_auroc_macro': test_log.get('test_rocauc_0_roc_auc'),
        'test_auroc_weighted': test_log.get('test_rocauc_1_roc_auc'),
        'test_pr_auc': test_log.get('test_pr_auc'),

        'min_valid_loss_epoch': min_loss_dict['epoch'],
        'min_valid_loss_log': min_loss_dict,
    }
    return row

## Looping through datasets, fine-tuning the model for each of them, logging metrics

In [7]:
import evaluate
# Metric bundle for the binary classification datasets.
# The order of the two roc_auc modules matters for logging: macro first, then weighted
# (the combined result keys are told apart by position).
binary_metrics = evaluate.combine(
    ['accuracy', 'f1', 'recall', 'precision']
    + [evaluate.load('roc_auc', average=avg) for avg in ('macro', 'weighted')]
    + [evaluate.load("Vlasta/pr_auc")]
)
# binary_metrics.compute(references=[0,1,1,1], predictions=[0,0,1,1], prediction_scores=[0.4,0.3,0.6,0.9])


In [8]:
import pandas as pd
import numpy as np
from random import random, randrange
from transformers import AutoModelForSequenceClassification
from transformers import TrainingArguments, Trainer
from datasets import Dataset, DatasetDict, load_metric
import torch

def compute_metrics_binary(eval_preds):
    """Compute the combined binary metrics for one evaluation pass.

    Args:
        eval_preds: pair ``(logits, labels)``; ``logits`` is converted with
            ``torch.from_numpy`` and indexed as ``[:, 1]``, so it is expected
            to be a 2-D NumPy array with a column for class 1.

    Returns:
        The result of ``binary_metrics.compute`` for the hard predictions, the
        reference labels and the class-1 probabilities.
    """
    logits, labels = eval_preds

    # Softmax in double precision turns the logits into class probabilities.
    logits_tensor = torch.from_numpy(logits).double()
    class_probs = torch.softmax(logits_tensor, dim=-1).numpy()

    # Hard prediction = most probable class; the score-based metrics
    # only receive the probability assigned to label 1.
    hard_predictions = class_probs.argmax(axis=-1)
    positive_scores = class_probs[:, 1]

    return binary_metrics.compute(
        predictions=hard_predictions,
        references=labels,
        prediction_scores=positive_scores,
    )
    
#TODO richer metrics for the human_ensembl_regulatory dataset (currently accuracy only)
def compute_metrics_multi(eval_preds):
    """Compute accuracy for datasets with more than two classes.

    Args:
        eval_preds: pair ``(logits, labels)``; the predicted class is the
            argmax of ``logits`` over the last axis.

    Returns:
        The result of the accuracy metric's ``compute`` call.
    """
    logits, labels = eval_preds
    predictions = np.argmax(logits, axis=-1)
    # Loaded through `evaluate`, like the binary metrics, instead of the
    # deprecated `datasets.load_metric`.
    metric = evaluate.load("accuracy")
    return metric.compute(predictions=predictions, references=labels)

# One metrics row per (dataset, run); turned into a DataFrame after the loop.
outputs = []

for dataset_name, dataset_version in tqdm(DATASETS):
    # Class names are the entries of the train folder; sorting fixes the label ids.
    labels = sorted([x.stem for x in (benchmark_root / dataset_name / 'train').iterdir()])

    tmp_dict = {}

    for split in ['train', 'test']:
        for nlabel, label in enumerate(labels):
            for f in (benchmark_root / dataset_name / split / label).glob('*.txt'):
                # Optional sub-sampling: thinning is off when DATASET_THINING is falsy or 1;
                # otherwise each file is kept with probability DATASET_THINING.
                if DATASET_THINING and DATASET_THINING != 1 and random() >= DATASET_THINING:
                    continue
                # The key must include the split: the same "<label>/<stem>.txt" name can exist
                # under both train/ and test/, and a key without the split would let the test
                # entry silently overwrite the train entry (shrinking the training set).
                tmp_dict[f"{split} {label} {f.stem}"] = (split, nlabel, f.read_text())

    df = pd.DataFrame.from_dict(tmp_dict).T.rename(columns = {0: "dset", 1: "cat", 2: "seq"})

    ds = Dataset.from_pandas(df)

    # '__index_level_0__' is the column holding the DataFrame index (the dict keys above).
    tok_ds = ds.map(tok_func, batched=False, remove_columns=['__index_level_0__', 'seq'])
    tok_ds = tok_ds.rename_columns({'cat':'labels'})

    dds = DatasetDict({
        'train': tok_ds.filter(lambda x: x["dset"] == "train").remove_columns('dset'),
        'test':  tok_ds.filter(lambda x: x["dset"] == "test").remove_columns('dset')
    })
    # Hold out 20% of the training data for validation (fixed seed -> same split every run).
    train_valid_split = dds['train'].train_test_split(test_size=0.2, shuffle=True, seed=42)
    dds['train']=train_valid_split['train']
    dds['valid']=train_valid_split['test']

    compute_metrics = compute_metrics_binary if len(labels) == 2 else compute_metrics_multi

    for _ in range(RUNS):
        model_cls = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=len(labels))
        if(RANDOMIZE_WEIGHTS):
            # Keep only the architecture/config and re-create the model with fresh weights.
            # model_cls.init_weights() #Alternative
            model_cls = AutoModelForSequenceClassification.from_config(model_cls.config)
            # NOTE(review): the resize only happens for randomized models; confirm this is
            # intended if a tokenizer with a different vocab is ever used with pretrained weights.
            if(RESIZE_EMBEDDINGS):
                model_cls.resize_token_embeddings(len(tokenizer))

        args = get_trainargs()

        trainer = Trainer(model_cls, args, train_dataset=dds['train'], eval_dataset=dds['valid'],
                          tokenizer=tokenizer, compute_metrics=compute_metrics,
                          callbacks=callbacks)
        trainer.train()
        # The 'test' prefix makes the test metrics appear as 'test_*' entries in the log history.
        trainer.evaluate(dds['test'], metric_key_prefix='test')
        training_log = get_log_from_history(trainer.state.log_history, dataset_name=dataset_name)
        outputs.append(training_log)
  

  0%|          | 0/6 [00:00<?, ?it/s]

  0%|          | 0/100000 [00:00<?, ?ex/s]

  0%|          | 0/100 [00:00<?, ?ba/s]

  0%|          | 0/100 [00:00<?, ?ba/s]

Some weights of the model checkpoint at armheb/DNA_bert_6 were not used when initializing BertForSequenceClassification: ['cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.predictions.decoder.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at armheb/DNA_bert_6 and are n

Epoch,Training Loss,Validation Loss,Accuracy,F1,Recall,Precision,Rocauc 0 Roc Auc,Rocauc 1 Roc Auc,Pr Auc
0,0.4668,0.359022,0.841267,0.823695,0.741699,0.926074,0.93876,0.93876,0.93891
1,0.3313,0.331437,0.854933,0.86624,0.939592,0.803512,0.946566,0.946566,0.946362
2,0.2983,0.296228,0.879467,0.882658,0.906788,0.85978,0.949679,0.949679,0.949599
3,0.2852,0.313799,0.866333,0.874899,0.934925,0.822115,0.950455,0.950455,0.949793
4,0.263,0.398965,0.850667,0.864767,0.955061,0.790072,0.951299,0.951299,0.950616
5,0.2527,0.349423,0.8496,0.830503,0.737032,0.951127,0.952094,0.952094,0.951372
6,0.244,0.283413,0.8824,0.880342,0.865315,0.895899,0.951553,0.951553,0.9507
7,0.2385,0.306369,0.879733,0.883613,0.913188,0.855893,0.950738,0.950738,0.949924
8,0.2283,0.295798,0.883267,0.882586,0.877584,0.887645,0.950768,0.950768,0.950321
9,0.2077,0.355113,0.868,0.876666,0.938392,0.82256,0.94993,0.94993,0.94938


***** Running Evaluation *****
  Num examples = 15000
  Batch size = 32
Saving model checkpoint to outputs/checkpoint-937
Configuration saved in outputs/checkpoint-937/config.json
Model weights saved in outputs/checkpoint-937/pytorch_model.bin
tokenizer config file saved in outputs/checkpoint-937/tokenizer_config.json
Special tokens file saved in outputs/checkpoint-937/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 15000
  Batch size = 32
Saving model checkpoint to outputs/checkpoint-1874
Configuration saved in outputs/checkpoint-1874/config.json
Model weights saved in outputs/checkpoint-1874/pytorch_model.bin
tokenizer config file saved in outputs/checkpoint-1874/tokenizer_config.json
Special tokens file saved in outputs/checkpoint-1874/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 15000
  Batch size = 32
Saving model checkpoint to outputs/checkpoint-2811
Configuration saved in outputs/checkpoint-2811/config.json
Model weights saved i

early stopping required metric_for_best_model, but did not find eval_loss so early stopping is disabled


  0%|          | 0/100000 [00:00<?, ?ex/s]

  0%|          | 0/100 [00:00<?, ?ba/s]

  0%|          | 0/100 [00:00<?, ?ba/s]

loading configuration file https://huggingface.co/armheb/DNA_bert_6/resolve/main/config.json from cache at /home/jovyan/.cache/huggingface/transformers/2697389de18c4fe8c3497cea35aaf65130fdd59c3ab64cb6b1c2e0632fefdaf0.3a7e1ca237211e6405270f85616f49989aeee994db35f6593a40c7b5081a50d0
Model config BertConfig {
  "_name_or_path": "armheb/DNA_bert_6",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_ids": 0,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "num_rnn_layer": 1,
  "output_past": true,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "rnn": "lstm",
  "rnn_dropout": 0.0,
  "rnn_hidden": 768,
  "split": 10,
  "transformers_version": "4.19.2",

Epoch,Training Loss,Validation Loss,Accuracy,F1,Recall,Precision,Rocauc 0 Roc Auc,Rocauc 1 Roc Auc,Pr Auc
0,0.4414,0.263179,0.890733,0.881669,0.814242,0.961272,0.97019,0.97019,0.969735
1,0.2409,0.200768,0.921867,0.923559,0.944126,0.903868,0.977549,0.977549,0.976894
2,0.1966,0.312719,0.890333,0.878804,0.795306,0.98189,0.979827,0.979827,0.979532
3,0.1733,0.177774,0.931867,0.932398,0.939859,0.925056,0.98119,0.98119,0.981008
4,0.1653,0.227882,0.9188,0.914189,0.865182,0.969081,0.981425,0.981425,0.981546
5,0.1445,0.209098,0.923267,0.919617,0.877984,0.965396,0.981925,0.981925,0.982166
6,0.1349,0.200828,0.9296,0.927393,0.89932,0.957275,0.981694,0.981694,0.982098
7,0.1329,0.217016,0.9286,0.926031,0.893986,0.960458,0.982271,0.982271,0.982844
8,0.1114,0.188351,0.933133,0.931755,0.913055,0.951236,0.982581,0.982581,0.98362


***** Running Evaluation *****
  Num examples = 15000
  Batch size = 32
Saving model checkpoint to outputs/checkpoint-937
Configuration saved in outputs/checkpoint-937/config.json
Model weights saved in outputs/checkpoint-937/pytorch_model.bin
tokenizer config file saved in outputs/checkpoint-937/tokenizer_config.json
Special tokens file saved in outputs/checkpoint-937/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 15000
  Batch size = 32
Saving model checkpoint to outputs/checkpoint-1874
Configuration saved in outputs/checkpoint-1874/config.json
Model weights saved in outputs/checkpoint-1874/pytorch_model.bin
tokenizer config file saved in outputs/checkpoint-1874/tokenizer_config.json
Special tokens file saved in outputs/checkpoint-1874/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 15000
  Batch size = 32
Saving model checkpoint to outputs/checkpoint-2811
Configuration saved in outputs/checkpoint-2811/config.json
Model weights saved i

early stopping required metric_for_best_model, but did not find eval_loss so early stopping is disabled


  0%|          | 0/27791 [00:00<?, ?ex/s]

  0%|          | 0/28 [00:00<?, ?ba/s]

  0%|          | 0/28 [00:00<?, ?ba/s]

loading configuration file https://huggingface.co/armheb/DNA_bert_6/resolve/main/config.json from cache at /home/jovyan/.cache/huggingface/transformers/2697389de18c4fe8c3497cea35aaf65130fdd59c3ab64cb6b1c2e0632fefdaf0.3a7e1ca237211e6405270f85616f49989aeee994db35f6593a40c7b5081a50d0
Model config BertConfig {
  "_name_or_path": "armheb/DNA_bert_6",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_ids": 0,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "num_rnn_layer": 1,
  "output_past": true,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "rnn": "lstm",
  "rnn_dropout": 0.0,
  "rnn_hidden": 768,
  "split": 10,
  "transformers_version": "4.19.2",

Epoch,Training Loss,Validation Loss,Accuracy,F1,Recall,Precision,Rocauc 0 Roc Auc,Rocauc 1 Roc Auc,Pr Auc
1,No log,0.637141,0.650276,0.721224,0.898095,0.602556,0.7617,0.7617,0.763866
2,0.638500,0.637212,0.670425,0.730165,0.885238,0.621324,0.771712,0.771712,0.772205
3,0.638500,0.660938,0.653634,0.538068,0.400476,0.819688,0.775239,0.775239,0.776302
4,0.579500,0.569786,0.69705,0.691424,0.67381,0.709985,0.777975,0.777975,0.779194
5,0.579500,0.725942,0.656752,0.540019,0.4,0.830861,0.778909,0.778909,0.780747
6,0.543000,0.576237,0.69657,0.721427,0.78,0.671036,0.779271,0.779271,0.781553
7,0.543000,0.57523,0.69657,0.717571,0.765238,0.675494,0.77813,0.77813,0.780415
8,0.502000,0.602294,0.698489,0.716956,0.758095,0.680051,0.779144,0.779144,0.780768
9,0.502000,0.63303,0.686975,0.720736,0.801905,0.654489,0.777184,0.777184,0.779546


***** Running Evaluation *****
  Num examples = 4169
  Batch size = 32
Saving model checkpoint to outputs/checkpoint-261
Configuration saved in outputs/checkpoint-261/config.json
Model weights saved in outputs/checkpoint-261/pytorch_model.bin
tokenizer config file saved in outputs/checkpoint-261/tokenizer_config.json
Special tokens file saved in outputs/checkpoint-261/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 4169
  Batch size = 32
Saving model checkpoint to outputs/checkpoint-522
Configuration saved in outputs/checkpoint-522/config.json
Model weights saved in outputs/checkpoint-522/pytorch_model.bin
tokenizer config file saved in outputs/checkpoint-522/tokenizer_config.json
Special tokens file saved in outputs/checkpoint-522/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 4169
  Batch size = 32
Saving model checkpoint to outputs/checkpoint-783
Configuration saved in outputs/checkpoint-783/config.json
Model weights saved in outputs/

early stopping required metric_for_best_model, but did not find eval_loss so early stopping is disabled


  0%|          | 0/123872 [00:00<?, ?ex/s]

  0%|          | 0/124 [00:00<?, ?ba/s]

  0%|          | 0/124 [00:00<?, ?ba/s]

loading configuration file https://huggingface.co/armheb/DNA_bert_6/resolve/main/config.json from cache at /home/jovyan/.cache/huggingface/transformers/2697389de18c4fe8c3497cea35aaf65130fdd59c3ab64cb6b1c2e0632fefdaf0.3a7e1ca237211e6405270f85616f49989aeee994db35f6593a40c7b5081a50d0
Model config BertConfig {
  "_name_or_path": "armheb/DNA_bert_6",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_ids": 0,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "num_rnn_layer": 1,
  "output_past": true,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "rnn": "lstm",
  "rnn_dropout": 0.0,
  "rnn_hidden": 768,
  "split": 10,
  "transformers_version": "4.19.2",

Epoch,Training Loss,Validation Loss,Accuracy,F1,Recall,Precision,Rocauc 0 Roc Auc,Rocauc 1 Roc Auc,Pr Auc
0,0.589,0.537988,0.735698,0.722965,0.685641,0.764587,0.813031,0.813031,0.803416
1,0.5208,0.521988,0.737904,0.770953,0.876953,0.687815,0.841642,0.841642,0.838189
2,0.4929,0.508108,0.749206,0.78085,0.888294,0.696593,0.853107,0.853107,0.853026
3,0.4573,0.467618,0.773747,0.784984,0.8211,0.751911,0.860745,0.860745,0.861569
4,0.4342,0.457283,0.786664,0.791763,0.806334,0.777709,0.867022,0.867022,0.865452
5,0.4121,0.544205,0.755019,0.789337,0.912476,0.695482,0.868947,0.868947,0.868082
6,0.394,0.523839,0.756041,0.790032,0.912476,0.696561,0.869531,0.869531,0.867299
7,0.3727,0.511268,0.771918,0.797477,0.892788,0.720553,0.872595,0.872595,0.870383
8,0.3508,0.483665,0.790001,0.80275,0.849561,0.760828,0.873064,0.873064,0.869904
9,0.324,0.559181,0.76223,0.794874,0.9159,0.7021,0.872702,0.872702,0.868708


***** Running Evaluation *****
  Num examples = 18581
  Batch size = 32
Saving model checkpoint to outputs/checkpoint-1161
Configuration saved in outputs/checkpoint-1161/config.json
Model weights saved in outputs/checkpoint-1161/pytorch_model.bin
tokenizer config file saved in outputs/checkpoint-1161/tokenizer_config.json
Special tokens file saved in outputs/checkpoint-1161/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 18581
  Batch size = 32
Saving model checkpoint to outputs/checkpoint-2322
Configuration saved in outputs/checkpoint-2322/config.json
Model weights saved in outputs/checkpoint-2322/pytorch_model.bin
tokenizer config file saved in outputs/checkpoint-2322/tokenizer_config.json
Special tokens file saved in outputs/checkpoint-2322/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 18581
  Batch size = 32
Saving model checkpoint to outputs/checkpoint-3483
Configuration saved in outputs/checkpoint-3483/config.json
Model weights sa

early stopping required metric_for_best_model, but did not find eval_loss so early stopping is disabled


  0%|          | 0/25284 [00:00<?, ?ex/s]

  0%|          | 0/26 [00:00<?, ?ba/s]

  0%|          | 0/26 [00:00<?, ?ba/s]

loading configuration file https://huggingface.co/armheb/DNA_bert_6/resolve/main/config.json from cache at /home/jovyan/.cache/huggingface/transformers/2697389de18c4fe8c3497cea35aaf65130fdd59c3ab64cb6b1c2e0632fefdaf0.3a7e1ca237211e6405270f85616f49989aeee994db35f6593a40c7b5081a50d0
Model config BertConfig {
  "_name_or_path": "armheb/DNA_bert_6",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_ids": 0,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "num_rnn_layer": 1,
  "output_past": true,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "rnn": "lstm",
  "rnn_dropout": 0.0,
  "rnn_hidden": 768,
  "split": 10,
  "transformers_version": "4.19.2",

Epoch,Training Loss,Validation Loss,Accuracy,F1,Recall,Precision,Rocauc 0 Roc Auc,Rocauc 1 Roc Auc,Pr Auc
0,No log,0.351924,0.880308,0.680886,0.557796,0.873684,0.867825,0.867825,0.768506
1,No log,0.412571,0.837846,0.694138,0.803763,0.610827,0.890693,0.890693,0.816737
2,0.386000,0.314599,0.888615,0.755405,0.751344,0.759511,0.902292,0.902292,0.838139
3,0.386000,0.256909,0.912,0.789086,0.719086,0.874183,0.913612,0.913612,0.859034
4,0.289900,0.26998,0.903385,0.784341,0.767473,0.801966,0.919733,0.919733,0.868141
5,0.289900,0.411083,0.838154,0.709713,0.864247,0.60206,0.926409,0.926409,0.881413
6,0.289900,0.239711,0.918769,0.805596,0.735215,0.890879,0.928694,0.928694,0.885639
7,0.226800,0.264367,0.920923,0.795545,0.672043,0.974659,0.932258,0.932258,0.890557
8,0.226800,0.449071,0.821231,0.691777,0.876344,0.571429,0.93346,0.93346,0.893037
9,0.195400,0.394135,0.846462,0.722006,0.870968,0.616556,0.934838,0.934838,0.896732


***** Running Evaluation *****
  Num examples = 3250
  Batch size = 32
Saving model checkpoint to outputs/checkpoint-203
Configuration saved in outputs/checkpoint-203/config.json
Model weights saved in outputs/checkpoint-203/pytorch_model.bin
tokenizer config file saved in outputs/checkpoint-203/tokenizer_config.json
Special tokens file saved in outputs/checkpoint-203/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 3250
  Batch size = 32
Saving model checkpoint to outputs/checkpoint-406
Configuration saved in outputs/checkpoint-406/config.json
Model weights saved in outputs/checkpoint-406/pytorch_model.bin
tokenizer config file saved in outputs/checkpoint-406/tokenizer_config.json
Special tokens file saved in outputs/checkpoint-406/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 3250
  Batch size = 32
Saving model checkpoint to outputs/checkpoint-609
Configuration saved in outputs/checkpoint-609/config.json
Model weights saved in outputs/

early stopping required metric_for_best_model, but did not find eval_loss so early stopping is disabled


  0%|          | 0/139804 [00:00<?, ?ex/s]

  0%|          | 0/140 [00:00<?, ?ba/s]

  0%|          | 0/140 [00:00<?, ?ba/s]

loading configuration file https://huggingface.co/armheb/DNA_bert_6/resolve/main/config.json from cache at /home/jovyan/.cache/huggingface/transformers/2697389de18c4fe8c3497cea35aaf65130fdd59c3ab64cb6b1c2e0632fefdaf0.3a7e1ca237211e6405270f85616f49989aeee994db35f6593a40c7b5081a50d0
Model config BertConfig {
  "_name_or_path": "armheb/DNA_bert_6",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_ids": 0,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "num_rnn_layer": 1,
  "output_past": true,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "rnn": "lstm",
  "rnn_dropout": 0.0,
  "rnn_hidden": 768,
  "split": 10,
  "transformers_version": "4.19.2",

Epoch,Training Loss,Validation Loss,Accuracy,F1,Recall,Precision,Rocauc 0 Roc Auc,Rocauc 1 Roc Auc,Pr Auc
1,0.6699,0.637302,0.63688,0.577437,0.490386,0.702064,0.71372,0.71372,0.699333
2,0.6159,0.608055,0.663774,0.714129,0.830066,0.62661,0.738554,0.738554,0.724971
3,0.5933,0.622405,0.655238,0.59382,0.498115,0.735049,0.746098,0.746098,0.731554
4,0.5728,0.620742,0.67498,0.645812,0.585674,0.719713,0.752352,0.752352,0.73979
5,0.5646,0.587065,0.685995,0.709284,0.757116,0.667137,0.755753,0.755753,0.743848
6,0.5457,0.623846,0.676363,0.647374,0.587182,0.721315,0.757636,0.757636,0.746434
7,0.541,0.595349,0.684946,0.715571,0.783318,0.65861,0.756193,0.756193,0.744424
8,0.5164,0.591672,0.684088,0.711618,0.770405,0.661166,0.753384,0.753384,0.742172
9,0.5069,0.605801,0.678604,0.717874,0.8082,0.645708,0.750627,0.750627,0.740015
10,0.4852,0.684876,0.668828,0.639763,0.581244,0.711385,0.746025,0.746025,0.736412


***** Running Evaluation *****
  Num examples = 20971
  Batch size = 32
Saving model checkpoint to outputs/checkpoint-1311
Configuration saved in outputs/checkpoint-1311/config.json
Model weights saved in outputs/checkpoint-1311/pytorch_model.bin
tokenizer config file saved in outputs/checkpoint-1311/tokenizer_config.json
Special tokens file saved in outputs/checkpoint-1311/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 20971
  Batch size = 32
Saving model checkpoint to outputs/checkpoint-2622
Configuration saved in outputs/checkpoint-2622/config.json
Model weights saved in outputs/checkpoint-2622/pytorch_model.bin
tokenizer config file saved in outputs/checkpoint-2622/tokenizer_config.json
Special tokens file saved in outputs/checkpoint-2622/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 20971
  Batch size = 32
Saving model checkpoint to outputs/checkpoint-3933
Configuration saved in outputs/checkpoint-3933/config.json
Model weights sa

early stopping required metric_for_best_model, but did not find eval_loss so early stopping is disabled


## Outputs

In [9]:
# Collect the per-run metric rows gathered in `outputs` into one table and display it.
outputs_df = pd.DataFrame(outputs)
outputs_df

Unnamed: 0,dataset,test_acc,test_f1,test_loss,test_precision,test_recall,test_auroc_macro,test_auroc_weighted,test_pr_auc,min_valid_loss_epoch,min_valid_loss_log
0,demo_coding_vs_intergenomic_seqs,0.87652,0.874077,0.290276,0.891719,0.85712,0.949646,0.949646,0.950382,7.0,"{'eval_loss': 0.2834126949310303, 'eval_accura..."
1,demo_human_or_worm,0.93236,0.933022,0.177359,0.923982,0.94224,0.981574,0.981574,0.981312,4.0,"{'eval_loss': 0.1777740865945816, 'eval_accura..."
2,human_enhancers_cohn,0.701209,0.693805,0.574378,0.711434,0.677029,0.776039,0.776039,0.771165,4.0,"{'eval_loss': 0.5697855949401855, 'eval_accura..."
3,human_enhancers_ensembl,0.787504,0.792443,0.45615,0.774442,0.811301,0.867828,0.867828,0.863502,5.0,"{'eval_loss': 0.45728257298469543, 'eval_accur..."
4,human_nontata_promoters,0.836839,0.826466,0.544922,0.980721,0.71414,0.926644,0.926644,0.951273,11.0,"{'eval_loss': 0.2309839427471161, 'eval_accura..."
5,human_ocr_ensembl,0.682994,0.704659,0.593976,0.659581,0.756352,0.750149,0.750149,0.732775,5.0,"{'eval_loss': 0.5870654582977295, 'eval_accura..."


In [10]:
# outputs_df.groupby('dataset').agg({'accuracy' : ['mean', 'sem'], 'f1' : ['mean','sem'], 'train_runtime': ['mean', 'sem']})

In [11]:
# Save the metrics table to OUTPUT_PATH as a CSV file; index=False leaves out the DataFrame index.
outputs_df.to_csv(OUTPUT_PATH, index=False)