In [1]:
!pip install -qq transformers genomic-benchmarks datasets

[K     |████████████████████████████████| 4.4 MB 14.8 MB/s 
[K     |████████████████████████████████| 362 kB 56.1 MB/s 
[K     |████████████████████████████████| 101 kB 12.0 MB/s 
[K     |████████████████████████████████| 6.6 MB 43.4 MB/s 
[K     |████████████████████████████████| 596 kB 17.2 MB/s 
[K     |████████████████████████████████| 2.3 MB 44.7 MB/s 
[K     |████████████████████████████████| 271 kB 48.6 MB/s 
[K     |████████████████████████████████| 140 kB 45.8 MB/s 
[K     |████████████████████████████████| 212 kB 54.0 MB/s 
[K     |████████████████████████████████| 1.1 MB 18.7 MB/s 
[K     |████████████████████████████████| 127 kB 46.5 MB/s 
[K     |████████████████████████████████| 94 kB 4.3 MB/s 
[K     |████████████████████████████████| 144 kB 61.4 MB/s 
[?25h  Building wheel for genomic-benchmarks (setup.py) ... [?25l[?25hdone
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is t

In [2]:
# Colab only: attach Google Drive at /content/drive. OUTPUT_PATH in the
# parameters cell points below drive/MyDrive, so this cell has to run before
# any results are written.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [1]:
### Parameters

# --- model and tokenizer ---
MODEL_NAME = "armheb/DNA_bert_6"      # original DNABert model
TOKENIZER_NAME = "armheb/DNA_bert_6"

# --- k-mer splitting ---
K = 6        # length of each k-mer token
STRIDE = 1   # 1 -> overlapping k-mers (step 1); any other value -> non-overlapping k-mers (step K)

# --- data ---
# Fraction of each dataset to keep: a value strictly between 0 and 1 keeps each
# sample with that probability; 1 (or a falsy value) keeps every sample.
DATASET_THINING = 1

# Folder in which the downloaded benchmark datasets are looked up.
BENCHMARKS_FOLDER = '/root/.genomic_benchmarks'
# BENCHMARKS_FOLDER = '/home/jovyan/.genomic_benchmarks/' (for INFRA HUB)

# (dataset name, dataset version) pairs to download and benchmark.
DATASETS = [
    ('demo_coding_vs_intergenomic_seqs', 0),
    ('demo_human_or_worm', 0),
    ('human_enhancers_cohn', 0),
    ('human_nontata_promoters', 0),
]

# --- training ---
BATCH_SIZE = 16        # per-device train batch size (evaluation uses twice this)
ACCUMULATION = 4       # gradient accumulation steps
LEARNING_RATE = 1e-5
EPOCHS = 4
RUNS = 5               # independent fine-tuning runs per dataset

# --- results ---
# CSV file the results are written to; do not forget to attach drive first.
OUTPUT_PATH = 'drive/MyDrive/genomic_benchmarks/DNABERT.csv'

## Download benchmark datasets and tokenizer

In [2]:
from genomic_benchmarks.loc2seq import download_dataset
from genomic_benchmarks.data_check.info import is_downloaded
from pathlib import Path
from tqdm.autonotebook import tqdm

# Download every configured benchmark that `is_downloaded` does not already
# report as present.
for dataset_name, dataset_version in tqdm(DATASETS):
    if is_downloaded(dataset_name):
        continue
    download_dataset(dataset_name, version=dataset_version, use_cloud_cache=True)

# Root folder the training loop reads the <dataset>/<split>/<label>/*.txt files from.
benchmark_root = Path(BENCHMARKS_FOLDER)

  from tqdm.autonotebook import tqdm


  0%|          | 0/4 [00:00<?, ?it/s]

Downloading...
From: https://drive.google.com/uc?id=1cpXg0ULuTGF7h1_HTYvc6p8M-ee43t-v
To: /root/.genomic_benchmarks/demo_coding_vs_intergenomic_seqs.zip

  0%|          | 0.00/33.9M [00:00<?, ?B/s][A
100%|██████████| 33.9M/33.9M [00:00<00:00, 227MB/s]
Downloading...
From: https://drive.google.com/uc?id=1JW0-eTB-rJXvFcglqBo3pFZi1kyIWC3X
To: /root/.genomic_benchmarks/demo_human_or_worm.zip

  0%|          | 0.00/28.9M [00:00<?, ?B/s][A
100%|██████████| 28.9M/28.9M [00:00<00:00, 231MB/s]
Downloading...
From: https://drive.google.com/uc?id=176563cDPQ5Y094WyoSBF02QjoVQhWuCh
To: /root/.genomic_benchmarks/human_enhancers_cohn.zip

  0%|          | 0.00/11.9M [00:00<?, ?B/s][A
100%|██████████| 11.9M/11.9M [00:00<00:00, 52.7MB/s]
Downloading...
From: https://drive.google.com/uc?id=1VdUg0Zu8yfLS6QesBXwGz1PIQrTW3Ze4
To: /root/.genomic_benchmarks/human_nontata_promoters.zip

  0%|          | 0.00/11.8M [00:00<?, ?B/s][A
100%|██████████| 11.8M/11.8M [00:00<00:00, 85.9MB/s]


In [3]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_NAME)

Downloading:   0%|          | 0.00/40.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.08k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/28.0k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/112 [00:00<?, ?B/s]

In [4]:
def kmers_strideK(s, k=K):
    """Split `s` into consecutive, non-overlapping k-mers (step `k`).

    A trailing chunk shorter than `k` is dropped, so every returned piece has
    exactly `k` characters.
    """
    last_start = len(s) - k  # largest start index that still yields a full k-mer
    return [s[start:start + k] for start in range(0, last_start + 1, k)]

def kmers_stride1(s, k=K):
    """Return every overlapping k-mer of `s`, sliding the window one character at a time."""
    window_count = len(s) - k + 1  # number of full-length windows that fit in `s`
    return [s[pos:pos + k] for pos in range(window_count)]

# Choose the k-mer splitter once: overlapping windows for STRIDE == 1,
# non-overlapping windows (step K) for any other STRIDE value.
kmers = kmers_stride1 if STRIDE == 1 else kmers_strideK

# function used for the actual tokenization
def tok_func(x, max_length=512):
    """Tokenize one sample for the model.

    The DNA string in ``x["seq"]`` is split into k-mers with the module-level
    ``kmers`` function, the k-mers are joined with single spaces, and the
    resulting "sentence" is passed to the module-level ``tokenizer``.

    Parameters
    ----------
    x : mapping
        A sample with the raw sequence under the key ``"seq"``.
    max_length : int, default 512
        Upper bound on the number of tokens (special tokens included). The
        default equals ``max_position_embeddings`` of the DNABERT config, so a
        sequence that is too long is truncated instead of crashing the model
        with an out-of-range position index.

    Returns
    -------
    The tokenizer output (``input_ids``, ``token_type_ids``,
    ``attention_mask`` for the tokenizer used in this notebook).
    """
    return tokenizer(" ".join(kmers(x["seq"])), truncation=True, max_length=max_length)

# example
tok_func({'seq': 'ATGGAAAGAGGCACCATTCT'})

{'input_ids': [2, 501, 1989, 3848, 3089, 56, 212, 835, 3325, 999, 3983, 3629, 2214, 650, 2587, 2142, 3], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

## Looping through datasets, fine-tuning the model for each of them, logging metrics

In [5]:
import pandas as pd
import numpy as np
from random import random, randrange
from transformers import AutoModelForSequenceClassification
from transformers import TrainingArguments, Trainer
from datasets import Dataset, DatasetDict, load_metric
from random import random, randrange

def compute_metrics_binary(eval_preds):
    """Score a two-class evaluation pass with the GLUE/MRPC metric.

    `eval_preds` unpacks into `(logits, labels)`; the predicted class of each
    sample is the index of its largest logit along the last axis.
    Returns whatever the metric's `compute` returns.
    """
    glue_mrpc = load_metric("glue", "mrpc")
    scores, references = eval_preds
    return glue_mrpc.compute(
        predictions=np.argmax(scores, axis=-1),
        references=references,
    )

def compute_metrics_multi(eval_preds):
    """Score a multi-class evaluation pass with the plain accuracy metric.

    `eval_preds` unpacks into `(logits, labels)`; the predicted class of each
    sample is the index of its largest logit along the last axis.
    Returns whatever the metric's `compute` returns.
    """
    accuracy_metric = load_metric("accuracy")
    scores, references = eval_preds
    return accuracy_metric.compute(
        predictions=np.argmax(scores, axis=-1),
        references=references,
    )

# One (dataset, best accuracy, best f1, train runtime) tuple per fine-tuning run.
outputs = []

for dataset_name, dataset_version in tqdm(DATASETS):

    # Class names are the entries of the train folder; sorting them fixes the
    # integer id (position in the list) assigned to each class.
    labels = sorted([x.stem for x in (benchmark_root / dataset_name / 'train').iterdir()])

    # key -> (split, label id, raw sequence); one entry per kept sample
    tmp_dict = {}

    for split in ['train', 'test']:
        for nlabel, label in enumerate(labels):
            for f in (benchmark_root / dataset_name / split / label).glob('*.txt'):
                # Thinning: when DATASET_THINING is a fraction other than 1, keep the
                # sample only with that probability. random() is drawn only in that case.
                if DATASET_THINING and DATASET_THINING != 1 and random() >= DATASET_THINING:
                    continue
                # The split is part of the key so that a train file and a test file
                # sharing label and file name cannot overwrite each other.
                tmp_dict[f"{split} {label} {f.stem}"] = (split, nlabel, f.read_text())

    # Rows = samples, columns = split / label id / sequence.
    df = pd.DataFrame.from_dict(tmp_dict).T.rename(columns = {0: "dset", 1: "cat", 2: "seq"})

    ds = Dataset.from_pandas(df)

    # The string index of `df` arrives as the '__index_level_0__' column; it and the
    # raw sequence are dropped once the token ids have been computed.
    tok_ds = ds.map(tok_func, batched=False, remove_columns=['__index_level_0__', 'seq'])
    tok_ds = tok_ds.rename_columns({'cat':'labels'})

    dds = DatasetDict({
        'train': tok_ds.filter(lambda x: x["dset"] == "train").remove_columns('dset'),
        'test':  tok_ds.filter(lambda x: x["dset"] == "test").remove_columns('dset')
    })

    # Two classes -> GLUE/MRPC metric (accuracy and F1 are read from the logs below);
    # otherwise accuracy only.
    compute_metrics = compute_metrics_binary if len(labels) == 2 else compute_metrics_multi

    for _ in range(RUNS):

        # Start every run from the pretrained checkpoint with a fresh classification head.
        model_cls = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=len(labels))

        # Each run draws its own random seed; the seed is not stored with the results.
        args = TrainingArguments(
            'outputs',
            learning_rate=LEARNING_RATE,
            warmup_ratio=0.1,
            lr_scheduler_type='cosine',
            fp16=True,
            evaluation_strategy="epoch",
            per_device_train_batch_size=BATCH_SIZE,
            per_device_eval_batch_size=BATCH_SIZE*2,
            gradient_accumulation_steps=ACCUMULATION,
            num_train_epochs=EPOCHS,
            weight_decay=0.01,
            save_steps=100000,
            seed=randrange(1,10001),
            report_to='none',
        )

        trainer = Trainer(model_cls, args, train_dataset=dds['train'], eval_dataset=dds['test'],
                          tokenizer=tokenizer, compute_metrics=compute_metrics)
        trainer.train()

        # NOTE(review): the evaluation set is the test split and the best value over all
        # epochs is reported, so these numbers are selected on test data.
        max_accuracy = max([x['eval_accuracy'] for x in trainer.state.log_history if 'eval_accuracy' in x])
        max_f1 = max([x['eval_f1'] for x in trainer.state.log_history if 'eval_f1' in x]) if len(labels) == 2 else np.nan
        train_runtime = max([x['train_runtime'] for x in trainer.state.log_history if 'train_runtime' in x])

        # Rewrite the CSV after every run so that finished runs survive an interruption.
        outputs.append((dataset_name, max_accuracy, max_f1, train_runtime))
        outputs_df = pd.DataFrame(outputs, columns = ['dataset', 'accuracy', 'f1', 'train_runtime'])
        outputs_df.to_csv(OUTPUT_PATH, index=False)



  0%|          | 0/4 [00:00<?, ?it/s]



  0%|          | 0/100000 [00:00<?, ?ex/s]

  0%|          | 0/100 [00:00<?, ?ba/s]

  0%|          | 0/100 [00:00<?, ?ba/s]

Downloading:   0%|          | 0.00/343M [00:00<?, ?B/s]

Some weights of the model checkpoint at armheb/DNA_bert_6 were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.decoder.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at armheb/DNA_bert_6 and are n

Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.2238,0.211715,0.91596,0.914763
2,0.1934,0.193344,0.92428,0.924765
3,0.173,0.21045,0.91952,0.922245
4,0.1643,0.199079,0.92328,0.925008


***** Running Evaluation *****
  Num examples = 25000
  Batch size = 32


Downloading builder script:   0%|          | 0.00/1.84k [00:00<?, ?B/s]

***** Running Evaluation *****
  Num examples = 25000
  Batch size = 32
***** Running Evaluation *****
  Num examples = 25000
  Batch size = 32
***** Running Evaluation *****
  Num examples = 25000
  Batch size = 32


Training completed. Do not forget to share your model on huggingface.co/models =)


loading configuration file https://huggingface.co/armheb/DNA_bert_6/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/2697389de18c4fe8c3497cea35aaf65130fdd59c3ab64cb6b1c2e0632fefdaf0.3a7e1ca237211e6405270f85616f49989aeee994db35f6593a40c7b5081a50d0
Model config BertConfig {
  "_name_or_path": "armheb/DNA_bert_6",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_ids": 0,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  

Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.2218,0.213207,0.91344,0.916615
2,0.1895,0.195898,0.92324,0.924404
3,0.1723,0.201279,0.92128,0.923715
4,0.1619,0.200362,0.92236,0.924572


***** Running Evaluation *****
  Num examples = 25000
  Batch size = 32
***** Running Evaluation *****
  Num examples = 25000
  Batch size = 32
***** Running Evaluation *****
  Num examples = 25000
  Batch size = 32
***** Running Evaluation *****
  Num examples = 25000
  Batch size = 32


Training completed. Do not forget to share your model on huggingface.co/models =)


loading configuration file https://huggingface.co/armheb/DNA_bert_6/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/2697389de18c4fe8c3497cea35aaf65130fdd59c3ab64cb6b1c2e0632fefdaf0.3a7e1ca237211e6405270f85616f49989aeee994db35f6593a40c7b5081a50d0
Model config BertConfig {
  "_name_or_path": "armheb/DNA_bert_6",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_ids": 0,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size

Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.2249,0.203738,0.91984,0.919324
2,0.1927,0.196514,0.92252,0.924084
3,0.1713,0.201538,0.92064,0.923255
4,0.1647,0.196395,0.92452,0.926032


***** Running Evaluation *****
  Num examples = 25000
  Batch size = 32
***** Running Evaluation *****
  Num examples = 25000
  Batch size = 32
***** Running Evaluation *****
  Num examples = 25000
  Batch size = 32
***** Running Evaluation *****
  Num examples = 25000
  Batch size = 32


Training completed. Do not forget to share your model on huggingface.co/models =)


loading configuration file https://huggingface.co/armheb/DNA_bert_6/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/2697389de18c4fe8c3497cea35aaf65130fdd59c3ab64cb6b1c2e0632fefdaf0.3a7e1ca237211e6405270f85616f49989aeee994db35f6593a40c7b5081a50d0
Model config BertConfig {
  "_name_or_path": "armheb/DNA_bert_6",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_ids": 0,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size

Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.2237,0.223689,0.91324,0.915435
2,0.1906,0.203788,0.91932,0.922175
3,0.1768,0.195386,0.92316,0.925087
4,0.1662,0.196365,0.92392,0.925639


***** Running Evaluation *****
  Num examples = 25000
  Batch size = 32
***** Running Evaluation *****
  Num examples = 25000
  Batch size = 32
***** Running Evaluation *****
  Num examples = 25000
  Batch size = 32
***** Running Evaluation *****
  Num examples = 25000
  Batch size = 32


Training completed. Do not forget to share your model on huggingface.co/models =)


loading configuration file https://huggingface.co/armheb/DNA_bert_6/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/2697389de18c4fe8c3497cea35aaf65130fdd59c3ab64cb6b1c2e0632fefdaf0.3a7e1ca237211e6405270f85616f49989aeee994db35f6593a40c7b5081a50d0
Model config BertConfig {
  "_name_or_path": "armheb/DNA_bert_6",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_ids": 0,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size

Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.2229,0.207749,0.91752,0.918614
2,0.1916,0.201359,0.91984,0.922494
3,0.1752,0.193579,0.92468,0.926183
4,0.1626,0.194693,0.92528,0.926745


***** Running Evaluation *****
  Num examples = 25000
  Batch size = 32
***** Running Evaluation *****
  Num examples = 25000
  Batch size = 32
***** Running Evaluation *****
  Num examples = 25000
  Batch size = 32
***** Running Evaluation *****
  Num examples = 25000
  Batch size = 32


Training completed. Do not forget to share your model on huggingface.co/models =)




  0%|          | 0/100000 [00:00<?, ?ex/s]

  0%|          | 0/100 [00:00<?, ?ba/s]

  0%|          | 0/100 [00:00<?, ?ba/s]

loading configuration file https://huggingface.co/armheb/DNA_bert_6/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/2697389de18c4fe8c3497cea35aaf65130fdd59c3ab64cb6b1c2e0632fefdaf0.3a7e1ca237211e6405270f85616f49989aeee994db35f6593a40c7b5081a50d0
Model config BertConfig {
  "_name_or_path": "armheb/DNA_bert_6",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_ids": 0,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "num_rnn_layer": 1,
  "output_past": true,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "rnn": "lstm",
  "rnn_dropout": 0.0,
  "rnn_hidden": 768,
  "split": 10,
  "transformers_version": "4.20.1",
  "typ

Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.1079,0.130974,0.95044,0.948407
2,0.0862,0.09402,0.9658,0.965358
3,0.0727,0.1064,0.9632,0.962366
4,0.0634,0.102777,0.96488,0.964219


***** Running Evaluation *****
  Num examples = 25000
  Batch size = 32
***** Running Evaluation *****
  Num examples = 25000
  Batch size = 32
***** Running Evaluation *****
  Num examples = 25000
  Batch size = 32
***** Running Evaluation *****
  Num examples = 25000
  Batch size = 32


Training completed. Do not forget to share your model on huggingface.co/models =)


loading configuration file https://huggingface.co/armheb/DNA_bert_6/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/2697389de18c4fe8c3497cea35aaf65130fdd59c3ab64cb6b1c2e0632fefdaf0.3a7e1ca237211e6405270f85616f49989aeee994db35f6593a40c7b5081a50d0
Model config BertConfig {
  "_name_or_path": "armheb/DNA_bert_6",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_ids": 0,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size

Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.1089,0.093534,0.96536,0.965368
2,0.0833,0.088328,0.96664,0.966053
3,0.0715,0.093082,0.96696,0.966445
4,0.0641,0.105397,0.9644,0.963656


***** Running Evaluation *****
  Num examples = 25000
  Batch size = 32
***** Running Evaluation *****
  Num examples = 25000
  Batch size = 32
***** Running Evaluation *****
  Num examples = 25000
  Batch size = 32
***** Running Evaluation *****
  Num examples = 25000
  Batch size = 32


Training completed. Do not forget to share your model on huggingface.co/models =)


loading configuration file https://huggingface.co/armheb/DNA_bert_6/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/2697389de18c4fe8c3497cea35aaf65130fdd59c3ab64cb6b1c2e0632fefdaf0.3a7e1ca237211e6405270f85616f49989aeee994db35f6593a40c7b5081a50d0
Model config BertConfig {
  "_name_or_path": "armheb/DNA_bert_6",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_ids": 0,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size

Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.1081,0.100962,0.96284,0.962231
2,0.0842,0.100241,0.96352,0.962739
3,0.0719,0.099487,0.96528,0.964629
4,0.0648,0.104816,0.96484,0.964179


***** Running Evaluation *****
  Num examples = 25000
  Batch size = 32
***** Running Evaluation *****
  Num examples = 25000
  Batch size = 32
***** Running Evaluation *****
  Num examples = 25000
  Batch size = 32
***** Running Evaluation *****
  Num examples = 25000
  Batch size = 32


Training completed. Do not forget to share your model on huggingface.co/models =)


loading configuration file https://huggingface.co/armheb/DNA_bert_6/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/2697389de18c4fe8c3497cea35aaf65130fdd59c3ab64cb6b1c2e0632fefdaf0.3a7e1ca237211e6405270f85616f49989aeee994db35f6593a40c7b5081a50d0
Model config BertConfig {
  "_name_or_path": "armheb/DNA_bert_6",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_ids": 0,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size

Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.1124,0.119803,0.95416,0.952594
2,0.0869,0.090849,0.96692,0.966484
3,0.0708,0.113691,0.96056,0.959494
4,0.0641,0.108166,0.96236,0.961549


***** Running Evaluation *****
  Num examples = 25000
  Batch size = 32
***** Running Evaluation *****
  Num examples = 25000
  Batch size = 32
***** Running Evaluation *****
  Num examples = 25000
  Batch size = 32
***** Running Evaluation *****
  Num examples = 25000
  Batch size = 32


Training completed. Do not forget to share your model on huggingface.co/models =)


loading configuration file https://huggingface.co/armheb/DNA_bert_6/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/2697389de18c4fe8c3497cea35aaf65130fdd59c3ab64cb6b1c2e0632fefdaf0.3a7e1ca237211e6405270f85616f49989aeee994db35f6593a40c7b5081a50d0
Model config BertConfig {
  "_name_or_path": "armheb/DNA_bert_6",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_ids": 0,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size

Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.1097,0.11938,0.95556,0.954078
2,0.0888,0.096158,0.96512,0.964567
3,0.071,0.091768,0.9674,0.967072
4,0.0624,0.100066,0.9654,0.964816


***** Running Evaluation *****
  Num examples = 25000
  Batch size = 32
***** Running Evaluation *****
  Num examples = 25000
  Batch size = 32
***** Running Evaluation *****
  Num examples = 25000
  Batch size = 32
***** Running Evaluation *****
  Num examples = 25000
  Batch size = 32


Training completed. Do not forget to share your model on huggingface.co/models =)




  0%|          | 0/27791 [00:00<?, ?ex/s]

  0%|          | 0/28 [00:00<?, ?ba/s]

  0%|          | 0/28 [00:00<?, ?ba/s]

loading configuration file https://huggingface.co/armheb/DNA_bert_6/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/2697389de18c4fe8c3497cea35aaf65130fdd59c3ab64cb6b1c2e0632fefdaf0.3a7e1ca237211e6405270f85616f49989aeee994db35f6593a40c7b5081a50d0
Model config BertConfig {
  "_name_or_path": "armheb/DNA_bert_6",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_ids": 0,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "num_rnn_layer": 1,
  "output_past": true,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "rnn": "lstm",
  "rnn_dropout": 0.0,
  "rnn_hidden": 768,
  "split": 10,
  "transformers_version": "4.20.1",
  "typ

Epoch,Training Loss,Validation Loss,Accuracy,F1
0,No log,0.5222,0.731865,0.739768
1,0.576500,0.524734,0.729994,0.703821
2,0.576500,0.510775,0.743235,0.740924
3,0.506400,0.51639,0.739206,0.727519


***** Running Evaluation *****
  Num examples = 6948
  Batch size = 32
***** Running Evaluation *****
  Num examples = 6948
  Batch size = 32
***** Running Evaluation *****
  Num examples = 6948
  Batch size = 32
***** Running Evaluation *****
  Num examples = 6948
  Batch size = 32


Training completed. Do not forget to share your model on huggingface.co/models =)


loading configuration file https://huggingface.co/armheb/DNA_bert_6/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/2697389de18c4fe8c3497cea35aaf65130fdd59c3ab64cb6b1c2e0632fefdaf0.3a7e1ca237211e6405270f85616f49989aeee994db35f6593a40c7b5081a50d0
Model config BertConfig {
  "_name_or_path": "armheb/DNA_bert_6",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_ids": 0,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3

Epoch,Training Loss,Validation Loss,Accuracy,F1
0,No log,0.517726,0.736183,0.739963
1,0.561100,0.512038,0.743955,0.746978
2,0.561100,0.526586,0.735463,0.710186
3,0.505500,0.515568,0.740069,0.731888


***** Running Evaluation *****
  Num examples = 6948
  Batch size = 32
***** Running Evaluation *****
  Num examples = 6948
  Batch size = 32
***** Running Evaluation *****
  Num examples = 6948
  Batch size = 32
***** Running Evaluation *****
  Num examples = 6948
  Batch size = 32


Training completed. Do not forget to share your model on huggingface.co/models =)


loading configuration file https://huggingface.co/armheb/DNA_bert_6/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/2697389de18c4fe8c3497cea35aaf65130fdd59c3ab64cb6b1c2e0632fefdaf0.3a7e1ca237211e6405270f85616f49989aeee994db35f6593a40c7b5081a50d0
Model config BertConfig {
  "_name_or_path": "armheb/DNA_bert_6",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_ids": 0,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3

Epoch,Training Loss,Validation Loss,Accuracy,F1
0,No log,0.533309,0.723518,0.699421
1,0.580900,0.517604,0.739062,0.732082
2,0.580900,0.514786,0.742372,0.734027
3,0.509500,0.519797,0.743092,0.727273


***** Running Evaluation *****
  Num examples = 6948
  Batch size = 32
***** Running Evaluation *****
  Num examples = 6948
  Batch size = 32
***** Running Evaluation *****
  Num examples = 6948
  Batch size = 32
***** Running Evaluation *****
  Num examples = 6948
  Batch size = 32


Training completed. Do not forget to share your model on huggingface.co/models =)


loading configuration file https://huggingface.co/armheb/DNA_bert_6/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/2697389de18c4fe8c3497cea35aaf65130fdd59c3ab64cb6b1c2e0632fefdaf0.3a7e1ca237211e6405270f85616f49989aeee994db35f6593a40c7b5081a50d0
Model config BertConfig {
  "_name_or_path": "armheb/DNA_bert_6",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_ids": 0,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3

Epoch,Training Loss,Validation Loss,Accuracy,F1
0,No log,0.519704,0.735895,0.740416
1,0.565900,0.520884,0.73791,0.716664
2,0.565900,0.514972,0.74122,0.729217
3,0.504700,0.512966,0.740933,0.733491


***** Running Evaluation *****
  Num examples = 6948
  Batch size = 32
***** Running Evaluation *****
  Num examples = 6948
  Batch size = 32
***** Running Evaluation *****
  Num examples = 6948
  Batch size = 32
***** Running Evaluation *****
  Num examples = 6948
  Batch size = 32


Training completed. Do not forget to share your model on huggingface.co/models =)


loading configuration file https://huggingface.co/armheb/DNA_bert_6/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/2697389de18c4fe8c3497cea35aaf65130fdd59c3ab64cb6b1c2e0632fefdaf0.3a7e1ca237211e6405270f85616f49989aeee994db35f6593a40c7b5081a50d0
Model config BertConfig {
  "_name_or_path": "armheb/DNA_bert_6",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_ids": 0,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3

Epoch,Training Loss,Validation Loss,Accuracy,F1
0,No log,0.530722,0.722942,0.697089
1,0.573600,0.522604,0.735032,0.713285
2,0.573600,0.508334,0.743955,0.746689
3,0.507500,0.51242,0.741796,0.731598


***** Running Evaluation *****
  Num examples = 6948
  Batch size = 32
***** Running Evaluation *****
  Num examples = 6948
  Batch size = 32
***** Running Evaluation *****
  Num examples = 6948
  Batch size = 32
***** Running Evaluation *****
  Num examples = 6948
  Batch size = 32


Training completed. Do not forget to share your model on huggingface.co/models =)




  0%|          | 0/36131 [00:00<?, ?ex/s]

  0%|          | 0/37 [00:00<?, ?ba/s]

  0%|          | 0/37 [00:00<?, ?ba/s]

loading configuration file https://huggingface.co/armheb/DNA_bert_6/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/2697389de18c4fe8c3497cea35aaf65130fdd59c3ab64cb6b1c2e0632fefdaf0.3a7e1ca237211e6405270f85616f49989aeee994db35f6593a40c7b5081a50d0
Model config BertConfig {
  "_name_or_path": "armheb/DNA_bert_6",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_ids": 0,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "num_rnn_layer": 1,
  "output_past": true,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "rnn": "lstm",
  "rnn_dropout": 0.0,
  "rnn_hidden": 768,
  "split": 10,
  "transformers_version": "4.20.1",
  "typ

Epoch,Training Loss,Validation Loss,Accuracy,F1
0,No log,0.355288,0.838278,0.832088
1,0.438700,0.31754,0.862077,0.863587
2,0.321200,0.321332,0.860527,0.858776
3,0.298900,0.322994,0.860859,0.858907


***** Running Evaluation *****
  Num examples = 9034
  Batch size = 32
***** Running Evaluation *****
  Num examples = 9034
  Batch size = 32
***** Running Evaluation *****
  Num examples = 9034
  Batch size = 32
***** Running Evaluation *****
  Num examples = 9034
  Batch size = 32


Training completed. Do not forget to share your model on huggingface.co/models =)


loading configuration file https://huggingface.co/armheb/DNA_bert_6/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/2697389de18c4fe8c3497cea35aaf65130fdd59c3ab64cb6b1c2e0632fefdaf0.3a7e1ca237211e6405270f85616f49989aeee994db35f6593a40c7b5081a50d0
Model config BertConfig {
  "_name_or_path": "armheb/DNA_bert_6",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_ids": 0,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3

Epoch,Training Loss,Validation Loss,Accuracy,F1
0,No log,0.337935,0.853332,0.856462
1,0.435900,0.317268,0.862187,0.863502
2,0.324400,0.319008,0.859088,0.858822
3,0.297800,0.319686,0.858977,0.858192


***** Running Evaluation *****
  Num examples = 9034
  Batch size = 32
***** Running Evaluation *****
  Num examples = 9034
  Batch size = 32
***** Running Evaluation *****
  Num examples = 9034
  Batch size = 32
***** Running Evaluation *****
  Num examples = 9034
  Batch size = 32


Training completed. Do not forget to share your model on huggingface.co/models =)


loading configuration file https://huggingface.co/armheb/DNA_bert_6/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/2697389de18c4fe8c3497cea35aaf65130fdd59c3ab64cb6b1c2e0632fefdaf0.3a7e1ca237211e6405270f85616f49989aeee994db35f6593a40c7b5081a50d0
Model config BertConfig {
  "_name_or_path": "armheb/DNA_bert_6",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_ids": 0,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3

Epoch,Training Loss,Validation Loss,Accuracy,F1
0,No log,0.342272,0.850122,0.849889
1,0.446300,0.318575,0.861302,0.862353
2,0.324300,0.318335,0.860638,0.860033
3,0.301200,0.318961,0.860638,0.859846


***** Running Evaluation *****
  Num examples = 9034
  Batch size = 32
***** Running Evaluation *****
  Num examples = 9034
  Batch size = 32
***** Running Evaluation *****
  Num examples = 9034
  Batch size = 32
***** Running Evaluation *****
  Num examples = 9034
  Batch size = 32


Training completed. Do not forget to share your model on huggingface.co/models =)


loading configuration file https://huggingface.co/armheb/DNA_bert_6/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/2697389de18c4fe8c3497cea35aaf65130fdd59c3ab64cb6b1c2e0632fefdaf0.3a7e1ca237211e6405270f85616f49989aeee994db35f6593a40c7b5081a50d0
Model config BertConfig {
  "_name_or_path": "armheb/DNA_bert_6",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_ids": 0,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3

Epoch,Training Loss,Validation Loss,Accuracy,F1
0,No log,0.352575,0.844809,0.841653
1,0.450100,0.327458,0.855103,0.852905
2,0.326200,0.33335,0.852225,0.848002
3,0.300300,0.323518,0.857649,0.855571


***** Running Evaluation *****
  Num examples = 9034
  Batch size = 32
***** Running Evaluation *****
  Num examples = 9034
  Batch size = 32
***** Running Evaluation *****
  Num examples = 9034
  Batch size = 32
***** Running Evaluation *****
  Num examples = 9034
  Batch size = 32


Training completed. Do not forget to share your model on huggingface.co/models =)


loading configuration file https://huggingface.co/armheb/DNA_bert_6/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/2697389de18c4fe8c3497cea35aaf65130fdd59c3ab64cb6b1c2e0632fefdaf0.3a7e1ca237211e6405270f85616f49989aeee994db35f6593a40c7b5081a50d0
Model config BertConfig {
  "_name_or_path": "armheb/DNA_bert_6",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_ids": 0,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3

Epoch,Training Loss,Validation Loss,Accuracy,F1
0,No log,0.339695,0.852668,0.854329
1,0.440200,0.317458,0.861523,0.865325
2,0.321300,0.324659,0.859531,0.857847
3,0.297400,0.318697,0.862187,0.862019


***** Running Evaluation *****
  Num examples = 9034
  Batch size = 32
***** Running Evaluation *****
  Num examples = 9034
  Batch size = 32
***** Running Evaluation *****
  Num examples = 9034
  Batch size = 32
***** Running Evaluation *****
  Num examples = 9034
  Batch size = 32


Training completed. Do not forget to share your model on huggingface.co/models =)




## Outputs

In [6]:
# One row per fine-tuning run, built from the tuples collected in `outputs`.
outputs_df = pd.DataFrame(
    outputs,
    columns=['dataset', 'accuracy', 'f1', 'train_runtime'],
)
outputs_df

Unnamed: 0,dataset,accuracy,f1,train_runtime
0,demo_coding_vs_intergenomic_seqs,0.92428,0.925008,1639.193
1,demo_coding_vs_intergenomic_seqs,0.92324,0.924572,1636.2917
2,demo_coding_vs_intergenomic_seqs,0.92452,0.926032,1631.4838
3,demo_coding_vs_intergenomic_seqs,0.92392,0.925639,1632.8869
4,demo_coding_vs_intergenomic_seqs,0.92528,0.926745,1628.0106
5,demo_human_or_worm,0.9658,0.965358,1630.5597
6,demo_human_or_worm,0.96696,0.966445,1635.4404
7,demo_human_or_worm,0.96528,0.964629,1637.4602
8,demo_human_or_worm,0.96692,0.966484,1634.9699
9,demo_human_or_worm,0.9674,0.967072,1633.4275


In [7]:
# Per-dataset mean and standard error of the mean for every reported quantity.
outputs_df.groupby('dataset').agg(
    {quantity: ['mean', 'sem'] for quantity in ('accuracy', 'f1', 'train_runtime')}
)

Unnamed: 0_level_0,accuracy,accuracy,f1,f1,train_runtime,train_runtime
Unnamed: 0_level_1,mean,sem,mean,sem,mean,sem
dataset,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
demo_coding_vs_intergenomic_seqs,0.924248,0.000336,0.925599,0.000381,1633.5732,1.933281
demo_human_or_worm,0.966472,0.000398,0.965998,0.00044,1634.37154,1.150253
human_enhancers_cohn,0.743092,0.000501,0.741807,0.002386,1182.24632,0.94224
human_nontata_promoters,0.86108,0.000874,0.862067,0.001692,704.41262,0.567971


In [8]:
# Write the per-run results table to OUTPUT_PATH as CSV, without the index column.
# The training loop already rewrites this file after every run; this cell saves
# the table once more at the end.
outputs_df.to_csv(OUTPUT_PATH, index=False)