In [1]:
!pip install -qq transformers genomic-benchmarks datasets

[K     |████████████████████████████████| 4.4 MB 14.6 MB/s 
[K     |████████████████████████████████| 362 kB 52.8 MB/s 
[K     |████████████████████████████████| 101 kB 14.2 MB/s 
[K     |████████████████████████████████| 6.6 MB 23.9 MB/s 
[K     |████████████████████████████████| 596 kB 45.2 MB/s 
[K     |████████████████████████████████| 2.3 MB 19.8 MB/s 
[K     |████████████████████████████████| 271 kB 27.0 MB/s 
[K     |████████████████████████████████| 140 kB 25.9 MB/s 
[K     |████████████████████████████████| 1.1 MB 53.1 MB/s 
[K     |████████████████████████████████| 212 kB 53.6 MB/s 
[K     |████████████████████████████████| 127 kB 58.8 MB/s 
[K     |████████████████████████████████| 144 kB 34.3 MB/s 
[K     |████████████████████████████████| 94 kB 2.7 MB/s 
[?25h  Building wheel for genomic-benchmarks (setup.py) ... [?25l[?25hdone
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is t

In [2]:
# Mount Google Drive at /content/drive (Google Colab only).
# NOTE(review): OUTPUT_PATH in the parameters cell is a relative
# 'drive/MyDrive/...' path, so it presumably relies on the working directory
# being /content — confirm before running outside Colab.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [10]:
### Parameters

# Checkpoint that is fine-tuned for sequence classification, and the tokenizer
# that is loaded and then extended with every K-mer over the DNA alphabet.
MODEL_NAME = "davidcechak/DNADebertaK8"
TOKENIZER_NAME = "armheb/DNA_bert_6"

# K-mer length, and the step between consecutive K-mers:
# STRIDE == 1 selects overlapping windows, any other value selects
# non-overlapping chunks of length K.
K = 8
STRIDE = 1

# if less than 1, only this fraction of each dataset is used
DATASET_THINING = 1

# Folder the downloaded benchmark datasets are read from.
BENCHMARKS_FOLDER = '/root/.genomic_benchmarks'
# BENCHMARKS_FOLDER = '/home/jovyan/.genomic_benchmarks/' (for INFRA HUB)

# (dataset name, dataset version) pairs, in the order they are processed.
DATASETS = [
    ('human_nontata_promoters', 0),
    ('human_enhancers_cohn', 0),
    ('demo_human_or_worm', 0),
    ('demo_coding_vs_intergenomic_seqs', 0),
]

# Per-device training batch size and number of gradient-accumulation steps.
BATCH_SIZE = 4
ACCUMULATION = 16

LEARNING_RATE = 1e-5
EPOCHS = 4
# Number of independent fine-tuning runs per dataset.
RUNS = 5

# do not forget to attach drive
OUTPUT_PATH = 'drive/MyDrive/genomic_benchmarks/DNADebertaK8_2.csv'

## Download benchmark datasets and tokenizer

In [11]:
from transformers import AutoTokenizer

# Load the pretrained tokenizer named by TOKENIZER_NAME; a later cell extends
# it with the K-mer vocabulary via `add_tokens`.
tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_NAME)

In [12]:
from itertools import product

# Enumerate every possible K-mer over the four nucleotides (4**K strings) and
# register them with the tokenizer. The cell's displayed value is whatever
# `add_tokens` returns.
alphabet = ('A', 'C', 'T', 'G')
vocab = [''.join(letters) for letters in product(alphabet, repeat=K)]

tokenizer.add_tokens(vocab)

65536

In [13]:
from genomic_benchmarks.loc2seq import download_dataset
from genomic_benchmarks.data_check.info import is_downloaded
from pathlib import Path
from tqdm.autonotebook import tqdm

# Root folder the per-dataset directories are later read from.
benchmark_root = Path(BENCHMARKS_FOLDER)

# Fetch every configured benchmark that is not reported as downloaded yet.
for dataset_name, dataset_version in tqdm(DATASETS):
    if is_downloaded(dataset_name):
        continue
    download_dataset(dataset_name, version=dataset_version, use_cloud_cache=True)

  0%|          | 0/4 [00:00<?, ?it/s]

In [14]:
def kmers_strideK(s, k=K):
    """Split `s` into consecutive, non-overlapping substrings of length `k`.

    Only complete k-mers are returned: trailing characters that do not fill a
    whole k-mer are dropped, and a sequence shorter than `k` yields [].
    """
    last_start = len(s) - k
    return [s[start:start + k] for start in range(0, last_start + 1, k)]

def kmers_stride1(s, k=K):
    """Return every length-`k` window of `s`, sliding one character at a time.

    A sequence of length n yields n - k + 1 overlapping k-mers (none if n < k).
    """
    n_windows = len(s) - k + 1
    return [s[start:start + k] for start in range(n_windows)]

# Choose the k-mer splitter once: STRIDE == 1 means overlapping windows,
# any other value falls back to non-overlapping chunks of length K.
kmers = kmers_stride1 if STRIDE == 1 else kmers_strideK

# function used for the actual tokenization
def tok_func(x):
    """Tokenize one example.

    The DNA string in x["seq"] is split into k-mers by `kmers`, the k-mers are
    joined with single spaces, and the resulting text is passed to `tokenizer`.
    Returns whatever the tokenizer call returns.

    NOTE(review): no truncation arguments are passed to the tokenizer —
    confirm that all sequences fit the model's maximum input length.
    """
    kmer_sentence = " ".join(kmers(x["seq"]))
    return tokenizer(kmer_sentence)

# example: tokenize a single 20-character sequence; the cell output shows the
# resulting encoding (input_ids, token_type_ids, attention_mask)
tok_func({'seq': 'ATGGAAAGAGGCACCATTCT'})

{'input_ids': [2, 16136, 52241, 65592, 53460, 4930, 7417, 17366, 57162, 19737, 66647, 57679, 21806, 9387, 3], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

## Looping through datasets, fine-tuning the model for each of them, logging metrics

In [15]:
import pandas as pd
import numpy as np
from random import random, randrange
from transformers import AutoModelForSequenceClassification
from transformers import TrainingArguments, Trainer
from datasets import Dataset, DatasetDict, load_metric

def compute_metrics_binary(eval_preds):
    """Score predictions for a two-class task with the "glue"/"mrpc" metric.

    `eval_preds` unpacks into (logits, labels). The predicted class is the
    argmax over the last logits axis. The metric's result is returned as-is;
    the fine-tuning loop reads the accuracy and F1 values logged from it.
    The metric is loaded on every call.
    """
    glue_mrpc = load_metric("glue", "mrpc")
    scores, references = eval_preds
    return glue_mrpc.compute(
        predictions=np.argmax(scores, axis=-1),
        references=references,
    )

def compute_metrics_multi(eval_preds):
    """Score predictions with the "accuracy" metric (used when there are not exactly two labels).

    `eval_preds` unpacks into (logits, labels). The predicted class is the
    argmax over the last logits axis. The metric's result is returned as-is.
    The metric is loaded on every call.
    """
    accuracy_metric = load_metric("accuracy")
    scores, references = eval_preds
    return accuracy_metric.compute(
        predictions=np.argmax(scores, axis=-1),
        references=references,
    )

# One (dataset, accuracy, f1, train_runtime) tuple per fine-tuning run.
outputs = []

# Fail fast: the results CSV is rewritten after every run, so a missing target
# directory (e.g. Google Drive not mounted) would otherwise only surface after
# the first complete fine-tuning run.
output_dir = Path(OUTPUT_PATH).parent
if not output_dir.is_dir():
    raise FileNotFoundError(
        f"Output directory '{output_dir}' does not exist; mount the drive or adjust OUTPUT_PATH."
    )

for dataset_name, dataset_version in tqdm(DATASETS):

    dataset_root = benchmark_root / dataset_name

    # Class labels are the sub-directories of train/. Stray files and hidden
    # directories (e.g. notebook checkpoints) are ignored so they cannot turn
    # into bogus classes. Sorting fixes the label -> integer id mapping.
    labels = sorted(
        p.name for p in (dataset_root / 'train').iterdir()
        if p.is_dir() and not p.name.startswith('.')
    )

    # Thinning is disabled when DATASET_THINING is falsy or >= 1; otherwise
    # each sample (in both splits) is kept with probability DATASET_THINING.
    keep_all = not DATASET_THINING or DATASET_THINING >= 1

    # Collect samples as a flat list of records. A list (rather than a dict
    # keyed by label and file stem) cannot silently drop a sample when a train
    # file and a test file share the same label and stem.
    records = []
    for split in ['train', 'test']:
        for nlabel, label in enumerate(labels):
            for f in (dataset_root / split / label).glob('*.txt'):
                if keep_all or random() < DATASET_THINING:
                    records.append((split, nlabel, f.read_text()))

    df = pd.DataFrame(records, columns=["dset", "cat", "seq"])

    # preserve_index=False: the default integer index carries no information,
    # so no extra index column is created in the Dataset.
    ds = Dataset.from_pandas(df, preserve_index=False)

    tok_ds = ds.map(tok_func, batched=False, remove_columns=['seq'])
    tok_ds = tok_ds.rename_columns({'cat': 'labels'})

    dds = DatasetDict({
        'train': tok_ds.filter(lambda x: x["dset"] == "train").remove_columns('dset'),
        'test':  tok_ds.filter(lambda x: x["dset"] == "test").remove_columns('dset')
    })

    # Two classes -> accuracy and F1; otherwise accuracy only.
    compute_metrics = compute_metrics_binary if len(labels) == 2 else compute_metrics_multi

    for _ in range(RUNS):

        # Every run starts from the pretrained checkpoint again.
        model_cls = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=len(labels))

        # Each run draws its own random seed; the seed is not stored in the results.
        args = TrainingArguments('outputs', learning_rate=LEARNING_RATE, warmup_ratio=0.1, lr_scheduler_type='cosine', fp16=True,
            evaluation_strategy="epoch", per_device_train_batch_size=BATCH_SIZE, per_device_eval_batch_size=BATCH_SIZE*2, gradient_accumulation_steps=ACCUMULATION,
            num_train_epochs=EPOCHS, weight_decay=0.01, save_steps=100000, seed=randrange(1,10001), report_to='none')

        trainer = Trainer(model_cls, args, train_dataset=dds['train'], eval_dataset=dds['test'],
                          tokenizer=tokenizer, compute_metrics=compute_metrics)
        trainer.train()

        # NOTE(review): accuracy and F1 are each the best value over all
        # per-epoch evaluations on the *test* split, taken independently, so
        # they may come from different epochs and are selected on test data.
        # Confirm this is the intended reporting protocol.
        log_history = trainer.state.log_history
        max_accuracy = max([x['eval_accuracy'] for x in log_history if 'eval_accuracy' in x])
        max_f1 = max([x['eval_f1'] for x in log_history if 'eval_f1' in x]) if len(labels) == 2 else np.nan
        train_runtime = max([x['train_runtime'] for x in log_history if 'train_runtime' in x])

        outputs.append((dataset_name, max_accuracy, max_f1, train_runtime))

        # Rewrite the CSV after every run so finished results survive an
        # interrupted session.
        outputs_df = pd.DataFrame(outputs, columns = ['dataset', 'accuracy', 'f1', 'train_runtime'])
        outputs_df.to_csv(OUTPUT_PATH, index=False)




  0%|          | 0/4 [00:00<?, ?it/s]



  0%|          | 0/36131 [00:00<?, ?ex/s]

  0%|          | 0/37 [00:00<?, ?ba/s]

  0%|          | 0/37 [00:00<?, ?ba/s]

Downloading:   0%|          | 0.00/705 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/370M [00:00<?, ?B/s]

Some weights of the model checkpoint at davidcechak/DNADebertaK8 were not used when initializing DebertaForSequenceClassification: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing DebertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DebertaForSequenceClassification were not initialized from the model checkpoint at davidcec

Epoch,Training Loss,Validation Loss,Accuracy,F1
0,No log,0.319913,0.872814,0.873972
1,0.404700,0.231313,0.91698,0.922601
2,0.194400,0.23495,0.92816,0.931778
3,0.112600,0.23374,0.929046,0.932718


***** Running Evaluation *****
  Num examples = 9034
  Batch size = 8


Downloading builder script:   0%|          | 0.00/1.84k [00:00<?, ?B/s]

***** Running Evaluation *****
  Num examples = 9034
  Batch size = 8
***** Running Evaluation *****
  Num examples = 9034
  Batch size = 8
***** Running Evaluation *****
  Num examples = 9034
  Batch size = 8


Training completed. Do not forget to share your model on huggingface.co/models =)


loading configuration file https://huggingface.co/davidcechak/DNADebertaK8/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/808901853890ddb7e1ac8c4ecf5112272702596ee51b69cdf389c63da11fe2e9.b628e4d5e7f344695b21fd7ca542aa0f8188268bc6c6b3380dc1e19a16c2447e
Model config DebertaConfig {
  "_name_or_path": "davidcechak/DNADebertaK8",
  "architectures": [
    "DebertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-07,
  "max_position_embeddings": 512,
  "max_relative_positions": -1,
  "model_type": "deberta",
  

Epoch,Training Loss,Validation Loss,Accuracy,F1
0,No log,0.316865,0.87547,0.882063
1,0.409900,0.239474,0.913106,0.9158
2,0.202100,0.228178,0.927496,0.931478
3,0.116100,0.232298,0.928382,0.932485


***** Running Evaluation *****
  Num examples = 9034
  Batch size = 8
***** Running Evaluation *****
  Num examples = 9034
  Batch size = 8
***** Running Evaluation *****
  Num examples = 9034
  Batch size = 8
***** Running Evaluation *****
  Num examples = 9034
  Batch size = 8


Training completed. Do not forget to share your model on huggingface.co/models =)


loading configuration file https://huggingface.co/davidcechak/DNADebertaK8/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/808901853890ddb7e1ac8c4ecf5112272702596ee51b69cdf389c63da11fe2e9.b628e4d5e7f344695b21fd7ca542aa0f8188268bc6c6b3380dc1e19a16c2447e
Model config DebertaConfig {
  "_name_or_path": "davidcechak/DNADebertaK8",
  "architectures": [
    "DebertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-07,
  "max_position_embedding

Epoch,Training Loss,Validation Loss,Accuracy,F1
0,No log,0.297493,0.880784,0.88719
1,0.404600,0.232837,0.915431,0.918637
2,0.190300,0.215014,0.928492,0.933512
3,0.108300,0.221875,0.929931,0.935163


***** Running Evaluation *****
  Num examples = 9034
  Batch size = 8
***** Running Evaluation *****
  Num examples = 9034
  Batch size = 8
***** Running Evaluation *****
  Num examples = 9034
  Batch size = 8
***** Running Evaluation *****
  Num examples = 9034
  Batch size = 8


Training completed. Do not forget to share your model on huggingface.co/models =)


loading configuration file https://huggingface.co/davidcechak/DNADebertaK8/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/808901853890ddb7e1ac8c4ecf5112272702596ee51b69cdf389c63da11fe2e9.b628e4d5e7f344695b21fd7ca542aa0f8188268bc6c6b3380dc1e19a16c2447e
Model config DebertaConfig {
  "_name_or_path": "davidcechak/DNADebertaK8",
  "architectures": [
    "DebertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-07,
  "max_position_embedding

Epoch,Training Loss,Validation Loss,Accuracy,F1
0,No log,0.311293,0.873257,0.884144
1,0.404300,0.317137,0.871485,0.889544
2,0.196300,0.21875,0.92971,0.933598
3,0.112300,0.225277,0.930928,0.934673


***** Running Evaluation *****
  Num examples = 9034
  Batch size = 8
***** Running Evaluation *****
  Num examples = 9034
  Batch size = 8
***** Running Evaluation *****
  Num examples = 9034
  Batch size = 8
***** Running Evaluation *****
  Num examples = 9034
  Batch size = 8


Training completed. Do not forget to share your model on huggingface.co/models =)


loading configuration file https://huggingface.co/davidcechak/DNADebertaK8/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/808901853890ddb7e1ac8c4ecf5112272702596ee51b69cdf389c63da11fe2e9.b628e4d5e7f344695b21fd7ca542aa0f8188268bc6c6b3380dc1e19a16c2447e
Model config DebertaConfig {
  "_name_or_path": "davidcechak/DNADebertaK8",
  "architectures": [
    "DebertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-07,
  "max_position_embedding

Epoch,Training Loss,Validation Loss,Accuracy,F1
0,No log,0.309294,0.873589,0.880544
1,0.408000,0.238496,0.909785,0.914811
2,0.204300,0.237601,0.921629,0.926678
3,0.114700,0.240335,0.922958,0.927965


***** Running Evaluation *****
  Num examples = 9034
  Batch size = 8
***** Running Evaluation *****
  Num examples = 9034
  Batch size = 8
***** Running Evaluation *****
  Num examples = 9034
  Batch size = 8
***** Running Evaluation *****
  Num examples = 9034
  Batch size = 8


Training completed. Do not forget to share your model on huggingface.co/models =)




  0%|          | 0/27791 [00:00<?, ?ex/s]

  0%|          | 0/28 [00:00<?, ?ba/s]

  0%|          | 0/28 [00:00<?, ?ba/s]

loading configuration file https://huggingface.co/davidcechak/DNADebertaK8/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/808901853890ddb7e1ac8c4ecf5112272702596ee51b69cdf389c63da11fe2e9.b628e4d5e7f344695b21fd7ca542aa0f8188268bc6c6b3380dc1e19a16c2447e
Model config DebertaConfig {
  "_name_or_path": "davidcechak/DNADebertaK8",
  "architectures": [
    "DebertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-07,
  "max_position_embeddings": 512,
  "max_relative_positions": -1,
  "model_type": "deberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 6,
  "pad_token_id": 0,
  "pooler_dropout": 0,
  "pooler_hidden_act": "gelu",
  "pooler_hidden_size": 768,
  "pos_att_type": null,
  "position_biased_input": true,
  "relative_attention": false,
  "torch_dtype": "float32",
  "transformers_version

Epoch,Training Loss,Validation Loss,Accuracy,F1
0,No log,0.530788,0.727691,0.743144
1,0.550900,0.534319,0.738342,0.749587
2,0.550900,0.558302,0.729562,0.742567
3,0.446200,0.588095,0.725245,0.744957


***** Running Evaluation *****
  Num examples = 6948
  Batch size = 8
***** Running Evaluation *****
  Num examples = 6948
  Batch size = 8
***** Running Evaluation *****
  Num examples = 6948
  Batch size = 8
***** Running Evaluation *****
  Num examples = 6948
  Batch size = 8


Training completed. Do not forget to share your model on huggingface.co/models =)


loading configuration file https://huggingface.co/davidcechak/DNADebertaK8/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/808901853890ddb7e1ac8c4ecf5112272702596ee51b69cdf389c63da11fe2e9.b628e4d5e7f344695b21fd7ca542aa0f8188268bc6c6b3380dc1e19a16c2447e
Model config DebertaConfig {
  "_name_or_path": "davidcechak/DNADebertaK8",
  "architectures": [
    "DebertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-07,
  "max_position_embedding

Epoch,Training Loss,Validation Loss,Accuracy,F1
0,No log,0.530306,0.731577,0.749698
1,0.553700,0.531541,0.731002,0.747535
2,0.553700,0.559587,0.726108,0.743635
3,0.453100,0.580924,0.724381,0.741809


***** Running Evaluation *****
  Num examples = 6948
  Batch size = 8
***** Running Evaluation *****
  Num examples = 6948
  Batch size = 8
***** Running Evaluation *****
  Num examples = 6948
  Batch size = 8
***** Running Evaluation *****
  Num examples = 6948
  Batch size = 8


Training completed. Do not forget to share your model on huggingface.co/models =)


loading configuration file https://huggingface.co/davidcechak/DNADebertaK8/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/808901853890ddb7e1ac8c4ecf5112272702596ee51b69cdf389c63da11fe2e9.b628e4d5e7f344695b21fd7ca542aa0f8188268bc6c6b3380dc1e19a16c2447e
Model config DebertaConfig {
  "_name_or_path": "davidcechak/DNADebertaK8",
  "architectures": [
    "DebertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-07,
  "max_position_embedding

Epoch,Training Loss,Validation Loss,Accuracy,F1
0,No log,0.532937,0.725964,0.713598
1,0.553700,0.534147,0.731434,0.751067
2,0.553700,0.560948,0.728699,0.744337
3,0.454100,0.583936,0.724957,0.744621


***** Running Evaluation *****
  Num examples = 6948
  Batch size = 8
***** Running Evaluation *****
  Num examples = 6948
  Batch size = 8
***** Running Evaluation *****
  Num examples = 6948
  Batch size = 8
***** Running Evaluation *****
  Num examples = 6948
  Batch size = 8


Training completed. Do not forget to share your model on huggingface.co/models =)


loading configuration file https://huggingface.co/davidcechak/DNADebertaK8/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/808901853890ddb7e1ac8c4ecf5112272702596ee51b69cdf389c63da11fe2e9.b628e4d5e7f344695b21fd7ca542aa0f8188268bc6c6b3380dc1e19a16c2447e
Model config DebertaConfig {
  "_name_or_path": "davidcechak/DNADebertaK8",
  "architectures": [
    "DebertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-07,
  "max_position_embedding

Epoch,Training Loss,Validation Loss,Accuracy,F1
0,No log,0.529768,0.729706,0.746422
1,0.553500,0.530761,0.738774,0.748441
2,0.553500,0.565035,0.732729,0.738414
3,0.447800,0.584874,0.72726,0.740731


***** Running Evaluation *****
  Num examples = 6948
  Batch size = 8
***** Running Evaluation *****
  Num examples = 6948
  Batch size = 8
***** Running Evaluation *****
  Num examples = 6948
  Batch size = 8
***** Running Evaluation *****
  Num examples = 6948
  Batch size = 8


Training completed. Do not forget to share your model on huggingface.co/models =)


loading configuration file https://huggingface.co/davidcechak/DNADebertaK8/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/808901853890ddb7e1ac8c4ecf5112272702596ee51b69cdf389c63da11fe2e9.b628e4d5e7f344695b21fd7ca542aa0f8188268bc6c6b3380dc1e19a16c2447e
Model config DebertaConfig {
  "_name_or_path": "davidcechak/DNADebertaK8",
  "architectures": [
    "DebertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-07,
  "max_position_embedding

Epoch,Training Loss,Validation Loss,Accuracy,F1
0,No log,0.527779,0.733736,0.731026
1,0.554400,0.532001,0.736903,0.747723
2,0.554400,0.553132,0.731577,0.738978
3,0.454900,0.579599,0.728699,0.743293


***** Running Evaluation *****
  Num examples = 6948
  Batch size = 8
***** Running Evaluation *****
  Num examples = 6948
  Batch size = 8
***** Running Evaluation *****
  Num examples = 6948
  Batch size = 8
***** Running Evaluation *****
  Num examples = 6948
  Batch size = 8


Training completed. Do not forget to share your model on huggingface.co/models =)




  0%|          | 0/100000 [00:00<?, ?ex/s]

  0%|          | 0/100 [00:00<?, ?ba/s]

  0%|          | 0/100 [00:00<?, ?ba/s]

loading configuration file https://huggingface.co/davidcechak/DNADebertaK8/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/808901853890ddb7e1ac8c4ecf5112272702596ee51b69cdf389c63da11fe2e9.b628e4d5e7f344695b21fd7ca542aa0f8188268bc6c6b3380dc1e19a16c2447e
Model config DebertaConfig {
  "_name_or_path": "davidcechak/DNADebertaK8",
  "architectures": [
    "DebertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-07,
  "max_position_embeddings": 512,
  "max_relative_positions": -1,
  "model_type": "deberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 6,
  "pad_token_id": 0,
  "pooler_dropout": 0,
  "pooler_hidden_act": "gelu",
  "pooler_hidden_size": 768,
  "pos_att_type": null,
  "position_biased_input": true,
  "relative_attention": false,
  "torch_dtype": "float32",
  "transformers_version

Epoch,Training Loss,Validation Loss,Accuracy,F1
0,0.1342,0.115579,0.95868,0.959281
1,0.0786,0.100322,0.96392,0.963271
2,0.0423,0.129272,0.96388,0.963252
3,0.0264,0.129825,0.966,0.965715


***** Running Evaluation *****
  Num examples = 25000
  Batch size = 8
***** Running Evaluation *****
  Num examples = 25000
  Batch size = 8
***** Running Evaluation *****
  Num examples = 25000
  Batch size = 8
***** Running Evaluation *****
  Num examples = 25000
  Batch size = 8


Training completed. Do not forget to share your model on huggingface.co/models =)


loading configuration file https://huggingface.co/davidcechak/DNADebertaK8/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/808901853890ddb7e1ac8c4ecf5112272702596ee51b69cdf389c63da11fe2e9.b628e4d5e7f344695b21fd7ca542aa0f8188268bc6c6b3380dc1e19a16c2447e
Model config DebertaConfig {
  "_name_or_path": "davidcechak/DNADebertaK8",
  "architectures": [
    "DebertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-07,
  "max_position_embed

Epoch,Training Loss,Validation Loss,Accuracy,F1
0,0.1378,0.130939,0.9498,0.947858
1,0.0763,0.098302,0.96452,0.964898
2,0.0417,0.112762,0.96728,0.967077
3,0.0279,0.131051,0.9676,0.967436


***** Running Evaluation *****
  Num examples = 25000
  Batch size = 8
***** Running Evaluation *****
  Num examples = 25000
  Batch size = 8
***** Running Evaluation *****
  Num examples = 25000
  Batch size = 8
***** Running Evaluation *****
  Num examples = 25000
  Batch size = 8


Training completed. Do not forget to share your model on huggingface.co/models =)


loading configuration file https://huggingface.co/davidcechak/DNADebertaK8/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/808901853890ddb7e1ac8c4ecf5112272702596ee51b69cdf389c63da11fe2e9.b628e4d5e7f344695b21fd7ca542aa0f8188268bc6c6b3380dc1e19a16c2447e
Model config DebertaConfig {
  "_name_or_path": "davidcechak/DNADebertaK8",
  "architectures": [
    "DebertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-07,
  "max_position_embed

Epoch,Training Loss,Validation Loss,Accuracy,F1
0,0.1359,0.106936,0.9592,0.95853
1,0.082,0.10054,0.9658,0.965829
2,0.0406,0.119423,0.9674,0.967362
3,0.0268,0.137877,0.96628,0.966056


***** Running Evaluation *****
  Num examples = 25000
  Batch size = 8
***** Running Evaluation *****
  Num examples = 25000
  Batch size = 8
***** Running Evaluation *****
  Num examples = 25000
  Batch size = 8
***** Running Evaluation *****
  Num examples = 25000
  Batch size = 8


Training completed. Do not forget to share your model on huggingface.co/models =)


loading configuration file https://huggingface.co/davidcechak/DNADebertaK8/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/808901853890ddb7e1ac8c4ecf5112272702596ee51b69cdf389c63da11fe2e9.b628e4d5e7f344695b21fd7ca542aa0f8188268bc6c6b3380dc1e19a16c2447e
Model config DebertaConfig {
  "_name_or_path": "davidcechak/DNADebertaK8",
  "architectures": [
    "DebertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-07,
  "max_position_embed

Epoch,Training Loss,Validation Loss,Accuracy,F1
0,0.1328,0.103536,0.9628,0.962512
1,0.0775,0.098954,0.96624,0.965957
2,0.0403,0.120476,0.9658,0.965464
3,0.0277,0.133446,0.96672,0.966546


***** Running Evaluation *****
  Num examples = 25000
  Batch size = 8
***** Running Evaluation *****
  Num examples = 25000
  Batch size = 8
***** Running Evaluation *****
  Num examples = 25000
  Batch size = 8
***** Running Evaluation *****
  Num examples = 25000
  Batch size = 8


Training completed. Do not forget to share your model on huggingface.co/models =)


loading configuration file https://huggingface.co/davidcechak/DNADebertaK8/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/808901853890ddb7e1ac8c4ecf5112272702596ee51b69cdf389c63da11fe2e9.b628e4d5e7f344695b21fd7ca542aa0f8188268bc6c6b3380dc1e19a16c2447e
Model config DebertaConfig {
  "_name_or_path": "davidcechak/DNADebertaK8",
  "architectures": [
    "DebertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-07,
  "max_position_embed

Epoch,Training Loss,Validation Loss,Accuracy,F1
0,0.1341,0.103953,0.96024,0.960379
1,0.0798,0.102014,0.9642,0.963849
2,0.0406,0.11803,0.96664,0.966484
3,0.027,0.135977,0.966,0.965775


***** Running Evaluation *****
  Num examples = 25000
  Batch size = 8
***** Running Evaluation *****
  Num examples = 25000
  Batch size = 8
***** Running Evaluation *****
  Num examples = 25000
  Batch size = 8
***** Running Evaluation *****
  Num examples = 25000
  Batch size = 8


Training completed. Do not forget to share your model on huggingface.co/models =)




  0%|          | 0/100000 [00:00<?, ?ex/s]

  0%|          | 0/100 [00:00<?, ?ba/s]

  0%|          | 0/100 [00:00<?, ?ba/s]

loading configuration file https://huggingface.co/davidcechak/DNADebertaK8/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/808901853890ddb7e1ac8c4ecf5112272702596ee51b69cdf389c63da11fe2e9.b628e4d5e7f344695b21fd7ca542aa0f8188268bc6c6b3380dc1e19a16c2447e
Model config DebertaConfig {
  "_name_or_path": "davidcechak/DNADebertaK8",
  "architectures": [
    "DebertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-07,
  "max_position_embeddings": 512,
  "max_relative_positions": -1,
  "model_type": "deberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 6,
  "pad_token_id": 0,
  "pooler_dropout": 0,
  "pooler_hidden_act": "gelu",
  "pooler_hidden_size": 768,
  "pos_att_type": null,
  "position_biased_input": true,
  "relative_attention": false,
  "torch_dtype": "float32",
  "transformers_version

Epoch,Training Loss,Validation Loss,Accuracy,F1
0,0.2438,0.220389,0.91172,0.909233
1,0.1811,0.217493,0.91956,0.919006
2,0.1291,0.241395,0.91824,0.917334
3,0.0982,0.26674,0.91812,0.917827


***** Running Evaluation *****
  Num examples = 25000
  Batch size = 8
***** Running Evaluation *****
  Num examples = 25000
  Batch size = 8
***** Running Evaluation *****
  Num examples = 25000
  Batch size = 8
***** Running Evaluation *****
  Num examples = 25000
  Batch size = 8


Training completed. Do not forget to share your model on huggingface.co/models =)


loading configuration file https://huggingface.co/davidcechak/DNADebertaK8/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/808901853890ddb7e1ac8c4ecf5112272702596ee51b69cdf389c63da11fe2e9.b628e4d5e7f344695b21fd7ca542aa0f8188268bc6c6b3380dc1e19a16c2447e
Model config DebertaConfig {
  "_name_or_path": "davidcechak/DNADebertaK8",
  "architectures": [
    "DebertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-07,
  "max_position_embed

Epoch,Training Loss,Validation Loss,Accuracy,F1
0,0.2398,0.219042,0.9152,0.916066
1,0.1813,0.207685,0.92156,0.922093
2,0.1272,0.22986,0.92092,0.921445
3,0.1,0.260327,0.91948,0.919612


***** Running Evaluation *****
  Num examples = 25000
  Batch size = 8
***** Running Evaluation *****
  Num examples = 25000
  Batch size = 8
***** Running Evaluation *****
  Num examples = 25000
  Batch size = 8
***** Running Evaluation *****
  Num examples = 25000
  Batch size = 8


Training completed. Do not forget to share your model on huggingface.co/models =)


loading configuration file https://huggingface.co/davidcechak/DNADebertaK8/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/808901853890ddb7e1ac8c4ecf5112272702596ee51b69cdf389c63da11fe2e9.b628e4d5e7f344695b21fd7ca542aa0f8188268bc6c6b3380dc1e19a16c2447e
Model config DebertaConfig {
  "_name_or_path": "davidcechak/DNADebertaK8",
  "architectures": [
    "DebertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-07,
  "max_position_embed

Epoch,Training Loss,Validation Loss,Accuracy,F1
0,0.2429,0.222912,0.91212,0.910418
1,0.1809,0.212112,0.91748,0.919203
2,0.1267,0.239469,0.9192,0.919129
3,0.1011,0.260249,0.91884,0.918772


***** Running Evaluation *****
  Num examples = 25000
  Batch size = 8
***** Running Evaluation *****
  Num examples = 25000
  Batch size = 8
***** Running Evaluation *****
  Num examples = 25000
  Batch size = 8
***** Running Evaluation *****
  Num examples = 25000
  Batch size = 8


Training completed. Do not forget to share your model on huggingface.co/models =)


loading configuration file https://huggingface.co/davidcechak/DNADebertaK8/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/808901853890ddb7e1ac8c4ecf5112272702596ee51b69cdf389c63da11fe2e9.b628e4d5e7f344695b21fd7ca542aa0f8188268bc6c6b3380dc1e19a16c2447e
Model config DebertaConfig {
  "_name_or_path": "davidcechak/DNADebertaK8",
  "architectures": [
    "DebertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-07,
  "max_position_embed

Epoch,Training Loss,Validation Loss,Accuracy,F1
0,0.2439,0.219314,0.91092,0.912319
1,0.1788,0.218424,0.91652,0.914422
2,0.1248,0.2388,0.918,0.918196
3,0.1036,0.262418,0.91764,0.917906


***** Running Evaluation *****
  Num examples = 25000
  Batch size = 8
***** Running Evaluation *****
  Num examples = 25000
  Batch size = 8
***** Running Evaluation *****
  Num examples = 25000
  Batch size = 8
***** Running Evaluation *****
  Num examples = 25000
  Batch size = 8


Training completed. Do not forget to share your model on huggingface.co/models =)


loading configuration file https://huggingface.co/davidcechak/DNADebertaK8/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/808901853890ddb7e1ac8c4ecf5112272702596ee51b69cdf389c63da11fe2e9.b628e4d5e7f344695b21fd7ca542aa0f8188268bc6c6b3380dc1e19a16c2447e
Model config DebertaConfig {
  "_name_or_path": "davidcechak/DNADebertaK8",
  "architectures": [
    "DebertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-07,
  "max_position_embed

Epoch,Training Loss,Validation Loss,Accuracy,F1
0,0.2496,0.215083,0.91528,0.91526
1,0.1837,0.223867,0.91744,0.915486
2,0.1281,0.235714,0.91844,0.919455
3,0.1009,0.260707,0.91788,0.917857


***** Running Evaluation *****
  Num examples = 25000
  Batch size = 8
***** Running Evaluation *****
  Num examples = 25000
  Batch size = 8
***** Running Evaluation *****
  Num examples = 25000
  Batch size = 8
***** Running Evaluation *****
  Num examples = 25000
  Batch size = 8


Training completed. Do not forget to share your model on huggingface.co/models =)




## Outputs

In [16]:
# Column order mirrors the tuples appended to `outputs` by the fine-tuning loop.
result_columns = ['dataset', 'accuracy', 'f1', 'train_runtime']
outputs_df = pd.DataFrame(data=outputs, columns=result_columns)
outputs_df

Unnamed: 0,dataset,accuracy,f1,train_runtime
0,human_nontata_promoters,0.929046,0.932718,1092.8379
1,human_nontata_promoters,0.928382,0.932485,1064.4555
2,human_nontata_promoters,0.929931,0.935163,1056.332
3,human_nontata_promoters,0.930928,0.934673,1069.6667
4,human_nontata_promoters,0.922958,0.927965,1082.1928
5,human_enhancers_cohn,0.738342,0.749587,1096.2365
6,human_enhancers_cohn,0.731577,0.749698,1082.7595
7,human_enhancers_cohn,0.731434,0.751067,1080.2603
8,human_enhancers_cohn,0.738774,0.748441,1077.6469
9,human_enhancers_cohn,0.736903,0.747723,1089.8594


In [17]:
# Per-dataset mean and standard error of the mean over the repeated runs.
summary_stats = ['mean', 'sem']
outputs_df.groupby('dataset').agg(
    {metric: summary_stats for metric in ('accuracy', 'f1', 'train_runtime')}
)

Unnamed: 0_level_0,accuracy,accuracy,f1,f1,train_runtime,train_runtime
Unnamed: 0_level_1,mean,sem,mean,sem,mean,sem
dataset,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
demo_coding_vs_intergenomic_seqs,0.919352,0.000616,0.919591,0.00066,2944.7175,24.436089
demo_human_or_worm,0.966872,0.000287,0.966709,0.000318,2974.60698,7.428487
human_enhancers_cohn,0.735406,0.001622,0.749303,0.000574,1085.35252,3.397087
human_nontata_promoters,0.928249,0.00139,0.932601,0.001272,1073.09698,6.480526


In [18]:
# saving outputs to csv file
# (the fine-tuning loop already writes this file after every run; this final
# write stores the frame rebuilt from `outputs` in the cell above)
outputs_df.to_csv(OUTPUT_PATH, index=False)