In [1]:
# Install the third-party libraries imported by the cells below
# (transformers, genomic_benchmarks, datasets).
# NOTE(review): versions are unpinned; the model-config log in this notebook's
# output reports transformers 4.20.1 — consider pinning to keep the run reproducible.
!pip install -qq transformers genomic-benchmarks datasets

[K     |████████████████████████████████| 4.4 MB 14.7 MB/s 
[K     |████████████████████████████████| 362 kB 53.6 MB/s 
[K     |████████████████████████████████| 596 kB 58.1 MB/s 
[K     |████████████████████████████████| 6.6 MB 21.2 MB/s 
[K     |████████████████████████████████| 101 kB 7.0 MB/s 
[K     |████████████████████████████████| 2.3 MB 62.7 MB/s 
[K     |████████████████████████████████| 271 kB 75.5 MB/s 
[K     |████████████████████████████████| 140 kB 72.7 MB/s 
[K     |████████████████████████████████| 1.1 MB 64.4 MB/s 
[K     |████████████████████████████████| 212 kB 75.8 MB/s 
[K     |████████████████████████████████| 127 kB 73.5 MB/s 
[K     |████████████████████████████████| 94 kB 4.2 MB/s 
[K     |████████████████████████████████| 144 kB 55.0 MB/s 
[?25h  Building wheel for genomic-benchmarks (setup.py) ... [?25l[?25hdone
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is th

In [2]:
# Mount Google Drive at /content/drive (Google Colab only).
# OUTPUT_PATH in the parameters cell points below drive/MyDrive, so the results
# CSV is presumably meant to land on the mounted drive — run this cell before training.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [1]:
### Parameters

# Checkpoint that is loaded with a classification head and fine-tuned in the main loop.
MODEL_NAME = "Vlasta/DNADebertaK7"
# Base tokenizer; a later cell extends it with every possible K-mer before it is used.
TOKENIZER_NAME = "armheb/DNA_bert_6"
# Length of the k-mers used both for the added vocabulary and for splitting sequences.
K = 7
# 1 selects overlapping k-mers (step 1); any other value selects non-overlapping k-mers (step K).
STRIDE = 1

# Probability with which each sequence file is kept;
# 1 (or any falsy value) keeps every file of every dataset.
DATASET_THINING = 1

# Folder the benchmark datasets are read from.
BENCHMARKS_FOLDER = '/root/.genomic_benchmarks'
# On INFRA HUB use instead: BENCHMARKS_FOLDER = '/home/jovyan/.genomic_benchmarks/'

# (dataset name, dataset version) pairs, listed in the order in which they are processed.
DATASETS = [
    ('human_nontata_promoters', 0),
    ('human_enhancers_cohn', 0),
    ('demo_human_or_worm', 0),
    ('demo_coding_vs_intergenomic_seqs', 0),
]

# Per-device training batch size and number of gradient-accumulation steps
# (8 * 8 = 64 sequences per optimizer step and device).
BATCH_SIZE = 8
ACCUMULATION = 8

LEARNING_RATE = 1e-5
EPOCHS = 4
# Number of fine-tuning runs per dataset; every run draws its own random seed.
RUNS = 5

# Results CSV — do not forget to attach drive before training.
OUTPUT_PATH = 'drive/MyDrive/genomic_benchmarks/DNADebertaK7_2.csv'

## Download benchmark datasets and tokenizer

In [2]:
from transformers import AutoTokenizer

# Load the pretrained tokenizer identified by TOKENIZER_NAME.
# Later cells add the K-mer vocabulary to it, call it from tok_func, and hand it to the Trainer.
tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_NAME)

In [3]:
from itertools import product

# Enumerate every string of length K over the four nucleotides (4**K k-mers).
alphabet = ('A', 'C', 'T', 'G')
vocab = [''.join(letters) for letters in product(alphabet, repeat=K)]

# Register the k-mers as additional tokenizer tokens; the value returned by
# add_tokens is displayed as the cell output.
# NOTE(review): the fine-tuned checkpoint has to provide embeddings for these
# added token ids — confirm it was pretrained with the same tokenizer extension.
tokenizer.add_tokens(vocab)

16384

In [4]:
from genomic_benchmarks.loc2seq import download_dataset
from genomic_benchmarks.data_check.info import is_downloaded
from pathlib import Path
from tqdm.autonotebook import tqdm

# Download every configured benchmark that is_downloaded() does not report as present.
# NOTE(review): download_dataset is called without an explicit destination, so
# BENCHMARKS_FOLDER must match the library's default download location — confirm.
for dataset_name, dataset_version in tqdm(DATASETS):
    if is_downloaded(dataset_name):
        continue
    download_dataset(dataset_name, version=dataset_version, use_cloud_cache=True)

# Root folder the training loop reads the <dataset>/<split>/<label>/*.txt files from.
benchmark_root = Path(BENCHMARKS_FOLDER)

  0%|          | 0/4 [00:00<?, ?it/s]

In [5]:
def kmers_strideK(s, k=K):
    """Split `s` into consecutive, non-overlapping windows of length `k`.

    Windows start at positions 0, k, 2k, ...; a trailing remainder shorter
    than `k` is dropped, so an input shorter than `k` yields an empty list.
    """
    last_start = len(s) - k
    return [s[start:start + k] for start in range(0, last_start + 1, k)]

def kmers_stride1(s, k=K):
    """Return every overlapping window of length `k` in `s`, moving one position at a time.

    Produces len(s) - k + 1 windows, or an empty list when `s` is shorter than `k`.
    """
    window_count = len(s) - k + 1
    return [s[pos:pos + k] for pos in range(window_count)]

# Choose the k-mer splitter once: STRIDE == 1 selects the overlapping variant,
# every other STRIDE value selects the non-overlapping (step K) variant.
kmers = kmers_stride1 if STRIDE == 1 else kmers_strideK

# function used for the actual tokenization
def tok_func(x):
    """Tokenize one example.

    `x` is a mapping with a "seq" entry; the sequence is split into k-mers by
    `kmers`, the k-mers are joined with single spaces, and the resulting string
    is passed to `tokenizer`, whose output is returned unchanged.
    """
    kmer_sentence = " ".join(kmers(x["seq"]))
    return tokenizer(kmer_sentence)

# example
tok_func({'seq': 'ATGGAAAGAGGCACCATTCT'})

{'input_ids': [2, 7109, 16136, 19473, 16440, 4308, 4930, 7417, 17366, 8010, 19737, 17495, 8527, 5422, 9387, 3], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

## Looping through datasets, fine-tuning the model for each of them, logging metrics

In [6]:
import pandas as pd
import numpy as np
from random import random, randrange
from transformers import AutoModelForSequenceClassification
from transformers import TrainingArguments, Trainer
from datasets import Dataset, DatasetDict, load_metric

def compute_metrics_binary(eval_preds):
    """Score predictions with the GLUE/MRPC metric (used for two-class datasets).

    `eval_preds` unpacks into (logits, labels). The predicted class is the
    arg-max over the last logits axis. Returns the dict produced by the
    metric's `compute`; the training loop later reads accuracy and F1 from
    the evaluation logs.
    """
    glue_mrpc = load_metric("glue", "mrpc")
    scores, references = eval_preds
    predicted = np.argmax(scores, axis=-1)
    return glue_mrpc.compute(predictions=predicted, references=references)

def compute_metrics_multi(eval_preds):
    """Score predictions with the plain accuracy metric (used when there are not exactly two classes).

    `eval_preds` unpacks into (logits, labels). The predicted class is the
    arg-max over the last logits axis. Returns the dict produced by the
    metric's `compute`.
    """
    accuracy = load_metric("accuracy")
    scores, references = eval_preds
    predicted = np.argmax(scores, axis=-1)
    return accuracy.compute(predictions=predicted, references=references)

# One (dataset, accuracy, f1, train_runtime) tuple is appended per fine-tuning run.
outputs = []

# Fail fast: the CSV is rewritten after every run, so a missing output folder
# (e.g. the drive was not mounted) would otherwise only surface after the first
# complete training run. Remote URLs are left for pandas to handle.
output_dir = Path(OUTPUT_PATH).parent
if '://' not in str(OUTPUT_PATH) and not output_dir.is_dir():
    raise FileNotFoundError(
        f"Output directory '{output_dir}' does not exist; "
        "mount the drive or create the folder before training."
    )

# With a falsy thinning factor, or a factor of exactly 1, every file is kept
# and no random numbers are drawn.
keep_all = not DATASET_THINING or DATASET_THINING == 1

for dataset_name, dataset_version in tqdm(DATASETS):

    # Class names are the entries of the train folder; sorting fixes the
    # class-name -> integer-label mapping.
    labels = sorted([x.stem for x in (benchmark_root / dataset_name / 'train').iterdir()])

    # key -> (split, integer label, sequence text). The key includes the split so
    # that a train file and a test file sharing label and file stem cannot
    # overwrite each other.
    tmp_dict = {}

    for split in ['train', 'test']:
        for nlabel, label in enumerate(labels):
            for f in (benchmark_root / dataset_name / split / label).glob('*.txt'):
                # Thinning is applied to the train and the test split alike;
                # dropped files are never read from disk.
                if keep_all or random() < DATASET_THINING:
                    tmp_dict[f"{split} {label} {f.stem}"] = (split, nlabel, f.read_text())

    # One row per sequence file; building the frame row-wise avoids creating a
    # frame with one column per file and transposing it.
    df = pd.DataFrame.from_dict(tmp_dict, orient='index', columns=["dset", "cat", "seq"])

    # The string index is carried over as the '__index_level_0__' column, which
    # is removed together with the raw sequence once the rows are tokenized.
    ds = Dataset.from_pandas(df)

    tok_ds = ds.map(tok_func, batched=False, remove_columns=['__index_level_0__', 'seq'])
    tok_ds = tok_ds.rename_columns({'cat':'labels'})

    # Split the rows back into train/test by the 'dset' marker, then drop the marker.
    dds = DatasetDict({
        'train': tok_ds.filter(lambda x: x["dset"] == "train").remove_columns('dset'),
        'test':  tok_ds.filter(lambda x: x["dset"] == "test").remove_columns('dset')
    })

    compute_metrics = compute_metrics_binary if len(labels) == 2 else compute_metrics_multi

    for _ in range(RUNS):

        # Every run starts again from the pretrained checkpoint.
        model_cls = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=len(labels))

        # Each run draws its own seed in [1, 10000]; evaluation happens once per epoch.
        # NOTE(review): save_steps=100000 presumably keeps intermediate checkpoints
        # from being written — confirm.
        args = TrainingArguments('outputs', learning_rate=LEARNING_RATE, warmup_ratio=0.1, lr_scheduler_type='cosine', fp16=True,
            evaluation_strategy="epoch", per_device_train_batch_size=BATCH_SIZE, per_device_eval_batch_size=BATCH_SIZE*2, gradient_accumulation_steps=ACCUMULATION,
            num_train_epochs=EPOCHS, weight_decay=0.01, save_steps=100000, seed=randrange(1,10001), report_to='none')

        trainer = Trainer(model_cls, args, train_dataset=dds['train'], eval_dataset=dds['test'],
                          tokenizer=tokenizer, compute_metrics=compute_metrics)
        trainer.train()

        # The reported accuracy/F1 are the best values over all per-epoch evaluations.
        # NOTE(review): these evaluations run on the test split, so picking the best
        # epoch this way is an optimistic estimate — confirm this is intended.
        history = trainer.state.log_history
        max_accuracy = max(x['eval_accuracy'] for x in history if 'eval_accuracy' in x)
        # F1 is only logged by the binary metric; other datasets record NaN.
        max_f1 = max(x['eval_f1'] for x in history if 'eval_f1' in x) if len(labels) == 2 else np.nan
        train_runtime = max(x['train_runtime'] for x in history if 'train_runtime' in x)

        outputs.append((dataset_name, max_accuracy, max_f1, train_runtime))
        # Rewrite the CSV after every run so the results collected so far are on disk.
        outputs_df = pd.DataFrame(outputs, columns = ['dataset', 'accuracy', 'f1', 'train_runtime'])
        outputs_df.to_csv(OUTPUT_PATH, index=False)




  0%|          | 0/4 [00:00<?, ?it/s]



  0%|          | 0/36131 [00:00<?, ?ex/s]

  0%|          | 0/37 [00:00<?, ?ba/s]

  0%|          | 0/37 [00:00<?, ?ba/s]

Downloading:   0%|          | 0.00/705 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/226M [00:00<?, ?B/s]

Some weights of the model checkpoint at Vlasta/DNADebertaK7 were not used when initializing DebertaForSequenceClassification: ['cls.predictions.decoder.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing DebertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DebertaForSequenceClassification were not initialized from the model checkpoint at Vlasta/DNADeb

Epoch,Training Loss,Validation Loss,Accuracy,F1
0,No log,0.329131,0.865619,0.869659
1,0.425900,0.296778,0.883662,0.891817
2,0.265000,0.28572,0.890414,0.898524
3,0.203100,0.294687,0.88975,0.89843


***** Running Evaluation *****
  Num examples = 9034
  Batch size = 16


Downloading builder script:   0%|          | 0.00/1.84k [00:00<?, ?B/s]

***** Running Evaluation *****
  Num examples = 9034
  Batch size = 16
***** Running Evaluation *****
  Num examples = 9034
  Batch size = 16
***** Running Evaluation *****
  Num examples = 9034
  Batch size = 16


Training completed. Do not forget to share your model on huggingface.co/models =)


loading configuration file https://huggingface.co/Vlasta/DNADebertaK7/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/3fa25099222ee8392148633f9da2ccd667d7c3946c0cb204e54570049d533aca.c59d09939f4655d9540dd690f39534d728c6afa978a255af7840e17b5db6c906
Model config DebertaConfig {
  "_name_or_path": "Vlasta/DNADebertaK7",
  "architectures": [
    "DebertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-07,
  "max_position_embeddings": 512,
  "max_relative_positions": -1,
  "model_type": "deberta",
  "num_at

Epoch,Training Loss,Validation Loss,Accuracy,F1
0,No log,0.331567,0.859309,0.859294
1,0.427500,0.310238,0.866726,0.880627
2,0.265300,0.278419,0.892849,0.900432
3,0.205900,0.289554,0.890857,0.899593


***** Running Evaluation *****
  Num examples = 9034
  Batch size = 16
***** Running Evaluation *****
  Num examples = 9034
  Batch size = 16
***** Running Evaluation *****
  Num examples = 9034
  Batch size = 16
***** Running Evaluation *****
  Num examples = 9034
  Batch size = 16


Training completed. Do not forget to share your model on huggingface.co/models =)


loading configuration file https://huggingface.co/Vlasta/DNADebertaK7/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/3fa25099222ee8392148633f9da2ccd667d7c3946c0cb204e54570049d533aca.c59d09939f4655d9540dd690f39534d728c6afa978a255af7840e17b5db6c906
Model config DebertaConfig {
  "_name_or_path": "Vlasta/DNADebertaK7",
  "architectures": [
    "DebertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-07,
  "max_position_embeddings": 51

Epoch,Training Loss,Validation Loss,Accuracy,F1
0,No log,0.340409,0.856321,0.865659
1,0.425100,0.310296,0.875138,0.885899
2,0.267200,0.313076,0.88012,0.891862
3,0.201100,0.303066,0.886872,0.896872


***** Running Evaluation *****
  Num examples = 9034
  Batch size = 16
***** Running Evaluation *****
  Num examples = 9034
  Batch size = 16
***** Running Evaluation *****
  Num examples = 9034
  Batch size = 16
***** Running Evaluation *****
  Num examples = 9034
  Batch size = 16


Training completed. Do not forget to share your model on huggingface.co/models =)


loading configuration file https://huggingface.co/Vlasta/DNADebertaK7/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/3fa25099222ee8392148633f9da2ccd667d7c3946c0cb204e54570049d533aca.c59d09939f4655d9540dd690f39534d728c6afa978a255af7840e17b5db6c906
Model config DebertaConfig {
  "_name_or_path": "Vlasta/DNADebertaK7",
  "architectures": [
    "DebertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-07,
  "max_position_embeddings": 51

Epoch,Training Loss,Validation Loss,Accuracy,F1
0,No log,0.342195,0.860638,0.860899
1,0.430900,0.290337,0.884769,0.892934
2,0.265800,0.309542,0.87857,0.891008
3,0.205200,0.309974,0.880894,0.892615


***** Running Evaluation *****
  Num examples = 9034
  Batch size = 16
***** Running Evaluation *****
  Num examples = 9034
  Batch size = 16
***** Running Evaluation *****
  Num examples = 9034
  Batch size = 16
***** Running Evaluation *****
  Num examples = 9034
  Batch size = 16


Training completed. Do not forget to share your model on huggingface.co/models =)


loading configuration file https://huggingface.co/Vlasta/DNADebertaK7/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/3fa25099222ee8392148633f9da2ccd667d7c3946c0cb204e54570049d533aca.c59d09939f4655d9540dd690f39534d728c6afa978a255af7840e17b5db6c906
Model config DebertaConfig {
  "_name_or_path": "Vlasta/DNADebertaK7",
  "architectures": [
    "DebertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-07,
  "max_position_embeddings": 51

Epoch,Training Loss,Validation Loss,Accuracy,F1
0,No log,0.34749,0.854771,0.865132
1,0.427300,0.278308,0.892849,0.896293
2,0.260100,0.292787,0.885544,0.895913
3,0.192700,0.293506,0.888311,0.898235


***** Running Evaluation *****
  Num examples = 9034
  Batch size = 16
***** Running Evaluation *****
  Num examples = 9034
  Batch size = 16
***** Running Evaluation *****
  Num examples = 9034
  Batch size = 16
***** Running Evaluation *****
  Num examples = 9034
  Batch size = 16


Training completed. Do not forget to share your model on huggingface.co/models =)




  0%|          | 0/27791 [00:00<?, ?ex/s]

  0%|          | 0/28 [00:00<?, ?ba/s]

  0%|          | 0/28 [00:00<?, ?ba/s]

loading configuration file https://huggingface.co/Vlasta/DNADebertaK7/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/3fa25099222ee8392148633f9da2ccd667d7c3946c0cb204e54570049d533aca.c59d09939f4655d9540dd690f39534d728c6afa978a255af7840e17b5db6c906
Model config DebertaConfig {
  "_name_or_path": "Vlasta/DNADebertaK7",
  "architectures": [
    "DebertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-07,
  "max_position_embeddings": 512,
  "max_relative_positions": -1,
  "model_type": "deberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 6,
  "pad_token_id": 0,
  "pooler_dropout": 0,
  "pooler_hidden_act": "gelu",
  "pooler_hidden_size": 768,
  "pos_att_type": null,
  "position_biased_input": true,
  "relative_attention": false,
  "torch_dtype": "float32",
  "transformers_version": "4.20.1

Epoch,Training Loss,Validation Loss,Accuracy,F1
0,No log,0.531459,0.728699,0.72866
1,0.558700,0.526175,0.732585,0.728759
2,0.558700,0.532901,0.73129,0.751696
3,0.481100,0.544331,0.72582,0.749111


***** Running Evaluation *****
  Num examples = 6948
  Batch size = 16
***** Running Evaluation *****
  Num examples = 6948
  Batch size = 16
***** Running Evaluation *****
  Num examples = 6948
  Batch size = 16
***** Running Evaluation *****
  Num examples = 6948
  Batch size = 16


Training completed. Do not forget to share your model on huggingface.co/models =)


loading configuration file https://huggingface.co/Vlasta/DNADebertaK7/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/3fa25099222ee8392148633f9da2ccd667d7c3946c0cb204e54570049d533aca.c59d09939f4655d9540dd690f39534d728c6afa978a255af7840e17b5db6c906
Model config DebertaConfig {
  "_name_or_path": "Vlasta/DNADebertaK7",
  "architectures": [
    "DebertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-07,
  "max_position_embeddings": 51

Epoch,Training Loss,Validation Loss,Accuracy,F1
0,No log,0.532914,0.729275,0.742364
1,0.560200,0.525976,0.736615,0.738721
2,0.560200,0.537423,0.729562,0.740147
3,0.482100,0.546365,0.728267,0.744934


***** Running Evaluation *****
  Num examples = 6948
  Batch size = 16
***** Running Evaluation *****
  Num examples = 6948
  Batch size = 16
***** Running Evaluation *****
  Num examples = 6948
  Batch size = 16
***** Running Evaluation *****
  Num examples = 6948
  Batch size = 16


Training completed. Do not forget to share your model on huggingface.co/models =)


loading configuration file https://huggingface.co/Vlasta/DNADebertaK7/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/3fa25099222ee8392148633f9da2ccd667d7c3946c0cb204e54570049d533aca.c59d09939f4655d9540dd690f39534d728c6afa978a255af7840e17b5db6c906
Model config DebertaConfig {
  "_name_or_path": "Vlasta/DNADebertaK7",
  "architectures": [
    "DebertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-07,
  "max_position_embeddings": 51

Epoch,Training Loss,Validation Loss,Accuracy,F1
0,No log,0.53061,0.730426,0.737638
1,0.559100,0.524221,0.73388,0.7464
2,0.559100,0.540457,0.728555,0.750859
3,0.482700,0.54524,0.728555,0.747794


***** Running Evaluation *****
  Num examples = 6948
  Batch size = 16
***** Running Evaluation *****
  Num examples = 6948
  Batch size = 16
***** Running Evaluation *****
  Num examples = 6948
  Batch size = 16
***** Running Evaluation *****
  Num examples = 6948
  Batch size = 16


Training completed. Do not forget to share your model on huggingface.co/models =)


loading configuration file https://huggingface.co/Vlasta/DNADebertaK7/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/3fa25099222ee8392148633f9da2ccd667d7c3946c0cb204e54570049d533aca.c59d09939f4655d9540dd690f39534d728c6afa978a255af7840e17b5db6c906
Model config DebertaConfig {
  "_name_or_path": "Vlasta/DNADebertaK7",
  "architectures": [
    "DebertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-07,
  "max_position_embeddings": 51

Epoch,Training Loss,Validation Loss,Accuracy,F1
0,No log,0.541771,0.72323,0.696064
1,0.559000,0.534661,0.731146,0.756136
2,0.559000,0.540226,0.729275,0.753957
3,0.485400,0.542066,0.731577,0.751234


***** Running Evaluation *****
  Num examples = 6948
  Batch size = 16
***** Running Evaluation *****
  Num examples = 6948
  Batch size = 16
***** Running Evaluation *****
  Num examples = 6948
  Batch size = 16
***** Running Evaluation *****
  Num examples = 6948
  Batch size = 16


Training completed. Do not forget to share your model on huggingface.co/models =)


loading configuration file https://huggingface.co/Vlasta/DNADebertaK7/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/3fa25099222ee8392148633f9da2ccd667d7c3946c0cb204e54570049d533aca.c59d09939f4655d9540dd690f39534d728c6afa978a255af7840e17b5db6c906
Model config DebertaConfig {
  "_name_or_path": "Vlasta/DNADebertaK7",
  "architectures": [
    "DebertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-07,
  "max_position_embeddings": 51

Epoch,Training Loss,Validation Loss,Accuracy,F1
0,No log,0.541374,0.729131,0.757036
1,0.559200,0.524653,0.736039,0.746685
2,0.559200,0.540201,0.732729,0.757223
3,0.484300,0.541029,0.732009,0.751269


***** Running Evaluation *****
  Num examples = 6948
  Batch size = 16
***** Running Evaluation *****
  Num examples = 6948
  Batch size = 16
***** Running Evaluation *****
  Num examples = 6948
  Batch size = 16
***** Running Evaluation *****
  Num examples = 6948
  Batch size = 16


Training completed. Do not forget to share your model on huggingface.co/models =)




  0%|          | 0/100000 [00:00<?, ?ex/s]

  0%|          | 0/100 [00:00<?, ?ba/s]

  0%|          | 0/100 [00:00<?, ?ba/s]

loading configuration file https://huggingface.co/Vlasta/DNADebertaK7/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/3fa25099222ee8392148633f9da2ccd667d7c3946c0cb204e54570049d533aca.c59d09939f4655d9540dd690f39534d728c6afa978a255af7840e17b5db6c906
Model config DebertaConfig {
  "_name_or_path": "Vlasta/DNADebertaK7",
  "architectures": [
    "DebertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-07,
  "max_position_embeddings": 512,
  "max_relative_positions": -1,
  "model_type": "deberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 6,
  "pad_token_id": 0,
  "pooler_dropout": 0,
  "pooler_hidden_act": "gelu",
  "pooler_hidden_size": 768,
  "pos_att_type": null,
  "position_biased_input": true,
  "relative_attention": false,
  "torch_dtype": "float32",
  "transformers_version": "4.20.1

Epoch,Training Loss,Validation Loss,Accuracy,F1
0,0.1322,0.108084,0.9594,0.958862
1,0.0874,0.09432,0.96652,0.966156
2,0.0609,0.097906,0.96716,0.966792
3,0.0463,0.114489,0.96512,0.964544


***** Running Evaluation *****
  Num examples = 25000
  Batch size = 16
***** Running Evaluation *****
  Num examples = 25000
  Batch size = 16
***** Running Evaluation *****
  Num examples = 25000
  Batch size = 16
***** Running Evaluation *****
  Num examples = 25000
  Batch size = 16


Training completed. Do not forget to share your model on huggingface.co/models =)


loading configuration file https://huggingface.co/Vlasta/DNADebertaK7/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/3fa25099222ee8392148633f9da2ccd667d7c3946c0cb204e54570049d533aca.c59d09939f4655d9540dd690f39534d728c6afa978a255af7840e17b5db6c906
Model config DebertaConfig {
  "_name_or_path": "Vlasta/DNADebertaK7",
  "architectures": [
    "DebertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-07,
  "max_position_embeddings"

Epoch,Training Loss,Validation Loss,Accuracy,F1
0,0.1327,0.123778,0.95648,0.955201
1,0.0847,0.093983,0.96512,0.964587
2,0.0596,0.116889,0.96168,0.960734
3,0.0466,0.119919,0.9642,0.963519


***** Running Evaluation *****
  Num examples = 25000
  Batch size = 16
***** Running Evaluation *****
  Num examples = 25000
  Batch size = 16
***** Running Evaluation *****
  Num examples = 25000
  Batch size = 16
***** Running Evaluation *****
  Num examples = 25000
  Batch size = 16


Training completed. Do not forget to share your model on huggingface.co/models =)


loading configuration file https://huggingface.co/Vlasta/DNADebertaK7/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/3fa25099222ee8392148633f9da2ccd667d7c3946c0cb204e54570049d533aca.c59d09939f4655d9540dd690f39534d728c6afa978a255af7840e17b5db6c906
Model config DebertaConfig {
  "_name_or_path": "Vlasta/DNADebertaK7",
  "architectures": [
    "DebertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-07,
  "max_position_embeddings"

Epoch,Training Loss,Validation Loss,Accuracy,F1
0,0.1345,0.099301,0.9638,0.963711
1,0.0836,0.095749,0.96592,0.965459
2,0.0587,0.117947,0.96208,0.96116
3,0.0459,0.114382,0.96684,0.966332


***** Running Evaluation *****
  Num examples = 25000
  Batch size = 16
***** Running Evaluation *****
  Num examples = 25000
  Batch size = 16
***** Running Evaluation *****
  Num examples = 25000
  Batch size = 16
***** Running Evaluation *****
  Num examples = 25000
  Batch size = 16


Training completed. Do not forget to share your model on huggingface.co/models =)


loading configuration file https://huggingface.co/Vlasta/DNADebertaK7/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/3fa25099222ee8392148633f9da2ccd667d7c3946c0cb204e54570049d533aca.c59d09939f4655d9540dd690f39534d728c6afa978a255af7840e17b5db6c906
Model config DebertaConfig {
  "_name_or_path": "Vlasta/DNADebertaK7",
  "architectures": [
    "DebertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-07,
  "max_position_embeddings"

Epoch,Training Loss,Validation Loss,Accuracy,F1
0,0.1302,0.131037,0.95168,0.949871
1,0.0851,0.095956,0.9656,0.965174
2,0.06,0.105451,0.96568,0.965122
3,0.046,0.112368,0.96632,0.965856


***** Running Evaluation *****
  Num examples = 25000
  Batch size = 16
***** Running Evaluation *****
  Num examples = 25000
  Batch size = 16
***** Running Evaluation *****
  Num examples = 25000
  Batch size = 16
***** Running Evaluation *****
  Num examples = 25000
  Batch size = 16


Training completed. Do not forget to share your model on huggingface.co/models =)


loading configuration file https://huggingface.co/Vlasta/DNADebertaK7/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/3fa25099222ee8392148633f9da2ccd667d7c3946c0cb204e54570049d533aca.c59d09939f4655d9540dd690f39534d728c6afa978a255af7840e17b5db6c906
Model config DebertaConfig {
  "_name_or_path": "Vlasta/DNADebertaK7",
  "architectures": [
    "DebertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-07,
  "max_position_embeddings"

Epoch,Training Loss,Validation Loss,Accuracy,F1
0,0.1378,0.105887,0.9606,0.960357
1,0.0872,0.091236,0.96664,0.966597
2,0.0593,0.102782,0.96608,0.965587
3,0.0463,0.118086,0.964,0.963325


***** Running Evaluation *****
  Num examples = 25000
  Batch size = 16
***** Running Evaluation *****
  Num examples = 25000
  Batch size = 16
***** Running Evaluation *****
  Num examples = 25000
  Batch size = 16
***** Running Evaluation *****
  Num examples = 25000
  Batch size = 16


Training completed. Do not forget to share your model on huggingface.co/models =)




  0%|          | 0/100000 [00:00<?, ?ex/s]

  0%|          | 0/100 [00:00<?, ?ba/s]

  0%|          | 0/100 [00:00<?, ?ba/s]

loading configuration file https://huggingface.co/Vlasta/DNADebertaK7/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/3fa25099222ee8392148633f9da2ccd667d7c3946c0cb204e54570049d533aca.c59d09939f4655d9540dd690f39534d728c6afa978a255af7840e17b5db6c906
Model config DebertaConfig {
  "_name_or_path": "Vlasta/DNADebertaK7",
  "architectures": [
    "DebertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-07,
  "max_position_embeddings": 512,
  "max_relative_positions": -1,
  "model_type": "deberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 6,
  "pad_token_id": 0,
  "pooler_dropout": 0,
  "pooler_hidden_act": "gelu",
  "pooler_hidden_size": 768,
  "pos_att_type": null,
  "position_biased_input": true,
  "relative_attention": false,
  "torch_dtype": "float32",
  "transformers_version": "4.20.1

Epoch,Training Loss,Validation Loss,Accuracy,F1
0,0.2442,0.218233,0.91264,0.913737
1,0.1965,0.217053,0.91276,0.910177
2,0.1639,0.223918,0.91664,0.915133
3,0.1533,0.221821,0.91824,0.917746


***** Running Evaluation *****
  Num examples = 25000
  Batch size = 16
***** Running Evaluation *****
  Num examples = 25000
  Batch size = 16
***** Running Evaluation *****
  Num examples = 25000
  Batch size = 16
***** Running Evaluation *****
  Num examples = 25000
  Batch size = 16


Training completed. Do not forget to share your model on huggingface.co/models =)


loading configuration file https://huggingface.co/Vlasta/DNADebertaK7/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/3fa25099222ee8392148633f9da2ccd667d7c3946c0cb204e54570049d533aca.c59d09939f4655d9540dd690f39534d728c6afa978a255af7840e17b5db6c906
Model config DebertaConfig {
  "_name_or_path": "Vlasta/DNADebertaK7",
  "architectures": [
    "DebertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-07,
  "max_position_embeddings"

Epoch,Training Loss,Validation Loss,Accuracy,F1
0,0.2461,0.220175,0.91144,0.909158
1,0.201,0.213151,0.91816,0.918447
2,0.1655,0.216977,0.9184,0.918223
3,0.1461,0.221541,0.91808,0.918413


***** Running Evaluation *****
  Num examples = 25000
  Batch size = 16
***** Running Evaluation *****
  Num examples = 25000
  Batch size = 16
***** Running Evaluation *****
  Num examples = 25000
  Batch size = 16
***** Running Evaluation *****
  Num examples = 25000
  Batch size = 16


Training completed. Do not forget to share your model on huggingface.co/models =)


loading configuration file https://huggingface.co/Vlasta/DNADebertaK7/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/3fa25099222ee8392148633f9da2ccd667d7c3946c0cb204e54570049d533aca.c59d09939f4655d9540dd690f39534d728c6afa978a255af7840e17b5db6c906
Model config DebertaConfig {
  "_name_or_path": "Vlasta/DNADebertaK7",
  "architectures": [
    "DebertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-07,
  "max_position_embeddings"

Epoch,Training Loss,Validation Loss,Accuracy,F1
0,0.2465,0.226502,0.91052,0.909525
1,0.1972,0.214769,0.9174,0.916441
2,0.162,0.224058,0.91808,0.916944
3,0.1451,0.229637,0.9176,0.916613


***** Running Evaluation *****
  Num examples = 25000
  Batch size = 16
***** Running Evaluation *****
  Num examples = 25000
  Batch size = 16
***** Running Evaluation *****
  Num examples = 25000
  Batch size = 16
***** Running Evaluation *****
  Num examples = 25000
  Batch size = 16


Training completed. Do not forget to share your model on huggingface.co/models =)


loading configuration file https://huggingface.co/Vlasta/DNADebertaK7/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/3fa25099222ee8392148633f9da2ccd667d7c3946c0cb204e54570049d533aca.c59d09939f4655d9540dd690f39534d728c6afa978a255af7840e17b5db6c906
Model config DebertaConfig {
  "_name_or_path": "Vlasta/DNADebertaK7",
  "architectures": [
    "DebertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-07,
  "max_position_embeddings"

Epoch,Training Loss,Validation Loss,Accuracy,F1
0,0.2485,0.21818,0.91096,0.910495
1,0.1972,0.208776,0.91656,0.915155
2,0.1622,0.216425,0.91812,0.91841
3,0.1481,0.221835,0.91828,0.91812


***** Running Evaluation *****
  Num examples = 25000
  Batch size = 16
***** Running Evaluation *****
  Num examples = 25000
  Batch size = 16
***** Running Evaluation *****
  Num examples = 25000
  Batch size = 16
***** Running Evaluation *****
  Num examples = 25000
  Batch size = 16


Training completed. Do not forget to share your model on huggingface.co/models =)


loading configuration file https://huggingface.co/Vlasta/DNADebertaK7/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/3fa25099222ee8392148633f9da2ccd667d7c3946c0cb204e54570049d533aca.c59d09939f4655d9540dd690f39534d728c6afa978a255af7840e17b5db6c906
Model config DebertaConfig {
  "_name_or_path": "Vlasta/DNADebertaK7",
  "architectures": [
    "DebertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-07,
  "max_position_embeddings"

Epoch,Training Loss,Validation Loss,Accuracy,F1
0,0.2426,0.218062,0.91252,0.912123
1,0.1955,0.219233,0.91172,0.908404
2,0.1651,0.217906,0.91792,0.916876
3,0.1506,0.223474,0.91948,0.91938


***** Running Evaluation *****
  Num examples = 25000
  Batch size = 16
***** Running Evaluation *****
  Num examples = 25000
  Batch size = 16
***** Running Evaluation *****
  Num examples = 25000
  Batch size = 16
***** Running Evaluation *****
  Num examples = 25000
  Batch size = 16


Training completed. Do not forget to share your model on huggingface.co/models =)




## Outputs

In [7]:
# Rebuild the per-run results table from the collected tuples and display it.
result_columns = ['dataset', 'accuracy', 'f1', 'train_runtime']
outputs_df = pd.DataFrame(outputs, columns=result_columns)
outputs_df

Unnamed: 0,dataset,accuracy,f1,train_runtime
0,human_nontata_promoters,0.890414,0.898524,642.9388
1,human_nontata_promoters,0.892849,0.900432,638.0277
2,human_nontata_promoters,0.886872,0.896872,635.4103
3,human_nontata_promoters,0.884769,0.892934,636.288
4,human_nontata_promoters,0.892849,0.898235,638.6828
5,human_enhancers_cohn,0.732585,0.751696,906.7777
6,human_enhancers_cohn,0.736615,0.744934,908.8487
7,human_enhancers_cohn,0.73388,0.750859,906.347
8,human_enhancers_cohn,0.731577,0.756136,907.6432
9,human_enhancers_cohn,0.736039,0.757223,909.9677


In [8]:
# Per-dataset mean and standard error of the mean over the runs, for each reported quantity.
summary_spec = {column: ['mean', 'sem'] for column in ('accuracy', 'f1', 'train_runtime')}
outputs_df.groupby('dataset').agg(summary_spec)

Unnamed: 0_level_0,accuracy,accuracy,f1,f1,train_runtime,train_runtime
Unnamed: 0_level_1,mean,sem,mean,sem,mean,sem
dataset,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
demo_coding_vs_intergenomic_seqs,0.918496,0.000251,0.918186,0.000405,1664.96498,5.181971
demo_human_or_worm,0.966416,0.000352,0.966033,0.000394,1656.57656,4.475498
human_enhancers_cohn,0.734139,0.000969,0.752169,0.002186,907.91686,0.667507
human_nontata_promoters,0.889551,0.001621,0.897399,0.001253,638.26952,1.306388


In [9]:
# Write the per-run results table to OUTPUT_PATH as CSV, without the index column.
# The training loop already writes this file after every run; this repeats the
# write for the outputs_df rebuilt in the "Outputs" section.
outputs_df.to_csv(OUTPUT_PATH, index=False)