In [None]:
# Install the third-party packages imported by the cells below.
# NOTE(review): package versions are not pinned. Later cells use `datasets.load_metric`
# and the `evaluation_strategy` training argument; confirm that the installed releases
# still provide them, or pin the versions this notebook was originally run with.
!pip install -qq transformers genomic-benchmarks datasets

[K     |████████████████████████████████| 4.4 MB 5.1 MB/s 
[K     |████████████████████████████████| 362 kB 78.3 MB/s 
[K     |████████████████████████████████| 101 kB 14.1 MB/s 
[K     |████████████████████████████████| 6.6 MB 75.0 MB/s 
[K     |████████████████████████████████| 596 kB 69.3 MB/s 
[K     |████████████████████████████████| 2.3 MB 64.0 MB/s 
[K     |████████████████████████████████| 271 kB 72.0 MB/s 
[K     |████████████████████████████████| 1.1 MB 53.2 MB/s 
[K     |████████████████████████████████| 140 kB 68.0 MB/s 
[K     |████████████████████████████████| 212 kB 82.0 MB/s 
[K     |████████████████████████████████| 127 kB 71.6 MB/s 
[K     |████████████████████████████████| 94 kB 4.8 MB/s 
[K     |████████████████████████████████| 144 kB 68.4 MB/s 
[?25h  Building wheel for genomic-benchmarks (setup.py) ... [?25l[?25hdone
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is th

In [None]:
from google.colab import drive
# Mount Google Drive at /content/drive; OUTPUT_PATH (see the parameters cell) is
# expected to point into the mounted drive, so this must run before training.
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
### Parameters
# Hugging Face Hub checkpoint that is fine-tuned on every benchmark. The training
# logs report it as a DeBERTa model (not the original DNABERT weights).
MODEL_NAME = "simecek/DNADebertaK6"
# The tokenizer is loaded from a separate Hub repository.
TOKENIZER_NAME = "armheb/DNA_bert_6"
# k-mer length used when sequences are split into tokens
K = 6
# 1 -> overlapping k-mers (step 1); any other value -> non-overlapping k-mers (step K)
STRIDE = 1

# Fraction of files kept from each dataset: 1 (or a falsy value) keeps everything,
# a value below 1 keeps each file with that probability.
DATASET_THINING = 1

# Folder that is read as <dataset>/<split>/<label>/*.txt by the training loop
BENCHMARKS_FOLDER = '/root/.genomic_benchmarks'
# BENCHMARKS_FOLDER = '/home/jovyan/.genomic_benchmarks/' (for INFRA HUB)

# (dataset name, dataset version) pairs to download and evaluate
DATASETS = [('demo_coding_vs_intergenomic_seqs', 0),
 ('demo_human_or_worm', 0), ('human_enhancers_cohn', 0), ('human_nontata_promoters', 0)]

# Per-device training batch size (evaluation uses twice this value)
BATCH_SIZE = 16
# Number of gradient accumulation steps
ACCUMULATION = 4
LEARNING_RATE = 1e-5
# Training epochs per fine-tuning run
EPOCHS = 4
# Number of independent fine-tuning runs per dataset
RUNS = 5

# CSV file that receives one row per run. The path is relative and points into the
# mounted Google Drive, so do not forget to attach the drive first.
OUTPUT_PATH = 'drive/MyDrive/genomic_benchmarks/RandomizedDNADebertaK6_2.csv'

## Download benchmark datasets and tokenizer

In [None]:
from genomic_benchmarks.loc2seq import download_dataset
from genomic_benchmarks.data_check.info import is_downloaded
from pathlib import Path
from tqdm.autonotebook import tqdm

# Fetch every benchmark listed in DATASETS that `is_downloaded` reports as missing.
for dataset_name, dataset_version in tqdm(DATASETS):
    if is_downloaded(dataset_name):
        continue
    download_dataset(dataset_name, version=dataset_version, use_cloud_cache=True)

# Root directory under which the benchmark folders are read by the training loop.
benchmark_root = Path(BENCHMARKS_FOLDER)

  from tqdm.autonotebook import tqdm


  0%|          | 0/4 [00:00<?, ?it/s]

In [None]:
from transformers import AutoTokenizer

# Load the tokenizer named by TOKENIZER_NAME; it is used by tok_func and passed to the Trainer.
tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_NAME)

In [None]:
def kmers_strideK(s, k=K):
    """Split ``s`` into consecutive, non-overlapping pieces of length ``k``.

    Pieces start at offsets 0, k, 2k, ...; a trailing remainder shorter than
    ``k`` is dropped.

    Args:
        s: sequence to split (sliced with ``s[start:end]``).
        k: piece length and step between pieces; defaults to ``K``.

    Returns:
        List of the full-length pieces, in order.
    """
    total = len(s)
    pieces = []
    for start in range(0, total, k):
        end = start + k
        # Only keep windows that fit entirely inside the sequence.
        if end <= total:
            pieces.append(s[start:end])
    return pieces

def kmers_stride1(s, k=K):
    """Return every overlapping window of length ``k`` in ``s`` (step 1).

    Args:
        s: sequence to split (sliced with ``s[start:start + k]``).
        k: window length; defaults to ``K``.

    Returns:
        List of ``len(s) - k + 1`` windows, or an empty list when ``s`` is
        shorter than ``k``.
    """
    window_count = len(s) - k + 1
    windows = []
    for start in range(window_count):
        windows.append(s[start:start + k])
    return windows

# Choose the k-mer splitter used for tokenization: STRIDE == 1 selects overlapping
# windows, any other value selects non-overlapping pieces (the STRIDE value itself
# is not passed on to the splitter).
kmers = kmers_stride1 if STRIDE == 1 else kmers_strideK

# Tokenization callback applied to each dataset example.
def tok_func(x):
    """Tokenize the ``"seq"`` entry of one example.

    The sequence is split with ``kmers``, the pieces are joined with single
    spaces, and the resulting string is passed to ``tokenizer``.

    Args:
        x: mapping with a ``"seq"`` key holding the raw sequence.

    Returns:
        Whatever ``tokenizer`` returns for the space-separated k-mer string.
    """
    spaced_kmers = " ".join(kmers(x["seq"]))
    return tokenizer(spaced_kmers)

# Example: tokenize a short sequence and display the tokenizer output
# (the cell output shows input_ids, token_type_ids and attention_mask).
tok_func({'seq': 'ATGGAAAGAGGCACCATTCT'})    

{'input_ids': [2, 501, 1989, 3848, 3089, 56, 212, 835, 3325, 999, 3983, 3629, 2214, 650, 2587, 2142, 3], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

## Looping through datasets, fine-tuning the model for each of them, logging metrics

In [None]:
import pandas as pd
import numpy as np
from random import random, randrange
from transformers import AutoModelForSequenceClassification
from transformers import TrainingArguments, Trainer
from datasets import Dataset, DatasetDict, load_metric
from random import random, randrange

def compute_metrics_binary(eval_preds):
    """Compute the "glue"/"mrpc" metric for two-class predictions.

    Args:
        eval_preds: pair ``(logits, labels)``. The predicted class of each
            example is the ``argmax`` of ``logits`` over its last axis.

    Returns:
        The result of ``metric.compute`` for the predictions and labels (the
        training loop reads the accuracy and F1 values from the logged result).
    """
    # Load the metric only once and reuse it afterwards: this function is called at
    # every evaluation, and reloading the metric each time is redundant work.
    metric = getattr(compute_metrics_binary, "_metric", None)
    if metric is None:
        metric = load_metric("glue", "mrpc")
        compute_metrics_binary._metric = metric
    logits, labels = eval_preds
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

def compute_metrics_multi(eval_preds):
    """Compute the "accuracy" metric for multi-class predictions.

    Args:
        eval_preds: pair ``(logits, labels)``. The predicted class of each
            example is the ``argmax`` of ``logits`` over its last axis.

    Returns:
        The result of ``metric.compute`` for the predictions and labels.
    """
    # Load the metric only once and reuse it afterwards: this function is called at
    # every evaluation, and reloading the metric each time is redundant work.
    metric = getattr(compute_metrics_multi, "_metric", None)
    if metric is None:
        metric = load_metric("accuracy")
        compute_metrics_multi._metric = metric
    logits, labels = eval_preds
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

# One (dataset, accuracy, f1, train_runtime) tuple is appended per fine-tuning run.
outputs = []

for dataset_name, dataset_version in tqdm(DATASETS):

    dataset_root = benchmark_root / dataset_name

    # Class names are the sub-directories of the train split. Non-directory entries
    # are skipped, and the full directory name (not its stem) is kept because it is
    # reused below to build the per-class paths.
    labels = sorted(x.name for x in (dataset_root / 'train').iterdir() if x.is_dir())

    tmp_dict = {}

    for split in ['train', 'test']:
        for nlabel, label in enumerate(labels):
            for f in (dataset_root / split / label).glob('*.txt'):
                # Thinning is active only for a truthy DATASET_THINING other than 1;
                # each file is then kept with probability DATASET_THINING.
                if DATASET_THINING and DATASET_THINING != 1 and random() >= DATASET_THINING:
                    continue
                # The split is part of the key so that a train file and a test file
                # with the same label and file stem cannot overwrite each other.
                tmp_dict[f"{split} {label} {f.stem}"] = (split, nlabel, f.read_text())

    # One row per file; columns: split name, class index, sequence text.
    df = pd.DataFrame.from_dict(tmp_dict).T.rename(columns = {0: "dset", 1: "cat", 2: "seq"})

    ds = Dataset.from_pandas(df)

    # Tokenize one example at a time, then drop the raw sequence and the
    # '__index_level_0__' column (presumably created from the DataFrame index).
    tok_ds = ds.map(tok_func, batched=False, remove_columns=['__index_level_0__', 'seq'])
    tok_ds = tok_ds.rename_columns({'cat':'labels'})

    # Split the tokenized examples back into train and test sets.
    dds = DatasetDict({
        'train': tok_ds.filter(lambda x: x["dset"] == "train").remove_columns('dset'),
        'test':  tok_ds.filter(lambda x: x["dset"] == "test").remove_columns('dset')
    })

    # Two classes use the binary metric function, anything else the multi-class one.
    compute_metrics = compute_metrics_binary if len(labels) == 2 else compute_metrics_multi

    for _ in range(RUNS):

        model_cls = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=len(labels))
        # Apply the model's own `_init_weights` to every module of the encoder layers,
        # i.e. the loaded encoder-layer weights are re-initialised before fine-tuning
        # (presumably to obtain a randomly initialised baseline - confirm intent).
        model_cls.deberta.encoder.layer.apply(model_cls.deberta._init_weights)

        # A fresh random seed is drawn for every run (it is not stored in the results).
        # NOTE(review): save_steps is presumably set this high so that no intermediate
        # checkpoint is written during a run - confirm.
        args = TrainingArguments('outputs', learning_rate=LEARNING_RATE, warmup_ratio=0.1, lr_scheduler_type='cosine', fp16=True,
            evaluation_strategy="epoch", per_device_train_batch_size=BATCH_SIZE, per_device_eval_batch_size=BATCH_SIZE*2, gradient_accumulation_steps=ACCUMULATION,
            num_train_epochs=EPOCHS, weight_decay=0.01, save_steps=100000, seed=randrange(1,10001), report_to='none')

        # The test split is used as the evaluation set after every epoch.
        trainer = Trainer(model_cls, args, train_dataset=dds['train'], eval_dataset=dds['test'],
                          tokenizer=tokenizer, compute_metrics=compute_metrics)
        trainer.train()

        # Best value over all logged evaluations; the accuracy and F1 maxima are taken
        # independently and may therefore come from different epochs.
        max_accuracy = max([x['eval_accuracy'] for x in trainer.state.log_history if 'eval_accuracy' in x])
        max_f1 = max([x['eval_f1'] for x in trainer.state.log_history if 'eval_f1' in x]) if len(labels) == 2 else np.nan
        train_runtime = max([x['train_runtime'] for x in trainer.state.log_history if 'train_runtime' in x])

        outputs.append((dataset_name, max_accuracy, max_f1, train_runtime))
        # The CSV is rewritten after every run, so the results collected so far are
        # on disk even if a later run fails.
        outputs_df = pd.DataFrame(outputs, columns = ['dataset', 'accuracy', 'f1', 'train_runtime'])
        outputs_df.to_csv(OUTPUT_PATH, index=False)



  0%|          | 0/4 [00:00<?, ?it/s]



  0%|          | 0/100000 [00:00<?, ?ex/s]

  0%|          | 0/100 [00:00<?, ?ba/s]

  0%|          | 0/100 [00:00<?, ?ba/s]

Some weights of the model checkpoint at simecek/DNADebertaK6 were not used when initializing DebertaForSequenceClassification: ['cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias']
- This IS expected if you are initializing DebertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DebertaForSequenceClassification were not initialized from the model checkpoint at simecek/DNAD

Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.2694,0.261492,0.89728,0.900179
2,0.2402,0.242145,0.9042,0.905175
3,0.2233,0.247445,0.90136,0.904337
4,0.2151,0.241322,0.9048,0.904678


***** Running Evaluation *****
  Num examples = 25000
  Batch size = 32
***** Running Evaluation *****
  Num examples = 25000
  Batch size = 32
***** Running Evaluation *****
  Num examples = 25000
  Batch size = 32
***** Running Evaluation *****
  Num examples = 25000
  Batch size = 32


Training completed. Do not forget to share your model on huggingface.co/models =)


loading configuration file https://huggingface.co/simecek/DNADebertaK6/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/290389faf61feca163161b441f1533d162d3196a609b88b234318767c64960f8.c7f87ccb839059561ef6996c9f8559644ce254107c992b74bd0b46fbe8c80d51
Model config DebertaConfig {
  "_name_or_path": "simecek/DNADebertaK6",
  "architectures": [
    "DebertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-07,
  "max_position_embedding

Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.2763,0.25528,0.89884,0.900711
2,0.241,0.243247,0.90204,0.900248
3,0.2236,0.240696,0.90396,0.902939
4,0.2126,0.241188,0.90548,0.904997


***** Running Evaluation *****
  Num examples = 25000
  Batch size = 32
***** Running Evaluation *****
  Num examples = 25000
  Batch size = 32
***** Running Evaluation *****
  Num examples = 25000
  Batch size = 32
***** Running Evaluation *****
  Num examples = 25000
  Batch size = 32


Training completed. Do not forget to share your model on huggingface.co/models =)


loading configuration file https://huggingface.co/simecek/DNADebertaK6/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/290389faf61feca163161b441f1533d162d3196a609b88b234318767c64960f8.c7f87ccb839059561ef6996c9f8559644ce254107c992b74bd0b46fbe8c80d51
Model config DebertaConfig {
  "_name_or_path": "simecek/DNADebertaK6",
  "architectures": [
    "DebertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-07,
  "max_position_embedding

Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.2762,0.259101,0.89428,0.89069
2,0.2373,0.288331,0.87976,0.869554
3,0.2259,0.24519,0.90224,0.904029
4,0.2192,0.241794,0.9036,0.903291


***** Running Evaluation *****
  Num examples = 25000
  Batch size = 32
***** Running Evaluation *****
  Num examples = 25000
  Batch size = 32
***** Running Evaluation *****
  Num examples = 25000
  Batch size = 32
***** Running Evaluation *****
  Num examples = 25000
  Batch size = 32


Training completed. Do not forget to share your model on huggingface.co/models =)


loading configuration file https://huggingface.co/simecek/DNADebertaK6/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/290389faf61feca163161b441f1533d162d3196a609b88b234318767c64960f8.c7f87ccb839059561ef6996c9f8559644ce254107c992b74bd0b46fbe8c80d51
Model config DebertaConfig {
  "_name_or_path": "simecek/DNADebertaK6",
  "architectures": [
    "DebertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-07,
  "max_position_embedding

Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.2751,0.255502,0.89468,0.892447
2,0.2389,0.242942,0.9016,0.899959
3,0.2254,0.243689,0.90292,0.904234
4,0.2172,0.241947,0.90512,0.904922


***** Running Evaluation *****
  Num examples = 25000
  Batch size = 32
***** Running Evaluation *****
  Num examples = 25000
  Batch size = 32
***** Running Evaluation *****
  Num examples = 25000
  Batch size = 32
***** Running Evaluation *****
  Num examples = 25000
  Batch size = 32


Training completed. Do not forget to share your model on huggingface.co/models =)


loading configuration file https://huggingface.co/simecek/DNADebertaK6/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/290389faf61feca163161b441f1533d162d3196a609b88b234318767c64960f8.c7f87ccb839059561ef6996c9f8559644ce254107c992b74bd0b46fbe8c80d51
Model config DebertaConfig {
  "_name_or_path": "simecek/DNADebertaK6",
  "architectures": [
    "DebertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-07,
  "max_position_embedding

Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.2771,0.254279,0.8982,0.898016
2,0.2424,0.249642,0.89756,0.89402
3,0.2274,0.243423,0.90468,0.905978
4,0.2217,0.241143,0.90444,0.904099


***** Running Evaluation *****
  Num examples = 25000
  Batch size = 32
***** Running Evaluation *****
  Num examples = 25000
  Batch size = 32
***** Running Evaluation *****
  Num examples = 25000
  Batch size = 32
***** Running Evaluation *****
  Num examples = 25000
  Batch size = 32


Training completed. Do not forget to share your model on huggingface.co/models =)




  0%|          | 0/100000 [00:00<?, ?ex/s]

  0%|          | 0/100 [00:00<?, ?ba/s]

  0%|          | 0/100 [00:00<?, ?ba/s]

loading configuration file https://huggingface.co/simecek/DNADebertaK6/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/290389faf61feca163161b441f1533d162d3196a609b88b234318767c64960f8.c7f87ccb839059561ef6996c9f8559644ce254107c992b74bd0b46fbe8c80d51
Model config DebertaConfig {
  "_name_or_path": "simecek/DNADebertaK6",
  "architectures": [
    "DebertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-07,
  "max_position_embeddings": 512,
  "max_relative_positions": -1,
  "model_type": "deberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 6,
  "pad_token_id": 0,
  "pooler_dropout": 0,
  "pooler_hidden_act": "gelu",
  "pooler_hidden_size": 768,
  "pos_att_type": null,
  "position_biased_input": true,
  "relative_attention": false,
  "torch_dtype": "float32",
  "transformers_version": "4.20

Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.1608,0.136629,0.94968,0.948774
2,0.1221,0.117235,0.95712,0.95713
3,0.1067,0.116209,0.9586,0.958475
4,0.1006,0.116913,0.95844,0.958458


***** Running Evaluation *****
  Num examples = 25000
  Batch size = 32
***** Running Evaluation *****
  Num examples = 25000
  Batch size = 32
***** Running Evaluation *****
  Num examples = 25000
  Batch size = 32
***** Running Evaluation *****
  Num examples = 25000
  Batch size = 32


Training completed. Do not forget to share your model on huggingface.co/models =)


loading configuration file https://huggingface.co/simecek/DNADebertaK6/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/290389faf61feca163161b441f1533d162d3196a609b88b234318767c64960f8.c7f87ccb839059561ef6996c9f8559644ce254107c992b74bd0b46fbe8c80d51
Model config DebertaConfig {
  "_name_or_path": "simecek/DNADebertaK6",
  "architectures": [
    "DebertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-07,
  "max_position_embedding

Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.1613,0.163058,0.93912,0.941412
2,0.1254,0.120007,0.95576,0.95617
3,0.1061,0.115626,0.95776,0.957723
4,0.1022,0.116684,0.95812,0.958265


***** Running Evaluation *****
  Num examples = 25000
  Batch size = 32
***** Running Evaluation *****
  Num examples = 25000
  Batch size = 32
***** Running Evaluation *****
  Num examples = 25000
  Batch size = 32
***** Running Evaluation *****
  Num examples = 25000
  Batch size = 32


Training completed. Do not forget to share your model on huggingface.co/models =)


loading configuration file https://huggingface.co/simecek/DNADebertaK6/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/290389faf61feca163161b441f1533d162d3196a609b88b234318767c64960f8.c7f87ccb839059561ef6996c9f8559644ce254107c992b74bd0b46fbe8c80d51
Model config DebertaConfig {
  "_name_or_path": "simecek/DNADebertaK6",
  "architectures": [
    "DebertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-07,
  "max_position_embedding

Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.1616,0.135457,0.95324,0.953268
2,0.1251,0.142871,0.94716,0.948757
3,0.1083,0.122712,0.95632,0.956766
4,0.1005,0.11655,0.95824,0.958273


***** Running Evaluation *****
  Num examples = 25000
  Batch size = 32
***** Running Evaluation *****
  Num examples = 25000
  Batch size = 32
***** Running Evaluation *****
  Num examples = 25000
  Batch size = 32
***** Running Evaluation *****
  Num examples = 25000
  Batch size = 32


Training completed. Do not forget to share your model on huggingface.co/models =)


loading configuration file https://huggingface.co/simecek/DNADebertaK6/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/290389faf61feca163161b441f1533d162d3196a609b88b234318767c64960f8.c7f87ccb839059561ef6996c9f8559644ce254107c992b74bd0b46fbe8c80d51
Model config DebertaConfig {
  "_name_or_path": "simecek/DNADebertaK6",
  "architectures": [
    "DebertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-07,
  "max_position_embedding

Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.1629,0.156349,0.9426,0.940493
2,0.1252,0.118919,0.9568,0.957051
3,0.1103,0.11792,0.95728,0.957626
4,0.1014,0.116936,0.95816,0.958213


***** Running Evaluation *****
  Num examples = 25000
  Batch size = 32
***** Running Evaluation *****
  Num examples = 25000
  Batch size = 32
***** Running Evaluation *****
  Num examples = 25000
  Batch size = 32
***** Running Evaluation *****
  Num examples = 25000
  Batch size = 32


Training completed. Do not forget to share your model on huggingface.co/models =)


loading configuration file https://huggingface.co/simecek/DNADebertaK6/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/290389faf61feca163161b441f1533d162d3196a609b88b234318767c64960f8.c7f87ccb839059561ef6996c9f8559644ce254107c992b74bd0b46fbe8c80d51
Model config DebertaConfig {
  "_name_or_path": "simecek/DNADebertaK6",
  "architectures": [
    "DebertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-07,
  "max_position_embedding

Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.1604,0.139474,0.9496,0.950577
2,0.1263,0.127946,0.95444,0.955089
3,0.1058,0.116367,0.95788,0.957784
4,0.1037,0.116644,0.95844,0.958432


***** Running Evaluation *****
  Num examples = 25000
  Batch size = 32
***** Running Evaluation *****
  Num examples = 25000
  Batch size = 32
***** Running Evaluation *****
  Num examples = 25000
  Batch size = 32
***** Running Evaluation *****
  Num examples = 25000
  Batch size = 32


Training completed. Do not forget to share your model on huggingface.co/models =)




  0%|          | 0/27791 [00:00<?, ?ex/s]

  0%|          | 0/28 [00:00<?, ?ba/s]

  0%|          | 0/28 [00:00<?, ?ba/s]

loading configuration file https://huggingface.co/simecek/DNADebertaK6/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/290389faf61feca163161b441f1533d162d3196a609b88b234318767c64960f8.c7f87ccb839059561ef6996c9f8559644ce254107c992b74bd0b46fbe8c80d51
Model config DebertaConfig {
  "_name_or_path": "simecek/DNADebertaK6",
  "architectures": [
    "DebertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-07,
  "max_position_embeddings": 512,
  "max_relative_positions": -1,
  "model_type": "deberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 6,
  "pad_token_id": 0,
  "pooler_dropout": 0,
  "pooler_hidden_act": "gelu",
  "pooler_hidden_size": 768,
  "pos_att_type": null,
  "position_biased_input": true,
  "relative_attention": false,
  "torch_dtype": "float32",
  "transformers_version": "4.20

Epoch,Training Loss,Validation Loss,Accuracy,F1
0,No log,0.558781,0.708405,0.695063
1,0.592700,0.546538,0.723374,0.744415
2,0.592700,0.53929,0.734312,0.739192
3,0.526700,0.538531,0.734456,0.735635


***** Running Evaluation *****
  Num examples = 6948
  Batch size = 32
***** Running Evaluation *****
  Num examples = 6948
  Batch size = 32
***** Running Evaluation *****
  Num examples = 6948
  Batch size = 32
***** Running Evaluation *****
  Num examples = 6948
  Batch size = 32


Training completed. Do not forget to share your model on huggingface.co/models =)


loading configuration file https://huggingface.co/simecek/DNADebertaK6/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/290389faf61feca163161b441f1533d162d3196a609b88b234318767c64960f8.c7f87ccb839059561ef6996c9f8559644ce254107c992b74bd0b46fbe8c80d51
Model config DebertaConfig {
  "_name_or_path": "simecek/DNADebertaK6",
  "architectures": [
    "DebertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-07,
  "max_position_embeddings": 

Epoch,Training Loss,Validation Loss,Accuracy,F1
0,No log,0.562501,0.710996,0.687325
1,0.593500,0.555517,0.720207,0.685029
2,0.593500,0.537435,0.731865,0.730274
3,0.525800,0.539868,0.733017,0.730886


***** Running Evaluation *****
  Num examples = 6948
  Batch size = 32
***** Running Evaluation *****
  Num examples = 6948
  Batch size = 32
***** Running Evaluation *****
  Num examples = 6948
  Batch size = 32
***** Running Evaluation *****
  Num examples = 6948
  Batch size = 32


Training completed. Do not forget to share your model on huggingface.co/models =)


loading configuration file https://huggingface.co/simecek/DNADebertaK6/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/290389faf61feca163161b441f1533d162d3196a609b88b234318767c64960f8.c7f87ccb839059561ef6996c9f8559644ce254107c992b74bd0b46fbe8c80d51
Model config DebertaConfig {
  "_name_or_path": "simecek/DNADebertaK6",
  "architectures": [
    "DebertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-07,
  "max_position_embeddings": 

Epoch,Training Loss,Validation Loss,Accuracy,F1
0,No log,0.564551,0.707254,0.717186
1,0.596300,0.549857,0.718768,0.741876
2,0.596300,0.538589,0.730426,0.729609
3,0.525900,0.539909,0.731146,0.728014


***** Running Evaluation *****
  Num examples = 6948
  Batch size = 32
***** Running Evaluation *****
  Num examples = 6948
  Batch size = 32
***** Running Evaluation *****
  Num examples = 6948
  Batch size = 32
***** Running Evaluation *****
  Num examples = 6948
  Batch size = 32


Training completed. Do not forget to share your model on huggingface.co/models =)


loading configuration file https://huggingface.co/simecek/DNADebertaK6/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/290389faf61feca163161b441f1533d162d3196a609b88b234318767c64960f8.c7f87ccb839059561ef6996c9f8559644ce254107c992b74bd0b46fbe8c80d51
Model config DebertaConfig {
  "_name_or_path": "simecek/DNADebertaK6",
  "architectures": [
    "DebertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-07,
  "max_position_embeddings": 

Epoch,Training Loss,Validation Loss,Accuracy,F1
0,No log,0.565517,0.704807,0.678778
1,0.596600,0.544129,0.726108,0.738275
2,0.596600,0.539774,0.734024,0.726627
3,0.524300,0.538885,0.734744,0.734247


***** Running Evaluation *****
  Num examples = 6948
  Batch size = 32
***** Running Evaluation *****
  Num examples = 6948
  Batch size = 32
***** Running Evaluation *****
  Num examples = 6948
  Batch size = 32
***** Running Evaluation *****
  Num examples = 6948
  Batch size = 32


Training completed. Do not forget to share your model on huggingface.co/models =)


loading configuration file https://huggingface.co/simecek/DNADebertaK6/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/290389faf61feca163161b441f1533d162d3196a609b88b234318767c64960f8.c7f87ccb839059561ef6996c9f8559644ce254107c992b74bd0b46fbe8c80d51
Model config DebertaConfig {
  "_name_or_path": "simecek/DNADebertaK6",
  "architectures": [
    "DebertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-07,
  "max_position_embeddings": 

Epoch,Training Loss,Validation Loss,Accuracy,F1
0,No log,0.57046,0.698618,0.667513
1,0.596100,0.551533,0.719488,0.745661
2,0.596100,0.538059,0.734744,0.731849
3,0.525100,0.539233,0.734888,0.732501


***** Running Evaluation *****
  Num examples = 6948
  Batch size = 32
***** Running Evaluation *****
  Num examples = 6948
  Batch size = 32
***** Running Evaluation *****
  Num examples = 6948
  Batch size = 32
***** Running Evaluation *****
  Num examples = 6948
  Batch size = 32


Training completed. Do not forget to share your model on huggingface.co/models =)




  0%|          | 0/36131 [00:00<?, ?ex/s]

  0%|          | 0/37 [00:00<?, ?ba/s]

  0%|          | 0/37 [00:00<?, ?ba/s]

loading configuration file https://huggingface.co/simecek/DNADebertaK6/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/290389faf61feca163161b441f1533d162d3196a609b88b234318767c64960f8.c7f87ccb839059561ef6996c9f8559644ce254107c992b74bd0b46fbe8c80d51
Model config DebertaConfig {
  "_name_or_path": "simecek/DNADebertaK6",
  "architectures": [
    "DebertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-07,
  "max_position_embeddings": 512,
  "max_relative_positions": -1,
  "model_type": "deberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 6,
  "pad_token_id": 0,
  "pooler_dropout": 0,
  "pooler_hidden_act": "gelu",
  "pooler_hidden_size": 768,
  "pos_att_type": null,
  "position_biased_input": true,
  "relative_attention": false,
  "torch_dtype": "float32",
  "transformers_version": "4.20

Epoch,Training Loss,Validation Loss,Accuracy,F1
0,No log,0.409833,0.82422,0.835201
1,0.480300,0.376825,0.838831,0.833676
2,0.358200,0.331821,0.861855,0.863517
3,0.319800,0.328455,0.862851,0.868009


***** Running Evaluation *****
  Num examples = 9034
  Batch size = 32
***** Running Evaluation *****
  Num examples = 9034
  Batch size = 32
***** Running Evaluation *****
  Num examples = 9034
  Batch size = 32
***** Running Evaluation *****
  Num examples = 9034
  Batch size = 32


Training completed. Do not forget to share your model on huggingface.co/models =)


loading configuration file https://huggingface.co/simecek/DNADebertaK6/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/290389faf61feca163161b441f1533d162d3196a609b88b234318767c64960f8.c7f87ccb839059561ef6996c9f8559644ce254107c992b74bd0b46fbe8c80d51
Model config DebertaConfig {
  "_name_or_path": "simecek/DNADebertaK6",
  "architectures": [
    "DebertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-07,
  "max_position_embeddings": 

Epoch,Training Loss,Validation Loss,Accuracy,F1
0,No log,0.449582,0.784924,0.816646
1,0.479400,0.355066,0.851671,0.857629
2,0.361200,0.331487,0.861745,0.868263
3,0.315000,0.329101,0.863294,0.870068


***** Running Evaluation *****
  Num examples = 9034
  Batch size = 32
***** Running Evaluation *****
  Num examples = 9034
  Batch size = 32
***** Running Evaluation *****
  Num examples = 9034
  Batch size = 32
***** Running Evaluation *****
  Num examples = 9034
  Batch size = 32


Training completed. Do not forget to share your model on huggingface.co/models =)


loading configuration file https://huggingface.co/simecek/DNADebertaK6/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/290389faf61feca163161b441f1533d162d3196a609b88b234318767c64960f8.c7f87ccb839059561ef6996c9f8559644ce254107c992b74bd0b46fbe8c80d51
Model config DebertaConfig {
  "_name_or_path": "simecek/DNADebertaK6",
  "architectures": [
    "DebertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-07,
  "max_position_embeddings": 

Epoch,Training Loss,Validation Loss,Accuracy,F1
0,No log,0.407506,0.823445,0.821048
1,0.476500,0.349218,0.854992,0.861229
2,0.356600,0.328708,0.861523,0.868496
3,0.313800,0.325985,0.864512,0.870476


***** Running Evaluation *****
  Num examples = 9034
  Batch size = 32
***** Running Evaluation *****
  Num examples = 9034
  Batch size = 32
***** Running Evaluation *****
  Num examples = 9034
  Batch size = 32
***** Running Evaluation *****
  Num examples = 9034
  Batch size = 32


Training completed. Do not forget to share your model on huggingface.co/models =)


loading configuration file https://huggingface.co/simecek/DNADebertaK6/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/290389faf61feca163161b441f1533d162d3196a609b88b234318767c64960f8.c7f87ccb839059561ef6996c9f8559644ce254107c992b74bd0b46fbe8c80d51
Model config DebertaConfig {
  "_name_or_path": "simecek/DNADebertaK6",
  "architectures": [
    "DebertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-07,
  "max_position_embeddings": 

Epoch,Training Loss,Validation Loss,Accuracy,F1
0,No log,0.399097,0.827872,0.830923
1,0.477700,0.354514,0.850454,0.853487
2,0.357800,0.349729,0.851229,0.863857
3,0.316200,0.330999,0.861966,0.868502


***** Running Evaluation *****
  Num examples = 9034
  Batch size = 32
***** Running Evaluation *****
  Num examples = 9034
  Batch size = 32
***** Running Evaluation *****
  Num examples = 9034
  Batch size = 32
***** Running Evaluation *****
  Num examples = 9034
  Batch size = 32


Training completed. Do not forget to share your model on huggingface.co/models =)


loading configuration file https://huggingface.co/simecek/DNADebertaK6/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/290389faf61feca163161b441f1533d162d3196a609b88b234318767c64960f8.c7f87ccb839059561ef6996c9f8559644ce254107c992b74bd0b46fbe8c80d51
Model config DebertaConfig {
  "_name_or_path": "simecek/DNADebertaK6",
  "architectures": [
    "DebertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-07,
  "max_position_embeddings": 

Epoch,Training Loss,Validation Loss,Accuracy,F1
0,No log,0.404153,0.831415,0.834403
1,0.482000,0.350406,0.850343,0.853458
2,0.361900,0.329773,0.860638,0.868264
3,0.316200,0.328968,0.862077,0.870019


***** Running Evaluation *****
  Num examples = 9034
  Batch size = 32
***** Running Evaluation *****
  Num examples = 9034
  Batch size = 32
***** Running Evaluation *****
  Num examples = 9034
  Batch size = 32
***** Running Evaluation *****
  Num examples = 9034
  Batch size = 32


Training completed. Do not forget to share your model on huggingface.co/models =)




## Outputs

In [None]:
# Build the per-run results table from the tuples collected by the training loop
# and display it.
outputs_df = pd.DataFrame(
    data=outputs,
    columns=['dataset', 'accuracy', 'f1', 'train_runtime'],
)
outputs_df

Unnamed: 0,dataset,accuracy,f1,train_runtime
0,demo_coding_vs_intergenomic_seqs,0.9048,0.905175,921.0052
1,demo_coding_vs_intergenomic_seqs,0.90548,0.904997,921.1745
2,demo_coding_vs_intergenomic_seqs,0.9036,0.904029,923.4556
3,demo_coding_vs_intergenomic_seqs,0.90512,0.904922,927.2819
4,demo_coding_vs_intergenomic_seqs,0.90468,0.905978,933.9786
5,demo_human_or_worm,0.9586,0.958475,940.3017
6,demo_human_or_worm,0.95812,0.958265,934.8047
7,demo_human_or_worm,0.95824,0.958273,937.6455
8,demo_human_or_worm,0.95816,0.958213,936.3577
9,demo_human_or_worm,0.95844,0.958432,937.9097


In [None]:
# Per-dataset mean and standard error of the mean (across runs) of every recorded quantity.
outputs_df.groupby('dataset').agg(
    {column: ['mean', 'sem'] for column in ('accuracy', 'f1', 'train_runtime')}
)

Unnamed: 0_level_0,accuracy,accuracy,f1,f1,train_runtime,train_runtime
Unnamed: 0_level_1,mean,sem,mean,sem,mean,sem
dataset,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
demo_coding_vs_intergenomic_seqs,0.904736,0.000316,0.90502,0.000311,925.37916,2.429831
demo_human_or_worm,0.958312,9.1e-05,0.958332,5.1e-05,937.40386,0.909995
human_enhancers_cohn,0.73365,0.000709,0.740223,0.002654,703.68634,1.006746
human_nontata_promoters,0.86294,0.000464,0.869415,0.000486,341.54068,1.519215


In [None]:
# Write the results table to OUTPUT_PATH as CSV without the index column
# (the training loop already rewrites this file after every run).
outputs_df.to_csv(OUTPUT_PATH, index=False)