In [1]:
!pip install -qq transformers genomic-benchmarks datasets

[K     |████████████████████████████████| 4.4 MB 15.2 MB/s 
[K     |████████████████████████████████| 362 kB 58.4 MB/s 
[K     |████████████████████████████████| 101 kB 13.7 MB/s 
[K     |████████████████████████████████| 6.6 MB 51.2 MB/s 
[K     |████████████████████████████████| 596 kB 46.6 MB/s 
[K     |████████████████████████████████| 2.3 MB 44.6 MB/s 
[K     |████████████████████████████████| 271 kB 15.3 MB/s 
[K     |████████████████████████████████| 140 kB 49.6 MB/s 
[K     |████████████████████████████████| 1.1 MB 48.2 MB/s 
[K     |████████████████████████████████| 212 kB 32.2 MB/s 
[K     |████████████████████████████████| 127 kB 48.4 MB/s 
[K     |████████████████████████████████| 94 kB 1.0 MB/s 
[K     |████████████████████████████████| 144 kB 43.9 MB/s 
[?25h  Building wheel for genomic-benchmarks (setup.py) ... [?25l[?25hdone
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is t

In [2]:
# Mount Google Drive at /content/drive (Colab-only import). The results CSV
# configured as OUTPUT_PATH in the parameters cell is written into the Drive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [1]:
### Parameters
# Hub ids of the checkpoint to fine-tune and of the base tokenizer.
# The tokenizer vocabulary is extended with all K-mers in a later cell.
MODEL_NAME = "davidcechak/DNADebertaK8" # DeBERTa-type checkpoint (its config reports model_type "deberta"), not the original DNABert
TOKENIZER_NAME = "armheb/DNA_bert_6"
# K: length of the k-mers a sequence is split into before tokenization.
# STRIDE: step between consecutive k-mers; 1 gives overlapping windows.
K = 8
STRIDE = 1

# if less than 1, only this fraction of each dataset is used
# (each sequence is kept independently at random, so the fraction is approximate;
#  note the identifier is spelled "THINING" and is referenced by that name below)
DATASET_THINING = 1

# Directory the benchmark sequences are read from (one sub-folder per dataset).
BENCHMARKS_FOLDER = '/root/.genomic_benchmarks'
# BENCHMARKS_FOLDER = '/home/jovyan/.genomic_benchmarks/' (for INFRA HUB)

# (dataset name, dataset version) pairs; every dataset is downloaded and fine-tuned on in turn.
DATASETS = [('demo_coding_vs_intergenomic_seqs', 0),
 ('demo_human_or_worm', 0), ('human_enhancers_cohn', 0), ('human_nontata_promoters', 0)]

# Per-device training batch size (evaluation uses twice this value) and the
# number of gradient-accumulation steps, i.e. BATCH_SIZE * ACCUMULATION = 64
# sequences per optimizer step on one device.
BATCH_SIZE = 4
ACCUMULATION = 16

# Optimizer learning rate, epochs per fine-tuning run, and number of
# independent fine-tuning runs per dataset.
LEARNING_RATE = 1e-5
EPOCHS = 4
RUNS = 5

# do not forget to attach drive
# (the path is relative: Drive is mounted at /content/drive, so this assumes
#  the notebook's working directory is /content -- TODO confirm outside Colab)
OUTPUT_PATH = 'drive/MyDrive/genomic_benchmarks/RandomizedDNADebertaK8_2.csv'

## Download benchmark datasets and tokenizer

In [2]:
from transformers import AutoTokenizer

# Load the base tokenizer named by TOKENIZER_NAME; the K-mer tokens used by
# this notebook are added to it in the next cell.
tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_NAME)

In [3]:
from itertools import product

# Enumerate every possible K-mer over the nucleotide alphabet (4**K strings,
# in itertools.product order) and register them as tokenizer tokens.
alphabet = ('A', 'C', 'T', 'G')
vocab = [''.join(letters) for letters in product(alphabet, repeat=K)]

# add_tokens returns the number of tokens that were actually added; leaving it
# as the last expression displays that count as the cell output.
tokenizer.add_tokens(vocab)

65536

In [4]:
from genomic_benchmarks.loc2seq import download_dataset
from genomic_benchmarks.data_check.info import is_downloaded
from pathlib import Path
from tqdm.autonotebook import tqdm

# Local directory the training loop reads the benchmark files from.
benchmark_root = Path(BENCHMARKS_FOLDER)

# Fetch every configured benchmark that is not reported as downloaded yet.
for dataset_name, dataset_version in tqdm(DATASETS):
    if is_downloaded(dataset_name):
        continue
    download_dataset(dataset_name, version=dataset_version, use_cloud_cache=True)

  0%|          | 0/4 [00:00<?, ?it/s]

In [5]:
def kmers_strideK(s, k=K):
    """Split `s` into consecutive, non-overlapping k-mers.

    Windows start at 0, k, 2k, ...; a trailing window shorter than `k`
    is dropped, so every returned piece has exactly `k` characters.
    """
    pieces = []
    for start in range(0, len(s), k):
        stop = start + k
        if stop > len(s):
            # incomplete window at the end of the sequence
            continue
        pieces.append(s[start:stop])
    return pieces

def kmers_stride1(s, k=K):
    """Return every overlapping k-mer of `s`, sliding the window one character at a time.

    A sequence of length n yields n - k + 1 k-mers (none when n < k).
    """
    window_count = len(s) - k + 1
    return [s[start:start + k] for start in range(window_count)]

# Select the k-mer splitter that matches STRIDE. Only the two implemented
# strides are accepted: previously any value other than 1 silently fell back
# to the stride-K splitter, so a misconfigured STRIDE went unnoticed.
if STRIDE == 1:
    kmers = kmers_stride1
elif STRIDE == K:
    kmers = kmers_strideK
else:
    raise ValueError(
        f"Unsupported STRIDE={STRIDE!r}: expected 1 (overlapping k-mers) "
        f"or K={K!r} (non-overlapping k-mers)"
    )

# function used for the actual tokenization
def tok_func(x):
    """Tokenize one record: split x["seq"] into k-mers, join them with
    single spaces and pass the resulting string to the tokenizer."""
    kmer_sentence = " ".join(kmers(x["seq"]))
    return tokenizer(kmer_sentence)

# example
tok_func({'seq': 'ATGGAAAGAGGCACCATTCT'})

{'input_ids': [2, 16136, 52241, 65592, 53460, 4930, 7417, 17366, 57162, 19737, 66647, 57679, 21806, 9387, 3], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

## Looping through datasets, fine-tuning the model for each of them, logging metrics

In [6]:
import pandas as pd
import numpy as np
from random import random, randrange
from transformers import AutoModelForSequenceClassification
from transformers import TrainingArguments, Trainer
from datasets import Dataset, DatasetDict, load_metric

def compute_metrics_binary(eval_preds):
    """Compute accuracy and F1 for a two-class problem.

    Replaces the per-call `load_metric("glue", "mrpc")` lookup: that API is
    deprecated in `datasets` and was re-loaded on every evaluation pass.
    The values are computed directly with numpy instead, using the usual
    definitions (accuracy = fraction of correct predictions, F1 of the
    positive class = 2*TP / (2*TP + FP + FN)).

    Parameters
    ----------
    eval_preds : pair (logits, labels)
        `logits` holds one row of class scores per example; `labels` holds
        the reference class ids (class 1 is treated as the positive class).

    Returns
    -------
    dict
        {"accuracy": float, "f1": float} -- the same keys the training loop
        later reads back as 'eval_accuracy' and 'eval_f1'. F1 is 0.0 when
        there are no true positives, false positives or false negatives.
    """
    logits, labels = eval_preds
    predictions = np.argmax(logits, axis=-1)
    labels = np.asarray(labels)

    accuracy = float(np.mean(predictions == labels))

    predicted_pos = predictions == 1
    actual_pos = labels == 1
    true_pos = int(np.sum(predicted_pos & actual_pos))
    false_pos = int(np.sum(predicted_pos & ~actual_pos))
    false_neg = int(np.sum(~predicted_pos & actual_pos))

    # Guard the degenerate case (no positives anywhere) instead of dividing by zero.
    denominator = 2 * true_pos + false_pos + false_neg
    f1 = 2 * true_pos / denominator if denominator else 0.0

    return {"accuracy": accuracy, "f1": f1}

def compute_metrics_multi(eval_preds):
    """Compute plain accuracy for a multi-class problem.

    Replaces the per-call `load_metric("accuracy")` lookup: that API is
    deprecated in `datasets` and was re-loaded on every evaluation pass.
    Accuracy is computed directly with numpy as the fraction of examples
    whose arg-max class equals the reference label.

    Parameters
    ----------
    eval_preds : pair (logits, labels)
        `logits` holds one row of class scores per example; `labels` holds
        the reference class ids.

    Returns
    -------
    dict
        {"accuracy": float} -- the key the training loop later reads back
        as 'eval_accuracy'.
    """
    logits, labels = eval_preds
    predictions = np.argmax(logits, axis=-1)
    accuracy = float(np.mean(predictions == np.asarray(labels)))
    return {"accuracy": accuracy}

# One (dataset, best accuracy, best f1, train runtime) tuple per fine-tuning run.
outputs = []

for dataset_name, dataset_version in tqdm(DATASETS):

    # Class names are the entries of the train split directory; sorting them
    # fixes the mapping from class name to integer label.
    labels = sorted([x.stem for x in (benchmark_root / dataset_name / 'train').iterdir()])

    # Thinning is off when the fraction is falsy (0 / None) or exactly 1.
    keep_all = not DATASET_THINING or DATASET_THINING == 1

    tmp_dict = {}

    for split in ['train', 'test']:
        for nlabel, label in enumerate(labels):
            for f in (benchmark_root / dataset_name / split / label).glob('*.txt'):
                # random() is only drawn when thinning is active (short-circuit),
                # and a file is only read once it is known to be kept.
                if keep_all or random() < DATASET_THINING:
                    # The split is part of the key: with only label + file stem,
                    # a train file and a test file sharing the same stem would
                    # silently overwrite each other.
                    tmp_dict[f"{split} {label} {f.stem}"] = (split, nlabel, f.read_text())

    # Rows = sequences, columns = split name, integer label, raw sequence text.
    df = pd.DataFrame.from_dict(tmp_dict).T.rename(columns = {0: "dset", 1: "cat", 2: "seq"})

    ds = Dataset.from_pandas(df)

    # Tokenize every sequence; the string index and the raw sequence are dropped afterwards.
    tok_ds = ds.map(tok_func, batched=False, remove_columns=['__index_level_0__', 'seq'])
    tok_ds = tok_ds.rename_columns({'cat':'labels'})

    dds = DatasetDict({
        'train': tok_ds.filter(lambda x: x["dset"] == "train").remove_columns('dset'),
        'test':  tok_ds.filter(lambda x: x["dset"] == "test").remove_columns('dset')
    })

    # F1 is only reported for two-class datasets.
    compute_metrics = compute_metrics_binary if len(labels) == 2 else compute_metrics_multi

    for _ in range(RUNS):

        model_cls = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=len(labels))
        # Apply the model's own weight initialiser to every module under
        # deberta.encoder.layer, i.e. the loaded encoder-layer weights are
        # replaced before fine-tuning (presumably the "Randomized" baseline
        # named in OUTPUT_PATH). Modules outside encoder.layer are not touched.
        model_cls.deberta.encoder.layer.apply(model_cls.deberta._init_weights)

        # A fresh random seed per run; the seed itself is not stored in the results.
        args = TrainingArguments('outputs', learning_rate=LEARNING_RATE, warmup_ratio=0.1, lr_scheduler_type='cosine', fp16=True,
            evaluation_strategy="epoch", per_device_train_batch_size=BATCH_SIZE, per_device_eval_batch_size=BATCH_SIZE*2, gradient_accumulation_steps=ACCUMULATION,
            num_train_epochs=EPOCHS, weight_decay=0.01, save_steps=100000, seed=randrange(1,10001), report_to='none')

        trainer = Trainer(model_cls, args, train_dataset=dds['train'], eval_dataset=dds['test'],
                          tokenizer=tokenizer, compute_metrics=compute_metrics)
        trainer.train()

        # NOTE(review): the best epoch is selected on the same 'test' split that
        # is reported, so these maxima are optimistic estimates.
        history = trainer.state.log_history
        max_accuracy = max([x['eval_accuracy'] for x in history if 'eval_accuracy' in x])
        max_f1 = max([x['eval_f1'] for x in history if 'eval_f1' in x]) if len(labels) == 2 else np.nan
        train_runtime = max([x['train_runtime'] for x in history if 'train_runtime' in x])

        outputs.append((dataset_name, max_accuracy, max_f1, train_runtime))
        # Rewrite the CSV after every run so finished results survive an interrupted session.
        outputs_df = pd.DataFrame(outputs, columns = ['dataset', 'accuracy', 'f1', 'train_runtime'])
        outputs_df.to_csv(OUTPUT_PATH, index=False)




  0%|          | 0/4 [00:00<?, ?it/s]



  0%|          | 0/100000 [00:00<?, ?ex/s]

  0%|          | 0/100 [00:00<?, ?ba/s]

  0%|          | 0/100 [00:00<?, ?ba/s]

Some weights of the model checkpoint at davidcechak/DNADebertaK8 were not used when initializing DebertaForSequenceClassification: ['cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing DebertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DebertaForSequenceClassification were not initialized from the model checkpoint at davidcec

Epoch,Training Loss,Validation Loss,Accuracy,F1
0,0.2541,0.236402,0.90736,0.905136
1,0.1934,0.231773,0.91244,0.912212
2,0.1372,0.257816,0.91268,0.912593
3,0.1072,0.285791,0.91248,0.91269


***** Running Evaluation *****
  Num examples = 25000
  Batch size = 8
***** Running Evaluation *****
  Num examples = 25000
  Batch size = 8
***** Running Evaluation *****
  Num examples = 25000
  Batch size = 8
***** Running Evaluation *****
  Num examples = 25000
  Batch size = 8


Training completed. Do not forget to share your model on huggingface.co/models =)


loading configuration file https://huggingface.co/davidcechak/DNADebertaK8/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/808901853890ddb7e1ac8c4ecf5112272702596ee51b69cdf389c63da11fe2e9.b628e4d5e7f344695b21fd7ca542aa0f8188268bc6c6b3380dc1e19a16c2447e
Model config DebertaConfig {
  "_name_or_path": "davidcechak/DNADebertaK8",
  "architectures": [
    "DebertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-07,
  "max_position_embed

Epoch,Training Loss,Validation Loss,Accuracy,F1
0,0.2603,0.233404,0.9066,0.904783
1,0.1957,0.236802,0.91284,0.914317
2,0.139,0.242161,0.9128,0.912981
3,0.1091,0.287509,0.91288,0.912255


***** Running Evaluation *****
  Num examples = 25000
  Batch size = 8
***** Running Evaluation *****
  Num examples = 25000
  Batch size = 8
***** Running Evaluation *****
  Num examples = 25000
  Batch size = 8
***** Running Evaluation *****
  Num examples = 25000
  Batch size = 8


Training completed. Do not forget to share your model on huggingface.co/models =)


loading configuration file https://huggingface.co/davidcechak/DNADebertaK8/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/808901853890ddb7e1ac8c4ecf5112272702596ee51b69cdf389c63da11fe2e9.b628e4d5e7f344695b21fd7ca542aa0f8188268bc6c6b3380dc1e19a16c2447e
Model config DebertaConfig {
  "_name_or_path": "davidcechak/DNADebertaK8",
  "architectures": [
    "DebertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-07,
  "max_position_embed

Epoch,Training Loss,Validation Loss,Accuracy,F1
0,0.2585,0.245665,0.90256,0.898475
1,0.1958,0.228429,0.91204,0.912762
2,0.1351,0.266755,0.91044,0.90872
3,0.1086,0.283734,0.91208,0.911805


***** Running Evaluation *****
  Num examples = 25000
  Batch size = 8
***** Running Evaluation *****
  Num examples = 25000
  Batch size = 8
***** Running Evaluation *****
  Num examples = 25000
  Batch size = 8
***** Running Evaluation *****
  Num examples = 25000
  Batch size = 8


Training completed. Do not forget to share your model on huggingface.co/models =)


loading configuration file https://huggingface.co/davidcechak/DNADebertaK8/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/808901853890ddb7e1ac8c4ecf5112272702596ee51b69cdf389c63da11fe2e9.b628e4d5e7f344695b21fd7ca542aa0f8188268bc6c6b3380dc1e19a16c2447e
Model config DebertaConfig {
  "_name_or_path": "davidcechak/DNADebertaK8",
  "architectures": [
    "DebertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-07,
  "max_position_embed

Epoch,Training Loss,Validation Loss,Accuracy,F1
0,0.2574,0.268384,0.89092,0.897608
1,0.195,0.238305,0.9088,0.905972
2,0.1373,0.256937,0.9124,0.91324
3,0.1076,0.281612,0.91096,0.910531


***** Running Evaluation *****
  Num examples = 25000
  Batch size = 8
***** Running Evaluation *****
  Num examples = 25000
  Batch size = 8
***** Running Evaluation *****
  Num examples = 25000
  Batch size = 8
***** Running Evaluation *****
  Num examples = 25000
  Batch size = 8


Training completed. Do not forget to share your model on huggingface.co/models =)


loading configuration file https://huggingface.co/davidcechak/DNADebertaK8/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/808901853890ddb7e1ac8c4ecf5112272702596ee51b69cdf389c63da11fe2e9.b628e4d5e7f344695b21fd7ca542aa0f8188268bc6c6b3380dc1e19a16c2447e
Model config DebertaConfig {
  "_name_or_path": "davidcechak/DNADebertaK8",
  "architectures": [
    "DebertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-07,
  "max_position_embed

Epoch,Training Loss,Validation Loss,Accuracy,F1
0,0.2597,0.237007,0.90844,0.907114
1,0.1923,0.245763,0.90392,0.899632
2,0.1389,0.250763,0.91152,0.9112
3,0.1077,0.286657,0.91196,0.911189


***** Running Evaluation *****
  Num examples = 25000
  Batch size = 8
***** Running Evaluation *****
  Num examples = 25000
  Batch size = 8
***** Running Evaluation *****
  Num examples = 25000
  Batch size = 8
***** Running Evaluation *****
  Num examples = 25000
  Batch size = 8


Training completed. Do not forget to share your model on huggingface.co/models =)




  0%|          | 0/100000 [00:00<?, ?ex/s]

  0%|          | 0/100 [00:00<?, ?ba/s]

  0%|          | 0/100 [00:00<?, ?ba/s]

loading configuration file https://huggingface.co/davidcechak/DNADebertaK8/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/808901853890ddb7e1ac8c4ecf5112272702596ee51b69cdf389c63da11fe2e9.b628e4d5e7f344695b21fd7ca542aa0f8188268bc6c6b3380dc1e19a16c2447e
Model config DebertaConfig {
  "_name_or_path": "davidcechak/DNADebertaK8",
  "architectures": [
    "DebertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-07,
  "max_position_embeddings": 512,
  "max_relative_positions": -1,
  "model_type": "deberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 6,
  "pad_token_id": 0,
  "pooler_dropout": 0,
  "pooler_hidden_act": "gelu",
  "pooler_hidden_size": 768,
  "pos_att_type": null,
  "position_biased_input": true,
  "relative_attention": false,
  "torch_dtype": "float32",
  "transformers_version

Epoch,Training Loss,Validation Loss,Accuracy,F1
0,0.1512,0.1201,0.95564,0.955212
1,0.0921,0.113416,0.96008,0.959713
2,0.0505,0.149844,0.9584,0.958919
3,0.0319,0.161832,0.95884,0.959136


***** Running Evaluation *****
  Num examples = 25000
  Batch size = 8
***** Running Evaluation *****
  Num examples = 25000
  Batch size = 8
***** Running Evaluation *****
  Num examples = 25000
  Batch size = 8
***** Running Evaluation *****
  Num examples = 25000
  Batch size = 8


Training completed. Do not forget to share your model on huggingface.co/models =)


loading configuration file https://huggingface.co/davidcechak/DNADebertaK8/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/808901853890ddb7e1ac8c4ecf5112272702596ee51b69cdf389c63da11fe2e9.b628e4d5e7f344695b21fd7ca542aa0f8188268bc6c6b3380dc1e19a16c2447e
Model config DebertaConfig {
  "_name_or_path": "davidcechak/DNADebertaK8",
  "architectures": [
    "DebertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-07,
  "max_position_embed

Epoch,Training Loss,Validation Loss,Accuracy,F1
0,0.147,0.12556,0.95384,0.954538
1,0.0916,0.11033,0.96028,0.960259
2,0.0486,0.140958,0.95904,0.958819
3,0.033,0.164166,0.95852,0.958785


***** Running Evaluation *****
  Num examples = 25000
  Batch size = 8
***** Running Evaluation *****
  Num examples = 25000
  Batch size = 8
***** Running Evaluation *****
  Num examples = 25000
  Batch size = 8
***** Running Evaluation *****
  Num examples = 25000
  Batch size = 8


Training completed. Do not forget to share your model on huggingface.co/models =)


loading configuration file https://huggingface.co/davidcechak/DNADebertaK8/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/808901853890ddb7e1ac8c4ecf5112272702596ee51b69cdf389c63da11fe2e9.b628e4d5e7f344695b21fd7ca542aa0f8188268bc6c6b3380dc1e19a16c2447e
Model config DebertaConfig {
  "_name_or_path": "davidcechak/DNADebertaK8",
  "architectures": [
    "DebertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-07,
  "max_position_embed

Epoch,Training Loss,Validation Loss,Accuracy,F1
0,0.1507,0.131832,0.95352,0.95249
1,0.0914,0.109422,0.96028,0.960167
2,0.0517,0.145973,0.95936,0.95889
3,0.0319,0.162682,0.96036,0.960535


***** Running Evaluation *****
  Num examples = 25000
  Batch size = 8
***** Running Evaluation *****
  Num examples = 25000
  Batch size = 8
***** Running Evaluation *****
  Num examples = 25000
  Batch size = 8
***** Running Evaluation *****
  Num examples = 25000
  Batch size = 8


Training completed. Do not forget to share your model on huggingface.co/models =)


loading configuration file https://huggingface.co/davidcechak/DNADebertaK8/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/808901853890ddb7e1ac8c4ecf5112272702596ee51b69cdf389c63da11fe2e9.b628e4d5e7f344695b21fd7ca542aa0f8188268bc6c6b3380dc1e19a16c2447e
Model config DebertaConfig {
  "_name_or_path": "davidcechak/DNADebertaK8",
  "architectures": [
    "DebertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-07,
  "max_position_embed

Epoch,Training Loss,Validation Loss,Accuracy,F1
0,0.1471,0.129434,0.95312,0.952007
1,0.0937,0.123094,0.95832,0.958716
2,0.0534,0.136228,0.9608,0.9608
3,0.0312,0.158834,0.95992,0.960169


***** Running Evaluation *****
  Num examples = 25000
  Batch size = 8
***** Running Evaluation *****
  Num examples = 25000
  Batch size = 8
***** Running Evaluation *****
  Num examples = 25000
  Batch size = 8
***** Running Evaluation *****
  Num examples = 25000
  Batch size = 8


Training completed. Do not forget to share your model on huggingface.co/models =)


loading configuration file https://huggingface.co/davidcechak/DNADebertaK8/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/808901853890ddb7e1ac8c4ecf5112272702596ee51b69cdf389c63da11fe2e9.b628e4d5e7f344695b21fd7ca542aa0f8188268bc6c6b3380dc1e19a16c2447e
Model config DebertaConfig {
  "_name_or_path": "davidcechak/DNADebertaK8",
  "architectures": [
    "DebertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-07,
  "max_position_embed

Epoch,Training Loss,Validation Loss,Accuracy,F1
0,0.1493,0.127767,0.95332,0.954147
1,0.0896,0.112777,0.96052,0.9606
2,0.0513,0.133843,0.9604,0.960526
3,0.0299,0.163672,0.96,0.960245


***** Running Evaluation *****
  Num examples = 25000
  Batch size = 8
***** Running Evaluation *****
  Num examples = 25000
  Batch size = 8
***** Running Evaluation *****
  Num examples = 25000
  Batch size = 8
***** Running Evaluation *****
  Num examples = 25000
  Batch size = 8


Training completed. Do not forget to share your model on huggingface.co/models =)




  0%|          | 0/27791 [00:00<?, ?ex/s]

  0%|          | 0/28 [00:00<?, ?ba/s]

  0%|          | 0/28 [00:00<?, ?ba/s]

loading configuration file https://huggingface.co/davidcechak/DNADebertaK8/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/808901853890ddb7e1ac8c4ecf5112272702596ee51b69cdf389c63da11fe2e9.b628e4d5e7f344695b21fd7ca542aa0f8188268bc6c6b3380dc1e19a16c2447e
Model config DebertaConfig {
  "_name_or_path": "davidcechak/DNADebertaK8",
  "architectures": [
    "DebertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-07,
  "max_position_embeddings": 512,
  "max_relative_positions": -1,
  "model_type": "deberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 6,
  "pad_token_id": 0,
  "pooler_dropout": 0,
  "pooler_hidden_act": "gelu",
  "pooler_hidden_size": 768,
  "pos_att_type": null,
  "position_biased_input": true,
  "relative_attention": false,
  "torch_dtype": "float32",
  "transformers_version

Epoch,Training Loss,Validation Loss,Accuracy,F1
0,No log,0.55096,0.719488,0.737861
1,0.580900,0.538112,0.732585,0.743653
2,0.580900,0.560359,0.729562,0.741078
3,0.463100,0.569668,0.73532,0.736041


***** Running Evaluation *****
  Num examples = 6948
  Batch size = 8
***** Running Evaluation *****
  Num examples = 6948
  Batch size = 8
***** Running Evaluation *****
  Num examples = 6948
  Batch size = 8
***** Running Evaluation *****
  Num examples = 6948
  Batch size = 8


Training completed. Do not forget to share your model on huggingface.co/models =)


loading configuration file https://huggingface.co/davidcechak/DNADebertaK8/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/808901853890ddb7e1ac8c4ecf5112272702596ee51b69cdf389c63da11fe2e9.b628e4d5e7f344695b21fd7ca542aa0f8188268bc6c6b3380dc1e19a16c2447e
Model config DebertaConfig {
  "_name_or_path": "davidcechak/DNADebertaK8",
  "architectures": [
    "DebertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-07,
  "max_position_embedding

Epoch,Training Loss,Validation Loss,Accuracy,F1
0,No log,0.557647,0.711428,0.741724
1,0.581000,0.573192,0.705815,0.745581
2,0.581000,0.564306,0.733736,0.719357
3,0.463000,0.567453,0.730858,0.73736


***** Running Evaluation *****
  Num examples = 6948
  Batch size = 8
***** Running Evaluation *****
  Num examples = 6948
  Batch size = 8
***** Running Evaluation *****
  Num examples = 6948
  Batch size = 8
***** Running Evaluation *****
  Num examples = 6948
  Batch size = 8


Training completed. Do not forget to share your model on huggingface.co/models =)


loading configuration file https://huggingface.co/davidcechak/DNADebertaK8/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/808901853890ddb7e1ac8c4ecf5112272702596ee51b69cdf389c63da11fe2e9.b628e4d5e7f344695b21fd7ca542aa0f8188268bc6c6b3380dc1e19a16c2447e
Model config DebertaConfig {
  "_name_or_path": "davidcechak/DNADebertaK8",
  "architectures": [
    "DebertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-07,
  "max_position_embedding

Epoch,Training Loss,Validation Loss,Accuracy,F1
0,No log,0.546072,0.723374,0.737073
1,0.579000,0.537366,0.730714,0.747333
2,0.579000,0.554804,0.738198,0.73183
3,0.463100,0.566411,0.733736,0.737216


***** Running Evaluation *****
  Num examples = 6948
  Batch size = 8
***** Running Evaluation *****
  Num examples = 6948
  Batch size = 8
***** Running Evaluation *****
  Num examples = 6948
  Batch size = 8
***** Running Evaluation *****
  Num examples = 6948
  Batch size = 8


Training completed. Do not forget to share your model on huggingface.co/models =)


loading configuration file https://huggingface.co/davidcechak/DNADebertaK8/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/808901853890ddb7e1ac8c4ecf5112272702596ee51b69cdf389c63da11fe2e9.b628e4d5e7f344695b21fd7ca542aa0f8188268bc6c6b3380dc1e19a16c2447e
Model config DebertaConfig {
  "_name_or_path": "davidcechak/DNADebertaK8",
  "architectures": [
    "DebertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-07,
  "max_position_embedding

Epoch,Training Loss,Validation Loss,Accuracy,F1
0,No log,0.544783,0.720063,0.714516
1,0.581500,0.5427,0.736903,0.717988
2,0.581500,0.557733,0.737334,0.729509
3,0.460800,0.564861,0.733448,0.734594


***** Running Evaluation *****
  Num examples = 6948
  Batch size = 8
***** Running Evaluation *****
  Num examples = 6948
  Batch size = 8
***** Running Evaluation *****
  Num examples = 6948
  Batch size = 8
***** Running Evaluation *****
  Num examples = 6948
  Batch size = 8


Training completed. Do not forget to share your model on huggingface.co/models =)


loading configuration file https://huggingface.co/davidcechak/DNADebertaK8/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/808901853890ddb7e1ac8c4ecf5112272702596ee51b69cdf389c63da11fe2e9.b628e4d5e7f344695b21fd7ca542aa0f8188268bc6c6b3380dc1e19a16c2447e
Model config DebertaConfig {
  "_name_or_path": "davidcechak/DNADebertaK8",
  "architectures": [
    "DebertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-07,
  "max_position_embedding

Epoch,Training Loss,Validation Loss,Accuracy,F1
0,No log,0.551704,0.713155,0.703467
1,0.580700,0.534085,0.736615,0.736918
2,0.580700,0.559062,0.733161,0.746514
3,0.460900,0.570095,0.739062,0.739099


***** Running Evaluation *****
  Num examples = 6948
  Batch size = 8
***** Running Evaluation *****
  Num examples = 6948
  Batch size = 8
***** Running Evaluation *****
  Num examples = 6948
  Batch size = 8
***** Running Evaluation *****
  Num examples = 6948
  Batch size = 8


Training completed. Do not forget to share your model on huggingface.co/models =)




  0%|          | 0/36131 [00:00<?, ?ex/s]

  0%|          | 0/37 [00:00<?, ?ba/s]

  0%|          | 0/37 [00:00<?, ?ba/s]

loading configuration file https://huggingface.co/davidcechak/DNADebertaK8/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/808901853890ddb7e1ac8c4ecf5112272702596ee51b69cdf389c63da11fe2e9.b628e4d5e7f344695b21fd7ca542aa0f8188268bc6c6b3380dc1e19a16c2447e
Model config DebertaConfig {
  "_name_or_path": "davidcechak/DNADebertaK8",
  "architectures": [
    "DebertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-07,
  "max_position_embeddings": 512,
  "max_relative_positions": -1,
  "model_type": "deberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 6,
  "pad_token_id": 0,
  "pooler_dropout": 0,
  "pooler_hidden_act": "gelu",
  "pooler_hidden_size": 768,
  "pos_att_type": null,
  "position_biased_input": true,
  "relative_attention": false,
  "torch_dtype": "float32",
  "transformers_version

Epoch,Training Loss,Validation Loss,Accuracy,F1
0,No log,0.334632,0.863294,0.864894
1,0.445400,0.169543,0.940669,0.944038
2,0.160400,0.145507,0.955391,0.958705
3,0.066800,0.152084,0.956055,0.958745


***** Running Evaluation *****
  Num examples = 9034
  Batch size = 8
***** Running Evaluation *****
  Num examples = 9034
  Batch size = 8
***** Running Evaluation *****
  Num examples = 9034
  Batch size = 8
***** Running Evaluation *****
  Num examples = 9034
  Batch size = 8


Training completed. Do not forget to share your model on huggingface.co/models =)


loading configuration file https://huggingface.co/davidcechak/DNADebertaK8/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/808901853890ddb7e1ac8c4ecf5112272702596ee51b69cdf389c63da11fe2e9.b628e4d5e7f344695b21fd7ca542aa0f8188268bc6c6b3380dc1e19a16c2447e
Model config DebertaConfig {
  "_name_or_path": "davidcechak/DNADebertaK8",
  "architectures": [
    "DebertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-07,
  "max_position_embedding

Epoch,Training Loss,Validation Loss,Accuracy,F1
0,No log,0.346993,0.858867,0.871303
1,0.445700,0.171803,0.938787,0.943508
2,0.172100,0.151507,0.954505,0.957651
3,0.067700,0.149536,0.95694,0.96


***** Running Evaluation *****
  Num examples = 9034
  Batch size = 8
***** Running Evaluation *****
  Num examples = 9034
  Batch size = 8
***** Running Evaluation *****
  Num examples = 9034
  Batch size = 8
***** Running Evaluation *****
  Num examples = 9034
  Batch size = 8


Training completed. Do not forget to share your model on huggingface.co/models =)


loading configuration file https://huggingface.co/davidcechak/DNADebertaK8/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/808901853890ddb7e1ac8c4ecf5112272702596ee51b69cdf389c63da11fe2e9.b628e4d5e7f344695b21fd7ca542aa0f8188268bc6c6b3380dc1e19a16c2447e
Model config DebertaConfig {
  "_name_or_path": "davidcechak/DNADebertaK8",
  "architectures": [
    "DebertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-07,
  "max_position_embedding

Epoch,Training Loss,Validation Loss,Accuracy,F1
0,No log,0.347086,0.858756,0.870641
1,0.441200,0.171617,0.940004,0.94402
2,0.166700,0.22126,0.935355,0.937366
3,0.063000,0.151286,0.95528,0.958204


***** Running Evaluation *****
  Num examples = 9034
  Batch size = 8
***** Running Evaluation *****
  Num examples = 9034
  Batch size = 8
***** Running Evaluation *****
  Num examples = 9034
  Batch size = 8
***** Running Evaluation *****
  Num examples = 9034
  Batch size = 8


Training completed. Do not forget to share your model on huggingface.co/models =)


loading configuration file https://huggingface.co/davidcechak/DNADebertaK8/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/808901853890ddb7e1ac8c4ecf5112272702596ee51b69cdf389c63da11fe2e9.b628e4d5e7f344695b21fd7ca542aa0f8188268bc6c6b3380dc1e19a16c2447e
Model config DebertaConfig {
  "_name_or_path": "davidcechak/DNADebertaK8",
  "architectures": [
    "DebertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-07,
  "max_position_embedding

Epoch,Training Loss,Validation Loss,Accuracy,F1
0,No log,0.326793,0.866394,0.869415
1,0.442400,0.190418,0.936019,0.939375
2,0.163600,0.156666,0.953952,0.957263
3,0.068200,0.159728,0.955391,0.958535


***** Running Evaluation *****
  Num examples = 9034
  Batch size = 8
***** Running Evaluation *****
  Num examples = 9034
  Batch size = 8
***** Running Evaluation *****
  Num examples = 9034
  Batch size = 8
***** Running Evaluation *****
  Num examples = 9034
  Batch size = 8


Training completed. Do not forget to share your model on huggingface.co/models =)


loading configuration file https://huggingface.co/davidcechak/DNADebertaK8/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/808901853890ddb7e1ac8c4ecf5112272702596ee51b69cdf389c63da11fe2e9.b628e4d5e7f344695b21fd7ca542aa0f8188268bc6c6b3380dc1e19a16c2447e
Model config DebertaConfig {
  "_name_or_path": "davidcechak/DNADebertaK8",
  "architectures": [
    "DebertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-07,
  "max_position_embedding

Epoch,Training Loss,Validation Loss,Accuracy,F1
0,No log,0.340525,0.860084,0.859337
1,0.442400,0.181943,0.937791,0.941263
2,0.173900,0.14962,0.953398,0.95662
3,0.071100,0.151546,0.955612,0.958604


***** Running Evaluation *****
  Num examples = 9034
  Batch size = 8
***** Running Evaluation *****
  Num examples = 9034
  Batch size = 8
***** Running Evaluation *****
  Num examples = 9034
  Batch size = 8
***** Running Evaluation *****
  Num examples = 9034
  Batch size = 8


Training completed. Do not forget to share your model on huggingface.co/models =)




## Outputs

In [7]:
# Rebuild the per-run results table from the collected tuples and display it.
outputs_df = pd.DataFrame(
    data=outputs,
    columns=['dataset', 'accuracy', 'f1', 'train_runtime'],
)
outputs_df

Unnamed: 0,dataset,accuracy,f1,train_runtime
0,demo_coding_vs_intergenomic_seqs,0.91268,0.91269,3085.3888
1,demo_coding_vs_intergenomic_seqs,0.91288,0.914317,3118.3607
2,demo_coding_vs_intergenomic_seqs,0.91208,0.912762,3127.8818
3,demo_coding_vs_intergenomic_seqs,0.9124,0.91324,3124.0516
4,demo_coding_vs_intergenomic_seqs,0.91196,0.9112,3146.3179
5,demo_human_or_worm,0.96008,0.959713,3151.0799
6,demo_human_or_worm,0.96028,0.960259,3125.1223
7,demo_human_or_worm,0.96036,0.960535,3112.1037
8,demo_human_or_worm,0.9608,0.9608,3131.7306
9,demo_human_or_worm,0.96052,0.9606,3170.2935


In [8]:
# Per-dataset mean and standard error of the mean over the repeated runs.
outputs_df.groupby('dataset').agg(
    {metric: ['mean', 'sem'] for metric in ('accuracy', 'f1', 'train_runtime')}
)

Unnamed: 0_level_0,accuracy,accuracy,f1,f1,train_runtime,train_runtime
Unnamed: 0_level_1,mean,sem,mean,sem,mean,sem
dataset,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
demo_coding_vs_intergenomic_seqs,0.9124,0.000174,0.912842,0.000503,3120.40016,9.927215
demo_human_or_worm,0.960408,0.000121,0.960382,0.000188,3138.066,10.221769
human_enhancers_cohn,0.73673,0.000972,0.743535,0.002318,1112.40848,0.476513
human_nontata_promoters,0.955856,0.000302,0.958818,0.000309,1162.10908,2.787958


In [9]:
# Write the final per-run results table to OUTPUT_PATH (row index omitted).
outputs_df.to_csv(path_or_buf=OUTPUT_PATH, index=False)