In [1]:
#%pip install -qq transformers datasets

In [1]:
import os

# Switch Comet off for this session via its environment flag
# (presumably consulted by the Trainer's Comet integration -- confirm).
os.environ["COMET_MODE"] = "DISABLED"

# Echo the normalised value back as the cell output to confirm it took effect.
os.environ.get("COMET_MODE", "").upper()

'DISABLED'

In [2]:
# Hub repo ids of the masked-LM checkpoints to compare.
# Order matters: it determines the row index of each model in the results table.
MODELS = [
    # simecek/* checkpoints
    'simecek/humandna_DEBERTASMALL_1epoch',
    'simecek/humandna_MOBILEBERT_1epoch',
    'simecek/humandna_ELECTRA_1epoch',
    'simecek/humandna_PERCEIVER_1epoch',
    'simecek/humandna_ALBERT_1epoch',
    'simecek/humandna_BERT_1epoch',
    'simecek/humandna_DEBERTA_1epoch',
    'simecek/humandna_DISTILBERT_1epoch',
    # Vlasta/* checkpoints
    'Vlasta/humandna_bert_default_beautiful_bench_4197',
    'Vlasta/humandna_distillbert_default_dual_liability_4383',
    'Vlasta/humandna_deberta_default_empty_stud_8442',
    'Vlasta/humandna_Electra_random',
    'Vlasta/humandna_DISTILBERT_random',
    # NOTE(review): in the recorded run this checkpoint reports the same
    # parameter count and loss as Vlasta/humandna_DISTILBERT_random --
    # confirm the Hub repo holds the intended architecture.
    'Vlasta/humandna_DEBERTAsmall_random',
]

In [3]:
from transformers import DataCollatorForLanguageModeling
from transformers import AutoTokenizer

# Tokenizer fetched from the `armheb/DNA_bert_6` checkpoint; the collator
# below uses it when building masked batches.
tokenizer = AutoTokenizer.from_pretrained("armheb/DNA_bert_6")

# Masked-language-modelling collator configured with a masking
# probability of 0.2.
data_collator = DataCollatorForLanguageModeling(
    mlm_probability=0.2,
    mlm=True,
    tokenizer=tokenizer,
)

In [4]:
from transformers import TrainingArguments, Trainer

# Placeholder arguments: the Trainer built from these is only used to call
# `evaluate()`, so the training-related values are never exercised here.
fake_training_args = TrainingArguments(
    output_dir='./nocommet',
    overwrite_output_dir=True,
    evaluation_strategy="steps",
    max_steps=10,
    per_device_eval_batch_size=32,  # batch size used by the evaluation loop
)

In [5]:
from transformers import AutoModelForMaskedLM

# Instantiate every checkpoint listed in MODELS up front, keyed by its repo id,
# so the evaluation cell can look each one up by name.
models = dict()

for model_name in MODELS:
    # Print the repo id first so a failing download is easy to attribute.
    print(model_name)
    models[model_name] = AutoModelForMaskedLM.from_pretrained(
        model_name
    )

simecek/humandna_DEBERTASMALL_1epoch
simecek/humandna_MOBILEBERT_1epoch
simecek/humandna_ELECTRA_1epoch
simecek/humandna_PERCEIVER_1epoch
simecek/humandna_ALBERT_1epoch
simecek/humandna_BERT_1epoch
simecek/humandna_DEBERTA_1epoch
simecek/humandna_DISTILBERT_1epoch
Vlasta/humandna_bert_default_beautiful_bench_4197
Vlasta/humandna_distillbert_default_dual_liability_4383
Vlasta/humandna_deberta_default_empty_stud_8442
Vlasta/humandna_Electra_random
Vlasta/humandna_DISTILBERT_random
Vlasta/humandna_DEBERTAsmall_random


Downloading:   0%|          | 0.00/178M [00:00<?, ?B/s]

In [6]:
from datasets import load_dataset

# Fetch the dataset from the Hub (or reuse the local cache); the evaluation
# cell reads its 'test' split.
dataset = load_dataset(path="simecek/Human_DNA_v0_DNABert6tokenized")

Using custom data configuration simecek--Human_DNA_v0_DNABert6tokenized-9a684042f2db6cd1
Reusing dataset parquet (/home/jovyan/.cache/huggingface/datasets/simecek___parquet/simecek--Human_DNA_v0_DNABert6tokenized-9a684042f2db6cd1/0.0.0/0b6d5799bb726b24ad7fc7be720c170d8e497f575d02d47537de9a5bac074901)


  0%|          | 0/2 [00:00<?, ?it/s]

In [7]:
from tqdm.autonotebook import tqdm

# Evaluate every loaded checkpoint on the same slice of the test split and
# collect one record per model:
#   (model name, parameter count in millions, eval runtime, eval loss)
results = []

# The evaluation slice is the same for every model, so build it once instead
# of re-selecting it on each iteration. Capping at the split length keeps
# `select` from failing when the test split has fewer than 10000 rows.
n_eval_examples = min(10000, len(dataset['test']))
eval_subset = dataset['test'].select(range(n_eval_examples))

for model_name in tqdm(MODELS):
    model = models[model_name]
    # Total number of parameters, expressed in millions.
    nparams = sum(p.numel() for p in model.parameters()) / 10**6

    # Only `evaluate()` is called below, so no train_dataset is passed.
    trainer = Trainer(
        model=model,
        args=fake_training_args,
        data_collator=data_collator,
        eval_dataset=eval_subset,
    )

    # Named `metrics` rather than `eval` so the builtin `eval` is not
    # shadowed for the rest of the kernel session.
    metrics = trainer.evaluate()
    print(f"{model_name}: {metrics['eval_loss']}")

    results.append((model_name, nparams, metrics['eval_runtime'], metrics['eval_loss']))
  


  0%|          | 0/14 [00:00<?, ?it/s]

max_steps is given, it will override any value given in num_train_epochs
***** Running Evaluation *****
  Num examples = 10000
  Batch size = 32


max_steps is given, it will override any value given in num_train_epochs
***** Running Evaluation *****
  Num examples = 10000
  Batch size = 32


simecek/humandna_DEBERTASMALL_1epoch: 7.263691425323486


max_steps is given, it will override any value given in num_train_epochs
***** Running Evaluation *****
  Num examples = 10000
  Batch size = 32


simecek/humandna_MOBILEBERT_1epoch: 6.975131988525391


simecek/humandna_ELECTRA_1epoch: 7.315179824829102


max_steps is given, it will override any value given in num_train_epochs
The following columns in the evaluation set don't have a corresponding argument in `PerceiverForMaskedLM.forward` and have been ignored: token_type_ids. If token_type_ids are not expected by `PerceiverForMaskedLM.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 10000
  Batch size = 32


simecek/humandna_PERCEIVER_1epoch: 7.815895080566406


max_steps is given, it will override any value given in num_train_epochs
***** Running Evaluation *****
  Num examples = 10000
  Batch size = 32


simecek/humandna_ALBERT_1epoch: 8.014447212219238


max_steps is given, it will override any value given in num_train_epochs
***** Running Evaluation *****
  Num examples = 10000
  Batch size = 32


max_steps is given, it will override any value given in num_train_epochs
***** Running Evaluation *****
  Num examples = 10000
  Batch size = 32


simecek/humandna_BERT_1epoch: 7.266287326812744


max_steps is given, it will override any value given in num_train_epochs
The following columns in the evaluation set don't have a corresponding argument in `DistilBertForMaskedLM.forward` and have been ignored: token_type_ids. If token_type_ids are not expected by `DistilBertForMaskedLM.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 10000
  Batch size = 32


simecek/humandna_DEBERTA_1epoch: 7.263495922088623


max_steps is given, it will override any value given in num_train_epochs
***** Running Evaluation *****
  Num examples = 10000
  Batch size = 32


simecek/humandna_DISTILBERT_1epoch: 7.263185977935791


max_steps is given, it will override any value given in num_train_epochs
The following columns in the evaluation set don't have a corresponding argument in `DistilBertForMaskedLM.forward` and have been ignored: token_type_ids. If token_type_ids are not expected by `DistilBertForMaskedLM.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 10000
  Batch size = 32


Vlasta/humandna_bert_default_beautiful_bench_4197: 7.263117790222168


max_steps is given, it will override any value given in num_train_epochs
***** Running Evaluation *****
  Num examples = 10000
  Batch size = 32


Vlasta/humandna_distillbert_default_dual_liability_4383: 7.261172771453857


max_steps is given, it will override any value given in num_train_epochs
***** Running Evaluation *****
  Num examples = 10000
  Batch size = 32


Vlasta/humandna_deberta_default_empty_stud_8442: 7.262007713317871


max_steps is given, it will override any value given in num_train_epochs
The following columns in the evaluation set don't have a corresponding argument in `DistilBertForMaskedLM.forward` and have been ignored: token_type_ids. If token_type_ids are not expected by `DistilBertForMaskedLM.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 10000
  Batch size = 32


Vlasta/humandna_Electra_random: 7.3161139488220215


max_steps is given, it will override any value given in num_train_epochs
The following columns in the evaluation set don't have a corresponding argument in `DistilBertForMaskedLM.forward` and have been ignored: token_type_ids. If token_type_ids are not expected by `DistilBertForMaskedLM.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 10000
  Batch size = 32


Vlasta/humandna_DISTILBERT_random: 7.242573261260986


Vlasta/humandna_DEBERTAsmall_random: 7.242573261260986


In [9]:
import pandas as pd

# One row per evaluated model, built from the tuples collected in `results`.
results_long = pd.DataFrame.from_records(
    data=results,
    columns=["model", "nparams", "time", "loss"],
)

# Cell output: models ordered by ascending loss, rounded for readability
# (the stored frame itself is left unsorted and unrounded).
results_long.sort_values(by="loss").round(decimals=2)

Unnamed: 0,model,nparams,time,loss
1,simecek/humandna_MOBILEBERT_1epoch,23.04,46.74,6.98
12,Vlasta/humandna_DISTILBERT_random,46.67,29.75,7.24
13,Vlasta/humandna_DEBERTAsmall_random,46.67,29.73,7.24
9,Vlasta/humandna_distillbert_default_dual_liabi...,46.67,29.75,7.26
10,Vlasta/humandna_deberta_default_empty_stud_8442,89.19,70.46,7.26
8,Vlasta/humandna_bert_default_beautiful_bench_4197,89.2,56.75,7.26
7,simecek/humandna_DISTILBERT_1epoch,46.67,29.74,7.26
6,simecek/humandna_DEBERTA_1epoch,89.19,70.45,7.26
0,simecek/humandna_DEBERTASMALL_1epoch,46.66,37.04,7.26
5,simecek/humandna_BERT_1epoch,89.2,56.74,7.27


In [10]:
# Persist the full-precision results, ordered by ascending loss; the frame's
# index is written as the first CSV column.
results_sorted_by_loss = results_long.sort_values(by="loss")
results_sorted_by_loss.to_csv(path_or_buf="loss_per_architecture.csv")