In [1]:
#!pip install -qq transformers datasets

In [8]:
import os

# Switch Comet logging off via its environment variable. Presumably this is
# read by the Hugging Face Comet integration, so it should run before
# `transformers` is imported -- TODO confirm.
os.environ.update(COMET_MODE="DISABLED")
# Echo the normalised value as the cell output to confirm the setting took.
os.environ.get("COMET_MODE", "").upper()

'DISABLED'

In [36]:
# Checkpoint identifiers handed to `AutoModelForMaskedLM.from_pretrained`.
# The order here is also the column order of the final results table.
MODELS = [
    "simecek/HumanRedoneDNADeberta",
    "simecek/MouseDNADeberta",
    "simecek/ZebrafishDNADeberta",
    "simecek/FruitflyDNADeberta",
    "simecek/WormDNADeberta",
    "simecek/ArabidopsisDNADeberta",
    "simecek/DNADebertaSmall",
]

# Dataset identifiers handed to `load_dataset`.
# The order here is also the row order of the final results table.
DATASETS = [
    "simecek/Human_DNA_v0_DNABert6tokenized",
    "davidcechak/Mouse_DNA_v0_DNABert6tokenized",
    "davidcechak/Zebrafish_DNA_v0_DNABert6tokenized",
    "davidcechak/Fruitfly_DNA_v0_DNABert6tokenized",
    "davidcechak/Worm_DNA_v0_DNABert6tokenized",
    "davidcechak/Arabidopsis_thaliana_DNA_v0_DNABert6tokenized",
]

In [2]:
from transformers import DataCollatorForLanguageModeling
from transformers import AutoTokenizer

# A single tokenizer checkpoint is used for the collator regardless of which
# model is being scored.
TOKENIZER_CHECKPOINT = "armheb/DNA_bert_6"
tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_CHECKPOINT)

# Masked-language-modelling collator; `mlm_probability` is the masking rate
# passed to the collator (0.2 here).
collator_options = dict(tokenizer=tokenizer, mlm=True, mlm_probability=0.2)
data_collator = DataCollatorForLanguageModeling(**collator_options)

In [3]:
from transformers import TrainingArguments, Trainer

# Settings for the Trainer instances built later in the notebook. The visible
# code only ever calls `evaluate()` on them, so the training-related values
# (`max_steps`, `output_dir`) look like placeholders -- hence "fake".
eval_only_settings = {
    "output_dir": './nocommet',
    "evaluation_strategy": "steps",
    "overwrite_output_dir": True,
    "per_device_eval_batch_size": 128,
    "max_steps": 10,
}
fake_training_args = TrainingArguments(**eval_only_settings)

In [4]:
from transformers import AutoModelForMaskedLM

# Load every checkpoint once up front and index it by its identifier, so the
# evaluation loop can reuse the same model object across datasets.
models = {
    checkpoint: AutoModelForMaskedLM.from_pretrained(checkpoint)
    for checkpoint in MODELS
}

In [32]:
from datasets import load_dataset
from tqdm.autonotebook import tqdm


# Upper bound on the number of test examples scored per dataset; smaller test
# splits are used in full.
MAX_EVAL_EXAMPLES = 10000

# One (dataset, model, eval_loss) tuple per evaluated pair.
results = []

for ds_name in tqdm(DATASETS):

    dataset = load_dataset(ds_name)

    # The evaluation subset depends only on the dataset, so build it once here
    # instead of rebuilding it for every model.
    test_split = dataset['test']
    eval_subset = test_split.select(range(min(MAX_EVAL_EXAMPLES, len(test_split))))

    for model_name in MODELS:

        model = models[model_name]

        # Only `evaluate()` is called on this Trainer, so no training set is
        # supplied.
        trainer = Trainer(
            model=model,
            args=fake_training_args,
            data_collator=data_collator,
            eval_dataset=eval_subset,
        )

        # Named `metrics` rather than `eval` so the builtin `eval` is not
        # shadowed for the rest of the kernel session.
        metrics = trainer.evaluate()
        eval_loss = metrics['eval_loss']
        print(f"{model_name}:{ds_name} {eval_loss}")

        results.append((ds_name, model_name, eval_loss))
  


  0%|          | 0/6 [00:00<?, ?it/s]

Using custom data configuration simecek--Human_DNA_v0_DNABert6tokenized-9a684042f2db6cd1
Reusing dataset parquet (/home/jovyan/.cache/huggingface/datasets/simecek___parquet/simecek--Human_DNA_v0_DNABert6tokenized-9a684042f2db6cd1/0.0.0/0b6d5799bb726b24ad7fc7be720c170d8e497f575d02d47537de9a5bac074901)


  0%|          | 0/2 [00:00<?, ?it/s]

max_steps is given, it will override any value given in num_train_epochs
***** Running Evaluation *****
  Num examples = 10000
  Batch size = 128


max_steps is given, it will override any value given in num_train_epochs
***** Running Evaluation *****
  Num examples = 10000
  Batch size = 128


simecek/HumanRedoneDNADeberta:simecek/Human_DNA_v0_DNABert6tokenized 7.245747089385986


max_steps is given, it will override any value given in num_train_epochs
***** Running Evaluation *****
  Num examples = 10000
  Batch size = 128


simecek/ZebrafishDNADeberta:simecek/Human_DNA_v0_DNABert6tokenized 7.334394454956055


max_steps is given, it will override any value given in num_train_epochs
***** Running Evaluation *****
  Num examples = 10000
  Batch size = 128


simecek/ArabidopsisDNADeberta:simecek/Human_DNA_v0_DNABert6tokenized 7.345582008361816


max_steps is given, it will override any value given in num_train_epochs
***** Running Evaluation *****
  Num examples = 10000
  Batch size = 128


simecek/FruitflyDNADeberta:simecek/Human_DNA_v0_DNABert6tokenized 7.384777545928955


max_steps is given, it will override any value given in num_train_epochs
***** Running Evaluation *****
  Num examples = 10000
  Batch size = 128


simecek/MouseDNADeberta:simecek/Human_DNA_v0_DNABert6tokenized 7.302046298980713


max_steps is given, it will override any value given in num_train_epochs
***** Running Evaluation *****
  Num examples = 10000
  Batch size = 128


simecek/WormDNADeberta:simecek/Human_DNA_v0_DNABert6tokenized 7.438027381896973


simecek/DNADebertaSmall:simecek/Human_DNA_v0_DNABert6tokenized 6.135972499847412


Using custom data configuration davidcechak--Zebrafish_DNA_v0_DNABert6tokenized-2bb7e7b03931a47a
Reusing dataset parquet (/home/jovyan/.cache/huggingface/datasets/davidcechak___parquet/davidcechak--Zebrafish_DNA_v0_DNABert6tokenized-2bb7e7b03931a47a/0.0.0/0b6d5799bb726b24ad7fc7be720c170d8e497f575d02d47537de9a5bac074901)


  0%|          | 0/2 [00:00<?, ?it/s]

max_steps is given, it will override any value given in num_train_epochs
***** Running Evaluation *****
  Num examples = 10000
  Batch size = 128


max_steps is given, it will override any value given in num_train_epochs
***** Running Evaluation *****
  Num examples = 10000
  Batch size = 128


simecek/HumanRedoneDNADeberta:davidcechak/Zebrafish_DNA_v0_DNABert6tokenized 7.325279712677002


max_steps is given, it will override any value given in num_train_epochs
***** Running Evaluation *****
  Num examples = 10000
  Batch size = 128


simecek/ZebrafishDNADeberta:davidcechak/Zebrafish_DNA_v0_DNABert6tokenized 7.250858783721924


max_steps is given, it will override any value given in num_train_epochs
***** Running Evaluation *****
  Num examples = 10000
  Batch size = 128


simecek/ArabidopsisDNADeberta:davidcechak/Zebrafish_DNA_v0_DNABert6tokenized 7.284417152404785


max_steps is given, it will override any value given in num_train_epochs
***** Running Evaluation *****
  Num examples = 10000
  Batch size = 128


simecek/FruitflyDNADeberta:davidcechak/Zebrafish_DNA_v0_DNABert6tokenized 7.268256187438965


max_steps is given, it will override any value given in num_train_epochs
***** Running Evaluation *****
  Num examples = 10000
  Batch size = 128


simecek/MouseDNADeberta:davidcechak/Zebrafish_DNA_v0_DNABert6tokenized 7.3122076988220215


max_steps is given, it will override any value given in num_train_epochs
***** Running Evaluation *****
  Num examples = 10000
  Batch size = 128


simecek/WormDNADeberta:davidcechak/Zebrafish_DNA_v0_DNABert6tokenized 7.363481044769287


simecek/DNADebertaSmall:davidcechak/Zebrafish_DNA_v0_DNABert6tokenized 7.106800556182861


Using custom data configuration davidcechak--Arabidopsis_thaliana_DNA_v0_DNABert6tokenized-5ee4cfd33a3fa24b
Reusing dataset parquet (/home/jovyan/.cache/huggingface/datasets/davidcechak___parquet/davidcechak--Arabidopsis_thaliana_DNA_v0_DNABert6tokenized-5ee4cfd33a3fa24b/0.0.0/0b6d5799bb726b24ad7fc7be720c170d8e497f575d02d47537de9a5bac074901)


  0%|          | 0/2 [00:00<?, ?it/s]

max_steps is given, it will override any value given in num_train_epochs
***** Running Evaluation *****
  Num examples = 3879
  Batch size = 128


max_steps is given, it will override any value given in num_train_epochs
***** Running Evaluation *****
  Num examples = 3879
  Batch size = 128


simecek/HumanRedoneDNADeberta:davidcechak/Arabidopsis_thaliana_DNA_v0_DNABert6tokenized 7.402441501617432


max_steps is given, it will override any value given in num_train_epochs
***** Running Evaluation *****
  Num examples = 3879
  Batch size = 128


simecek/ZebrafishDNADeberta:davidcechak/Arabidopsis_thaliana_DNA_v0_DNABert6tokenized 7.376944541931152


max_steps is given, it will override any value given in num_train_epochs
***** Running Evaluation *****
  Num examples = 3879
  Batch size = 128


simecek/ArabidopsisDNADeberta:davidcechak/Arabidopsis_thaliana_DNA_v0_DNABert6tokenized 7.2427191734313965


max_steps is given, it will override any value given in num_train_epochs
***** Running Evaluation *****
  Num examples = 3879
  Batch size = 128


simecek/FruitflyDNADeberta:davidcechak/Arabidopsis_thaliana_DNA_v0_DNABert6tokenized 7.319701194763184


max_steps is given, it will override any value given in num_train_epochs
***** Running Evaluation *****
  Num examples = 3879
  Batch size = 128


simecek/MouseDNADeberta:davidcechak/Arabidopsis_thaliana_DNA_v0_DNABert6tokenized 7.394493579864502


max_steps is given, it will override any value given in num_train_epochs
***** Running Evaluation *****
  Num examples = 3879
  Batch size = 128


simecek/WormDNADeberta:davidcechak/Arabidopsis_thaliana_DNA_v0_DNABert6tokenized 7.360411167144775


simecek/DNADebertaSmall:davidcechak/Arabidopsis_thaliana_DNA_v0_DNABert6tokenized 7.3131208419799805


Using custom data configuration davidcechak--Mouse_DNA_v0_DNABert6tokenized-3b2b4a4e5d904503
Reusing dataset parquet (/home/jovyan/.cache/huggingface/datasets/davidcechak___parquet/davidcechak--Mouse_DNA_v0_DNABert6tokenized-3b2b4a4e5d904503/0.0.0/0b6d5799bb726b24ad7fc7be720c170d8e497f575d02d47537de9a5bac074901)


  0%|          | 0/2 [00:00<?, ?it/s]

max_steps is given, it will override any value given in num_train_epochs
***** Running Evaluation *****
  Num examples = 10000
  Batch size = 128


max_steps is given, it will override any value given in num_train_epochs
***** Running Evaluation *****
  Num examples = 10000
  Batch size = 128


simecek/HumanRedoneDNADeberta:davidcechak/Mouse_DNA_v0_DNABert6tokenized 7.309451580047607


max_steps is given, it will override any value given in num_train_epochs
***** Running Evaluation *****
  Num examples = 10000
  Batch size = 128


simecek/ZebrafishDNADeberta:davidcechak/Mouse_DNA_v0_DNABert6tokenized 7.339879512786865


max_steps is given, it will override any value given in num_train_epochs
***** Running Evaluation *****
  Num examples = 10000
  Batch size = 128


simecek/ArabidopsisDNADeberta:davidcechak/Mouse_DNA_v0_DNABert6tokenized 7.350376605987549


max_steps is given, it will override any value given in num_train_epochs
***** Running Evaluation *****
  Num examples = 10000
  Batch size = 128


simecek/FruitflyDNADeberta:davidcechak/Mouse_DNA_v0_DNABert6tokenized 7.398280620574951


max_steps is given, it will override any value given in num_train_epochs
***** Running Evaluation *****
  Num examples = 10000
  Batch size = 128


simecek/MouseDNADeberta:davidcechak/Mouse_DNA_v0_DNABert6tokenized 7.276564121246338


max_steps is given, it will override any value given in num_train_epochs
***** Running Evaluation *****
  Num examples = 10000
  Batch size = 128


simecek/WormDNADeberta:davidcechak/Mouse_DNA_v0_DNABert6tokenized 7.462935924530029


simecek/DNADebertaSmall:davidcechak/Mouse_DNA_v0_DNABert6tokenized 6.9782795906066895


Using custom data configuration davidcechak--Worm_DNA_v0_DNABert6tokenized-e558388919872e43
Reusing dataset parquet (/home/jovyan/.cache/huggingface/datasets/davidcechak___parquet/davidcechak--Worm_DNA_v0_DNABert6tokenized-e558388919872e43/0.0.0/0b6d5799bb726b24ad7fc7be720c170d8e497f575d02d47537de9a5bac074901)


  0%|          | 0/2 [00:00<?, ?it/s]

max_steps is given, it will override any value given in num_train_epochs
***** Running Evaluation *****
  Num examples = 3266
  Batch size = 128


max_steps is given, it will override any value given in num_train_epochs
***** Running Evaluation *****
  Num examples = 3266
  Batch size = 128


simecek/HumanRedoneDNADeberta:davidcechak/Worm_DNA_v0_DNABert6tokenized 7.359205722808838


max_steps is given, it will override any value given in num_train_epochs
***** Running Evaluation *****
  Num examples = 3266
  Batch size = 128


simecek/ZebrafishDNADeberta:davidcechak/Worm_DNA_v0_DNABert6tokenized 7.320830345153809


max_steps is given, it will override any value given in num_train_epochs
***** Running Evaluation *****
  Num examples = 3266
  Batch size = 128


simecek/ArabidopsisDNADeberta:davidcechak/Worm_DNA_v0_DNABert6tokenized 7.270549774169922


max_steps is given, it will override any value given in num_train_epochs
***** Running Evaluation *****
  Num examples = 3266
  Batch size = 128


simecek/FruitflyDNADeberta:davidcechak/Worm_DNA_v0_DNABert6tokenized 7.240468978881836


max_steps is given, it will override any value given in num_train_epochs
***** Running Evaluation *****
  Num examples = 3266
  Batch size = 128


simecek/MouseDNADeberta:davidcechak/Worm_DNA_v0_DNABert6tokenized 7.353073596954346


max_steps is given, it will override any value given in num_train_epochs
***** Running Evaluation *****
  Num examples = 3266
  Batch size = 128


simecek/WormDNADeberta:davidcechak/Worm_DNA_v0_DNABert6tokenized 7.2142767906188965


simecek/DNADebertaSmall:davidcechak/Worm_DNA_v0_DNABert6tokenized 7.292906761169434


Using custom data configuration davidcechak--Fruitfly_DNA_v0_DNABert6tokenized-edd94a8581cd97bd
Reusing dataset parquet (/home/jovyan/.cache/huggingface/datasets/davidcechak___parquet/davidcechak--Fruitfly_DNA_v0_DNABert6tokenized-edd94a8581cd97bd/0.0.0/0b6d5799bb726b24ad7fc7be720c170d8e497f575d02d47537de9a5bac074901)


  0%|          | 0/2 [00:00<?, ?it/s]

max_steps is given, it will override any value given in num_train_epochs
***** Running Evaluation *****
  Num examples = 4423
  Batch size = 128


max_steps is given, it will override any value given in num_train_epochs
***** Running Evaluation *****
  Num examples = 4423
  Batch size = 128


simecek/HumanRedoneDNADeberta:davidcechak/Fruitfly_DNA_v0_DNABert6tokenized 7.542044162750244


max_steps is given, it will override any value given in num_train_epochs
***** Running Evaluation *****
  Num examples = 4423
  Batch size = 128


simecek/ZebrafishDNADeberta:davidcechak/Fruitfly_DNA_v0_DNABert6tokenized 7.4960832595825195


max_steps is given, it will override any value given in num_train_epochs
***** Running Evaluation *****
  Num examples = 4423
  Batch size = 128


simecek/ArabidopsisDNADeberta:davidcechak/Fruitfly_DNA_v0_DNABert6tokenized 7.467253684997559


max_steps is given, it will override any value given in num_train_epochs
***** Running Evaluation *****
  Num examples = 4423
  Batch size = 128


simecek/FruitflyDNADeberta:davidcechak/Fruitfly_DNA_v0_DNABert6tokenized 7.336008071899414


max_steps is given, it will override any value given in num_train_epochs
***** Running Evaluation *****
  Num examples = 4423
  Batch size = 128


simecek/MouseDNADeberta:davidcechak/Fruitfly_DNA_v0_DNABert6tokenized 7.533913612365723


max_steps is given, it will override any value given in num_train_epochs
***** Running Evaluation *****
  Num examples = 4423
  Batch size = 128


simecek/WormDNADeberta:davidcechak/Fruitfly_DNA_v0_DNABert6tokenized 7.504519939422607


simecek/DNADebertaSmall:davidcechak/Fruitfly_DNA_v0_DNABert6tokenized 7.413302898406982


In [33]:
import pandas as pd

# Long format: one row per (dataset, model) pair with its evaluation loss.
RESULT_COLUMNS = ["dataset", "model", "loss"]
results_long = pd.DataFrame.from_records(
    results,
    columns=RESULT_COLUMNS,
)
results_long

Unnamed: 0,dataset,model,loss
0,simecek/Human_DNA_v0_DNABert6tokenized,simecek/HumanRedoneDNADeberta,7.245747
1,simecek/Human_DNA_v0_DNABert6tokenized,simecek/ZebrafishDNADeberta,7.334394
2,simecek/Human_DNA_v0_DNABert6tokenized,simecek/ArabidopsisDNADeberta,7.345582
3,simecek/Human_DNA_v0_DNABert6tokenized,simecek/FruitflyDNADeberta,7.384778
4,simecek/Human_DNA_v0_DNABert6tokenized,simecek/MouseDNADeberta,7.302046
5,simecek/Human_DNA_v0_DNABert6tokenized,simecek/WormDNADeberta,7.438027
6,simecek/Human_DNA_v0_DNABert6tokenized,simecek/DNADebertaSmall,6.135972
7,davidcechak/Zebrafish_DNA_v0_DNABert6tokenized,simecek/HumanRedoneDNADeberta,7.32528
8,davidcechak/Zebrafish_DNA_v0_DNABert6tokenized,simecek/ZebrafishDNADeberta,7.250859
9,davidcechak/Zebrafish_DNA_v0_DNABert6tokenized,simecek/ArabidopsisDNADeberta,7.284417


In [34]:
# Wide format: datasets become rows, models become columns, and each cell
# holds the loss for that pair.
results_pivot = results_long.pivot(
    index='dataset',
    columns='model',
    values='loss',
)
results_pivot

model,simecek/ArabidopsisDNADeberta,simecek/DNADebertaSmall,simecek/FruitflyDNADeberta,simecek/HumanRedoneDNADeberta,simecek/MouseDNADeberta,simecek/WormDNADeberta,simecek/ZebrafishDNADeberta
dataset,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
davidcechak/Arabidopsis_thaliana_DNA_v0_DNABert6tokenized,7.242719,7.313121,7.319701,7.402442,7.394494,7.360411,7.376945
davidcechak/Fruitfly_DNA_v0_DNABert6tokenized,7.467254,7.413303,7.336008,7.542044,7.533914,7.50452,7.496083
davidcechak/Mouse_DNA_v0_DNABert6tokenized,7.350377,6.97828,7.398281,7.309452,7.276564,7.462936,7.33988
davidcechak/Worm_DNA_v0_DNABert6tokenized,7.27055,7.292907,7.240469,7.359206,7.353074,7.214277,7.32083
davidcechak/Zebrafish_DNA_v0_DNABert6tokenized,7.284417,7.106801,7.268256,7.32528,7.312208,7.363481,7.250859
simecek/Human_DNA_v0_DNABert6tokenized,7.345582,6.135972,7.384778,7.245747,7.302046,7.438027,7.334394


In [37]:
# Put rows in the order of DATASETS, then columns in the order of MODELS,
# instead of the ordering produced by the pivot.
rows_in_config_order = results_pivot.reindex(index=DATASETS)
results_output = rows_in_config_order.loc[:, MODELS]
# Round only for display; `results_output` keeps full precision.
results_output.round(2)

model,simecek/HumanRedoneDNADeberta,simecek/MouseDNADeberta,simecek/ZebrafishDNADeberta,simecek/FruitflyDNADeberta,simecek/WormDNADeberta,simecek/ArabidopsisDNADeberta,simecek/DNADebertaSmall
dataset,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
simecek/Human_DNA_v0_DNABert6tokenized,7.25,7.3,7.33,7.38,7.44,7.35,6.14
davidcechak/Mouse_DNA_v0_DNABert6tokenized,7.31,7.28,7.34,7.4,7.46,7.35,6.98
davidcechak/Zebrafish_DNA_v0_DNABert6tokenized,7.33,7.31,7.25,7.27,7.36,7.28,7.11
davidcechak/Fruitfly_DNA_v0_DNABert6tokenized,7.54,7.53,7.5,7.34,7.5,7.47,7.41
davidcechak/Worm_DNA_v0_DNABert6tokenized,7.36,7.35,7.32,7.24,7.21,7.27,7.29
davidcechak/Arabidopsis_thaliana_DNA_v0_DNABert6tokenized,7.4,7.39,7.38,7.32,7.36,7.24,7.31


In [27]:
# Persist the unrounded dataset-by-model loss table next to the notebook.
results_output.to_csv(path_or_buf="loss_per_organism.csv")