In [1]:
# NOTE(review): comet_ml is imported before torch here — presumably so Comet can
# hook the ML framework for auto-logging; confirm against the Comet docs before reordering.
import comet_ml
from torch import cuda
# Sanity check that a CUDA device is visible (the training arguments in this notebook enable fp16).
cuda.is_available()

True

In [2]:
import os
from getpass import getpass

# Never hard-code the Comet API key in the notebook: it ends up in version control
# and in shared copies. Read it from the environment (e.g. `export COMET_API_KEY=...`
# before launching Jupyter) and fall back to an interactive prompt.
# NOTE(review): the key that used to be written here must be treated as leaked and rotated.
if not os.environ.get('COMET_API_KEY'):
    os.environ['COMET_API_KEY'] = getpass('Comet API key: ')

# The transformers CometCallback selects its project from the COMET_PROJECT_NAME
# environment variable (see the transformers callback docs), so export the same
# project that is passed to comet_ml.init below. setdefault keeps a user override.
os.environ.setdefault('COMET_PROJECT_NAME', "dna-finetuning-1512")

# Comet init
comet_ml.init(project_name="dna-finetuning-1512", api_key=os.environ['COMET_API_KEY'])

COMET INFO: Comet API key is valid
COMET INFO: Comet API key saved in /home/jovyan/.comet.config


In [3]:
# export COMET_API_KEY=<your-comet-api-key>   (run in the shell before starting Jupyter; never commit the real key)

In [4]:
### Parameters
# Every tunable value of the experiment lives in this cell.

# When True, the classifier is re-created from the model config (random weights)
# instead of using the pretrained checkpoint, and learning-rate warm-up is disabled.
RANDOMIZE_WEIGHTS = False 
# OUTPUT_PATH = './DNADebertaK6c_metrics.csv'
# Not read by the code visible in this part of the notebook — presumably the CSV
# the collected metrics are written to later; TODO confirm.
OUTPUT_PATH = './DNADebertaK6_Fruitfly_metrics_100epochs_earlystop.csv'

# Hugging Face hub ids passed to the from_pretrained calls for the model / tokenizer.
MODEL_NAME = "simecek/DNADebertaK6_Fruitfly"
TOKENIZER_NAME = "armheb/DNA_bert_6"
# k-mer length used to split sequences before tokenization.
# None  -> the raw sequence is passed to the tokenizer unchanged.
# K > 6 -> all possible k-mers over A/C/T/G are added to the tokenizer vocabulary.
K = 6
# 1 -> overlapping k-mers (window moves by one base);
# any other value -> non-overlapping k-mers (window moves by K).
STRIDE = 1


# Alternative setup: SentencePiece tokenizer, no k-mer splitting.
# MODEL_NAME = "Vlasta/DNADebertaSentencepiece30k"
# TOKENIZER_NAME = "Vlasta/DNADebertaSentencepiece30k"
# K = None
# STRIDE = None



# Each entry is a (dataset name, dataset version) pair.
# All datasets
# DATASETS = [('demo_coding_vs_intergenomic_seqs', 0),
#  ('demo_human_or_worm', 0), ('human_enhancers_cohn', 0), ('human_enhancers_ensembl', 0),
#  ('human_ensembl_regulatory', 0), ('human_nontata_promoters', 0), ('human_ocr_ensembl', 0)]

# Quick check dataset
# DATASETS = [('demo_human_or_worm', 0)]


# Binary classification datasets (without human_ensembl_regulatory)
DATASETS = [('demo_coding_vs_intergenomic_seqs', 0),
 ('demo_human_or_worm', 0), ('human_enhancers_cohn', 0), ('human_enhancers_ensembl', 0),
  ('human_nontata_promoters', 0), ('human_ocr_ensembl', 0), ('drosophila_enhancers_stark', 0)]
# DATASETS = [('human_enhancers_cohn', 0), ('human_nontata_promoters', 0)]


# Root directory the downloaded benchmark datasets are read from.
# NOTE(review): absolute, user-specific path; the download calls in this notebook do not
# receive it, so it presumably has to match the genomic_benchmarks default location — confirm.
BENCHMARKS_FOLDER = '/home/jovyan/.genomic_benchmarks'
# If the download server (presumably Ensembl) refuses the connection
# ("[Errno 104] Connection reset by peer"), download with use_cloud_cache=True.
USE_CLOUD_CACHE = True
# If less than 1, each sequence is kept with this probability (random thinning);
# a falsy value or exactly 1 keeps every sequence.
DATASET_THINING = 1 

# Per-device batch size for both training and evaluation.
BATCH_SIZE = 32
# Gradient accumulation steps (passed as gradient_accumulation_steps).
ACCUMULATION = 2
LEARNING_RATE = 1e-5
# Upper bound on training epochs; early stopping is configured separately.
EPOCHS = 100
# Number of independent fine-tuning runs per dataset.
RUNS = 5

print(DATASETS)

[('demo_coding_vs_intergenomic_seqs', 0), ('demo_human_or_worm', 0), ('human_enhancers_cohn', 0), ('human_enhancers_ensembl', 0), ('human_nontata_promoters', 0), ('human_ocr_ensembl', 0), ('drosophila_enhancers_stark', 0)]


In [5]:
# randrange is needed by get_trainargs(); importing it here keeps this cell
# self-contained instead of relying on an import made in a later cell.
from random import randrange

from transformers import TrainingArguments
from transformers import EarlyStoppingCallback
from transformers.integrations import CometCallback

# Linear warm-up over the first 5% of training (5 epochs for a full 100-epoch run).
warmup_ratio = 0.05
# No warm-up when the model is trained from randomly initialized weights.
if RANDOMIZE_WEIGHTS:
    warmup_ratio = 0


def get_trainargs():
    """Build a fresh TrainingArguments object for one fine-tuning run.

    Hyper-parameters come from the notebook-level constants (LEARNING_RATE,
    BATCH_SIZE, ACCUMULATION, EPOCHS) and from ``warmup_ratio``. Every call draws
    a new random seed in [1, 10000], so repeated runs differ in initialization
    and data order.

    Returns:
        TrainingArguments writing checkpoints to ``outputs``, evaluating and
        saving once per epoch, and reloading the best checkpoint when training ends.
    """
    return TrainingArguments(
        'outputs',
        learning_rate=LEARNING_RATE,
        warmup_ratio=warmup_ratio,
        lr_scheduler_type='linear',
        fp16=True,
        evaluation_strategy="epoch",
        per_device_train_batch_size=BATCH_SIZE,
        per_device_eval_batch_size=BATCH_SIZE,
        gradient_accumulation_steps=ACCUMULATION,
        num_train_epochs=EPOCHS,
        weight_decay=0.01,
        # Saving every epoch is required by load_best_model_at_end / early stopping.
        save_strategy='epoch',
        # save_strategy='no',
        # A different seed for every run.
        seed=randrange(1, 10001),
        # Built-in reporting integrations are off; Comet logging is done by the
        # explicit CometCallback below.
        report_to='none',
        load_best_model_at_end=True,
    )


# Early stopping: stop once the monitored metric has not improved for 5 evaluations
# (one evaluation per epoch with the arguments above).
# NOTE(review): these callback objects keep internal state, so reusing the very same
# instances for several Trainer objects shares that state between runs.
callbacks = [
    EarlyStoppingCallback(early_stopping_patience=5, early_stopping_threshold=0.0),
    CometCallback()
]

In [6]:
from itertools import product
from transformers import AutoTokenizer

# Load the tokenizer selected in the parameters cell.
tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_NAME)

# For k-mers longer than 6, register every possible k-mer over the DNA alphabet
# as an additional token (the alphabet order A, C, T, G defines the token order).
if K is not None and K > 6:
    alphabet = ('A', 'C', 'T', 'G')
    vocab = [''.join(kmer) for kmer in product(alphabet, repeat=K)]
    tokenizer.add_tokens(vocab)

In [7]:
def kmers_strideK(s, k=K):
    """Split ``s`` into consecutive, non-overlapping k-mers.

    A trailing remainder shorter than ``k`` is dropped; a sequence shorter than
    ``k`` yields an empty list.
    """
    # Last admissible start is len(s) - k, so the range stops at len(s) - k + 1.
    return [s[start:start + k] for start in range(0, len(s) - k + 1, k)]

def kmers_stride1(s, k=K):
    """Return all overlapping k-mers of ``s`` (sliding window moved one base at a time).

    A sequence shorter than ``k`` yields an empty list.
    """
    window_count = len(s) - k + 1
    return [s[start:start + k] for start in range(window_count)]

# Pick the k-mer splitter: overlapping windows for STRIDE == 1,
# non-overlapping windows of length K for any other value.
kmers = kmers_stride1 if STRIDE == 1 else kmers_strideK

# Function used for the actual tokenization; which variant gets defined depends
# on whether k-mer splitting is enabled (K set) when this cell is run.
if K is not None:
    def tok_func(x):
        """Tokenize record ``x``: split x["seq"] into space-separated k-mers first."""
        kmer_text = " ".join(kmers(x["seq"]))
        return tokenizer(kmer_text, truncation=True)
else:
    def tok_func(x):
        """Tokenize record ``x`` by passing the raw x["seq"] string to the tokenizer."""
        return tokenizer(x["seq"], truncation=True)

# Smoke test: tokenize one short sequence, show the encoding, then decode it
# back so the k-mer split can be inspected by eye.
example = tok_func({"seq": "ATGGAAAGAGGCACCATTCT"})
print(example)
tokenizer.decode(example["input_ids"])

{'input_ids': [2, 501, 1989, 3848, 3089, 56, 212, 835, 3325, 999, 3983, 3629, 2214, 650, 2587, 2142, 3], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}


'[CLS] ATGGAA TGGAAA GGAAAG GAAAGA AAAGAG AAGAGG AGAGGC GAGGCA AGGCAC GGCACC GCACCA CACCAT ACCATT CCATTC CATTCT [SEP]'

## Download benchmark datasets and tokenizer

In [8]:
from genomic_benchmarks.loc2seq import download_dataset
from genomic_benchmarks.data_check.info import is_downloaded
from pathlib import Path
from tqdm.autonotebook import tqdm

# Fetch every configured benchmark dataset that is not available locally yet.
for dataset_name, dataset_version in tqdm(DATASETS):
    if is_downloaded(dataset_name):
        continue
    download_dataset(
        dataset_name,
        version=dataset_version,
        use_cloud_cache=USE_CLOUD_CACHE,
    )

# Root directory the per-dataset folders are read from later on.
benchmark_root = Path(BENCHMARKS_FOLDER)

  0%|          | 0/7 [00:00<?, ?it/s]

## Function to extract dataframe metrics row from training logs

In [9]:
def get_log_from_history(history, dataset_name):
    """Condense a trainer log history into one metrics row.

    Args:
        history: list of log dicts. Entries containing 'test_loss' are test
            evaluations, entries containing 'eval_loss' are validation evaluations.
        dataset_name: stored in the row under 'dataset'.

    Returns:
        dict with the metrics of the first test entry plus the epoch and the full
        log entry of the validation evaluation with the lowest 'eval_loss'.

    Raises:
        IndexError: if no entry contains 'test_loss'.
        KeyError: if the test entry lacks one of the expected metrics.
        ValueError: if no entry contains 'eval_loss'.
    """
    validation_logs = [entry for entry in history if 'eval_loss' in entry]
    # Only the first test evaluation is reported.
    test_log = [entry for entry in history if 'test_loss' in entry][0]

    # (row column, key in the test log), in the column order of the resulting row.
    # rocauc_0 / rocauc_1 are the macro- and weighted-average ROC AUC respectively.
    column_sources = (
        ('test_acc', 'test_accuracy'),
        ('test_f1', 'test_f1'),
        ('test_loss', 'test_loss'),
        ('test_precision', 'test_precision'),
        ('test_recall', 'test_recall'),
        ('test_auroc_macro', 'test_rocauc_0_roc_auc'),
        ('test_auroc_weighted', 'test_rocauc_1_roc_auc'),
        ('test_pr_auc', 'test_pr_auc'),
    )
    row = {'dataset': dataset_name}
    for column, metric_key in column_sources:
        row[column] = test_log[metric_key]

    # Validation evaluation with the lowest loss (the first one on ties).
    best_validation = min(validation_logs, key=lambda entry: entry['eval_loss'])
    row['min_valid_loss_epoch'] = best_validation['epoch']
    row['min_valid_loss_log'] = best_validation
    return row

## Looping through datasets, fine-tuning the model for each of them, logging metrics

In [10]:
import evaluate
# Metric modules that need explicit loading. The order of the two roc_auc
# modules matters for logging: macro first, then weighted, because
# get_log_from_history reads them as rocauc_0 (macro) and rocauc_1 (weighted).
roc_auc_macro_metric = evaluate.load('roc_auc', average='macro')
roc_auc_weighted_metric = evaluate.load('roc_auc', average='weighted')
pr_auc_metric = evaluate.load("Vlasta/pr_auc")

# All metrics computed together for the binary classification datasets.
binary_metrics = evaluate.combine(
    ['accuracy', 'f1', 'recall', 'precision',
     roc_auc_macro_metric, roc_auc_weighted_metric, pr_auc_metric]
)
# binary_metrics.compute(references=[0,1,1,1], predictions=[0,0,1,1], prediction_scores=[0.4,0.3,0.6,0.9])


In [None]:
import pandas as pd
import numpy as np
from random import random, randrange
from transformers import AutoModelForSequenceClassification
from transformers import TrainingArguments, Trainer
from datasets import Dataset, DatasetDict, load_metric
import torch

def compute_metrics_binary(eval_preds):
    """Compute the combined binary-classification metrics for one evaluation.

    Args:
        eval_preds: pair ``(logits, labels)``. ``logits`` must be a NumPy array
            with the classes on the last axis (column 1 is read as label 1).

    Returns:
        Whatever ``binary_metrics.compute`` returns for the hard predictions,
        the reference labels and the label-1 probabilities.
    """
    raw_logits, true_labels = eval_preds
    # Softmax in float64 turns the logits into class probabilities.
    logits_f64 = torch.from_numpy(raw_logits).double()
    class_probs = torch.nn.functional.softmax(logits_f64, dim=-1).numpy()
    # Equivalent to the argmax of the raw logits.
    predicted_classes = np.argmax(class_probs, axis=-1)
    # Only the probability assigned to label 1 is used as the prediction score.
    positive_scores = class_probs[:, 1]
    return binary_metrics.compute(
        predictions=predicted_classes,
        references=true_labels,
        prediction_scores=positive_scores,
    )
    
#TODO human_ensembl_regulatory dataset multilabel metrics
def compute_metrics_multi(eval_preds):
    """Compute accuracy for datasets with more than two classes.

    Uses ``evaluate.load`` instead of the deprecated ``datasets.load_metric``,
    matching how the binary metrics of this notebook are loaded.

    Args:
        eval_preds: pair ``(logits, labels)`` with the classes on the last axis
            of ``logits``.

    Returns:
        The result of the accuracy metric's ``compute`` call.
    """
    # Local import keeps the function usable even if the metrics cell was not run.
    import evaluate

    metric = evaluate.load("accuracy")
    logits, labels = eval_preds
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

import copy

# Collected results: one metrics row (built by get_log_from_history) per fine-tuning run.
outputs = []

for dataset_name, dataset_version in tqdm(DATASETS):
    print(dataset_name)
    # Class names are the entries of the train split directory; sorting fixes the
    # class -> integer id mapping used below.
    labels = sorted([x.stem for x in (benchmark_root / dataset_name / 'train').iterdir()])

    # "<label> <file stem>" -> (split, integer label, sequence text)
    tmp_dict = {}

    for split in ['train', 'test']:
        for nlabel, label in enumerate(labels):
            for f in (benchmark_root / dataset_name / split / label).glob('*.txt'):
                txt = f.read_text()
                # Keep everything when thinning is off (falsy or exactly 1); otherwise keep
                # each sequence with probability DATASET_THINING. Thanks to short-circuiting,
                # random() is only drawn when thinning is active.
                if not DATASET_THINING or DATASET_THINING == 1 or random() < DATASET_THINING:
                    tmp_dict[f"{label} {f.stem}"] = (split, nlabel, txt)

    # One row per sequence with columns dset (split), cat (integer label), seq (text).
    df = pd.DataFrame.from_dict(tmp_dict).T.rename(columns = {0: "dset", 1: "cat", 2: "seq"})

    ds = Dataset.from_pandas(df)

    # Tokenize record by record; the raw sequence and the pandas index column are dropped.
    tok_ds = ds.map(tok_func, batched=False, remove_columns=['__index_level_0__', 'seq'])
    tok_ds = tok_ds.rename_columns({'cat':'labels'})

    dds = DatasetDict({
        'train': tok_ds.filter(lambda x: x["dset"] == "train").remove_columns('dset'),
        'test':  tok_ds.filter(lambda x: x["dset"] == "test").remove_columns('dset')
    })
    # Hold out 20% of the training split for validation (fixed seed -> same split in every run).
    train_valid_split = dds['train'].train_test_split(test_size=0.2, shuffle=True, seed=42)
    dds['train']=train_valid_split['train']
    dds['valid']=train_valid_split['test']

    compute_metrics = compute_metrics_binary if len(labels) == 2 else compute_metrics_multi

    for _ in range(RUNS):
        model_cls = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=len(labels))
        if RANDOMIZE_WEIGHTS:
            # model_cls.init_weights() #Alternative
            # Same architecture, but built from the config only, i.e. without pretrained weights.
            model_cls = AutoModelForSequenceClassification.from_config(model_cls.config)

        args = get_trainargs()

        # The callbacks are stateful (early-stopping counter, Comet initialization flag),
        # so every Trainer gets its own copy of the pristine instances instead of sharing
        # one set across all runs.
        trainer = Trainer(model_cls, args, train_dataset=dds['train'], eval_dataset=dds['valid'],
                          tokenizer=tokenizer, compute_metrics=compute_metrics,
                          callbacks=copy.deepcopy(callbacks))
        trainer.train()
        # The test metrics are logged with a "test_" prefix; get_log_from_history picks
        # them up from the log history.
        # NOTE(review): the "did not find eval_loss so early stopping is disabled" warning
        # in the logs appears to come from this call (its metrics are prefixed test_, not
        # eval_) and not from training itself — confirm.
        trainer.evaluate(dds['test'], metric_key_prefix='test')
        training_log = get_log_from_history(trainer.state.log_history, dataset_name=dataset_name)
        outputs.append(training_log)
  

  0%|          | 0/7 [00:00<?, ?it/s]

demo_coding_vs_intergenomic_seqs


  0%|          | 0/100000 [00:00<?, ?ex/s]

  0%|          | 0/100 [00:00<?, ?ba/s]

  0%|          | 0/100 [00:00<?, ?ba/s]

Some weights of the model checkpoint at simecek/DNADebertaK6_Fruitfly were not used when initializing DebertaForSequenceClassification: ['cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.predictions.decoder.bias', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing DebertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DebertaForSequenceClassification were not initialized from the model checkpoint at sim

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Epoch,Training Loss,Validation Loss,Accuracy,F1,Recall,Precision,Rocauc 0 Roc Auc,Rocauc 1 Roc Auc,Pr Auc
0,0.6175,0.330901,0.868733,0.862912,0.826377,0.902826,0.945712,0.945712,0.941956
1,0.2986,0.264078,0.898067,0.897568,0.893319,0.901858,0.960293,0.960293,0.957763
2,0.2492,0.242536,0.9052,0.90401,0.892919,0.915379,0.966571,0.966571,0.964387
3,0.2313,0.244624,0.907733,0.905088,0.879984,0.931667,0.969432,0.969432,0.967455
4,0.2113,0.233966,0.9114,0.909327,0.888652,0.930986,0.971113,0.971113,0.969263
5,0.1993,0.221742,0.912933,0.912184,0.904521,0.919978,0.972029,0.972029,0.970105
6,0.1768,0.225422,0.917667,0.918055,0.922523,0.913629,0.973491,0.973491,0.971877
7,0.1668,0.264795,0.904467,0.899361,0.853847,0.95,0.973135,0.973135,0.971662
8,0.1393,0.263028,0.914,0.911813,0.889319,0.935475,0.972986,0.972986,0.972204
9,0.1247,0.242702,0.916133,0.914538,0.897586,0.932142,0.972365,0.972365,0.971432


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


COMET INFO: CUDA_VISIBLE_DEVICES is unset, defaulting to all devices
COMET INFO: devices to report: ['GPU-ba82732c-f75d-0cd8-1d2f-4b5ecd1b399a']
COMET INFO: CUDA_VISIBLE_DEVICES is unset, defaulting to all devices
COMET INFO: devices to report: ['GPU-ba82732c-f75d-0cd8-1d2f-4b5ecd1b399a']
COMET INFO: CUDA_VISIBLE_DEVICES is unset, defaulting to all devices
COMET INFO: devices to report: ['GPU-ba82732c-f75d-0cd8-1d2f-4b5ecd1b399a']
***** Running Evaluation *****
  Num examples = 15000
  Batch size = 32
Saving model checkpoint to outputs/checkpoint-937
Configuration saved in outputs/checkpoint-937/config.json
Model weights saved in outputs/checkpoint-937/pytorch_model.bin
tokenizer config file saved in outputs/checkpoint-937/tokenizer_config.json
Special tokens file saved in outputs/checkpoint-937/special_tokens_map.json
COMET INFO: CUDA_VISIBLE_DEVICES is unset, defaulting to all devices
COMET INFO: devices to report: ['GPU-ba82732c-f75d-0cd8-1d2f-4b5ecd1b399a']
COMET INFO: CUDA_VISIBLE

early stopping required metric_for_best_model, but did not find eval_loss so early stopping is disabled
loading configuration file config.json from cache at /home/jovyan/.cache/huggingface/hub/models--simecek--DNADebertaK6_Fruitfly/snapshots/3aafc682f807513dadea16f3e8c5838d56f486f9/config.json
Model config DebertaConfig {
  "_name_or_path": "simecek/DNADebertaK6_Fruitfly",
  "architectures": [
    "DebertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-07,
  "max_position_embeddings": 512,
  "max_relative_positions": -1,
  "model_type": "deberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 6,
  "pad_token_id": 0,
  "pooler_dropout": 0,
  "pooler_hidden_act": "gelu",
  "pooler_hidden_size": 768,
  "pos_att_type": null,
  "position_biased_input": true,
  "relative_attention": false,
  "torch_dtype": "float32",
  "tr

Epoch,Training Loss,Validation Loss,Accuracy,F1,Recall,Precision,Rocauc 0 Roc Auc,Rocauc 1 Roc Auc,Pr Auc
0,0.6268,0.324624,0.874533,0.872752,0.860648,0.885201,0.946032,0.946032,0.942305
1,0.3013,0.261328,0.896067,0.898045,0.915589,0.88116,0.961111,0.961111,0.95906
2,0.2515,0.236621,0.908267,0.909019,0.916656,0.901508,0.968355,0.968355,0.966634
3,0.2226,0.219329,0.915733,0.914606,0.902654,0.926879,0.97328,0.97328,0.970857
4,0.1924,0.201566,0.920667,0.920337,0.916656,0.924049,0.97642,0.97642,0.974286
5,0.1728,0.199575,0.922667,0.923924,0.939325,0.909021,0.978054,0.978054,0.975617
6,0.1516,0.278584,0.903133,0.896192,0.836378,0.96522,0.976794,0.976794,0.975874
7,0.1375,0.211076,0.927133,0.926422,0.917589,0.935427,0.979339,0.979339,0.977999
8,0.1174,0.263788,0.9146,0.911023,0.874517,0.95071,0.976773,0.976773,0.975895
9,0.104,0.292407,0.9128,0.908736,0.868382,0.953022,0.976241,0.976241,0.975414


***** Running Evaluation *****
  Num examples = 15000
  Batch size = 32
Saving model checkpoint to outputs/checkpoint-937
Configuration saved in outputs/checkpoint-937/config.json
Model weights saved in outputs/checkpoint-937/pytorch_model.bin
tokenizer config file saved in outputs/checkpoint-937/tokenizer_config.json
Special tokens file saved in outputs/checkpoint-937/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 15000
  Batch size = 32
Saving model checkpoint to outputs/checkpoint-1874
Configuration saved in outputs/checkpoint-1874/config.json
Model weights saved in outputs/checkpoint-1874/pytorch_model.bin
tokenizer config file saved in outputs/checkpoint-1874/tokenizer_config.json
Special tokens file saved in outputs/checkpoint-1874/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 15000
  Batch size = 32
Saving model checkpoint to outputs/checkpoint-2811
Configuration saved in outputs/checkpoint-2811/config.json
Model weights saved i

early stopping required metric_for_best_model, but did not find eval_loss so early stopping is disabled
loading configuration file config.json from cache at /home/jovyan/.cache/huggingface/hub/models--simecek--DNADebertaK6_Fruitfly/snapshots/3aafc682f807513dadea16f3e8c5838d56f486f9/config.json
Model config DebertaConfig {
  "_name_or_path": "simecek/DNADebertaK6_Fruitfly",
  "architectures": [
    "DebertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-07,
  "max_position_embeddings": 512,
  "max_relative_positions": -1,
  "model_type": "deberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 6,
  "pad_token_id": 0,
  "pooler_dropout": 0,
  "pooler_hidden_act": "gelu",
  "pooler_hidden_size": 768,
  "pos_att_type": null,
  "position_biased_input": true,
  "relative_attention": false,
  "torch_dtype": "float32",
  "tr

Epoch,Training Loss,Validation Loss,Accuracy,F1,Recall,Precision,Rocauc 0 Roc Auc,Rocauc 1 Roc Auc,Pr Auc
0,0.6207,0.324023,0.872933,0.876937,0.905587,0.850044,0.945309,0.945309,0.939285
1,0.2974,0.26101,0.8968,0.894363,0.87385,0.915863,0.961364,0.961364,0.958313
2,0.2517,0.230801,0.9096,0.909455,0.908121,0.910793,0.96803,0.96803,0.964966
3,0.2227,0.2122,0.918867,0.918064,0.909188,0.927114,0.973654,0.973654,0.970412
4,0.1997,0.202536,0.922133,0.921262,0.911188,0.931561,0.976417,0.976417,0.97332
5,0.1775,0.210323,0.9238,0.92243,0.906254,0.939193,0.97798,0.97798,0.974917
6,0.1516,0.215086,0.925267,0.923224,0.898787,0.949028,0.979226,0.979226,0.976722
7,0.1364,0.25984,0.9186,0.915543,0.882518,0.951135,0.978295,0.978295,0.976034
8,0.1192,0.312309,0.9062,0.900685,0.85078,0.956809,0.976272,0.976272,0.974489
9,0.1063,0.308808,0.911,0.90627,0.860648,0.956999,0.977276,0.977276,0.97533


***** Running Evaluation *****
  Num examples = 15000
  Batch size = 32
Saving model checkpoint to outputs/checkpoint-937
Configuration saved in outputs/checkpoint-937/config.json
Model weights saved in outputs/checkpoint-937/pytorch_model.bin
tokenizer config file saved in outputs/checkpoint-937/tokenizer_config.json
Special tokens file saved in outputs/checkpoint-937/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 15000
  Batch size = 32
Saving model checkpoint to outputs/checkpoint-1874
Configuration saved in outputs/checkpoint-1874/config.json
Model weights saved in outputs/checkpoint-1874/pytorch_model.bin
tokenizer config file saved in outputs/checkpoint-1874/tokenizer_config.json
Special tokens file saved in outputs/checkpoint-1874/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 15000
  Batch size = 32
Saving model checkpoint to outputs/checkpoint-2811
Configuration saved in outputs/checkpoint-2811/config.json
Model weights saved i

early stopping required metric_for_best_model, but did not find eval_loss so early stopping is disabled
loading configuration file config.json from cache at /home/jovyan/.cache/huggingface/hub/models--simecek--DNADebertaK6_Fruitfly/snapshots/3aafc682f807513dadea16f3e8c5838d56f486f9/config.json
Model config DebertaConfig {
  "_name_or_path": "simecek/DNADebertaK6_Fruitfly",
  "architectures": [
    "DebertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-07,
  "max_position_embeddings": 512,
  "max_relative_positions": -1,
  "model_type": "deberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 6,
  "pad_token_id": 0,
  "pooler_dropout": 0,
  "pooler_hidden_act": "gelu",
  "pooler_hidden_size": 768,
  "pos_att_type": null,
  "position_biased_input": true,
  "relative_attention": false,
  "torch_dtype": "float32",
  "tr

Epoch,Training Loss,Validation Loss,Accuracy,F1,Recall,Precision,Rocauc 0 Roc Auc,Rocauc 1 Roc Auc,Pr Auc
0,0.6187,0.32407,0.874267,0.875462,0.883985,0.867103,0.94644,0.94644,0.942539
1,0.3018,0.267643,0.896,0.897932,0.915055,0.881439,0.960862,0.960862,0.958917
2,0.2481,0.236298,0.906667,0.907493,0.915722,0.899411,0.967165,0.967165,0.965396
3,0.2269,0.226183,0.912533,0.911231,0.897986,0.924873,0.970841,0.970841,0.969055
4,0.2066,0.227623,0.913333,0.910812,0.885185,0.937968,0.974164,0.974164,0.972289
5,0.1793,0.207397,0.9206,0.919205,0.903454,0.935515,0.976579,0.976579,0.974442
6,0.1608,0.226605,0.917667,0.915405,0.891052,0.941127,0.976812,0.976812,0.974951
7,0.1373,0.21072,0.924133,0.924476,0.928791,0.920201,0.977216,0.977216,0.975617
8,0.12,0.235197,0.924,0.923305,0.915055,0.931704,0.976292,0.976292,0.974779
9,0.1071,0.329142,0.911867,0.907939,0.869316,0.950153,0.973528,0.973528,0.972567


***** Running Evaluation *****
  Num examples = 15000
  Batch size = 32
Saving model checkpoint to outputs/checkpoint-937
Configuration saved in outputs/checkpoint-937/config.json
Model weights saved in outputs/checkpoint-937/pytorch_model.bin
tokenizer config file saved in outputs/checkpoint-937/tokenizer_config.json
Special tokens file saved in outputs/checkpoint-937/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 15000
  Batch size = 32
Saving model checkpoint to outputs/checkpoint-1874
Configuration saved in outputs/checkpoint-1874/config.json
Model weights saved in outputs/checkpoint-1874/pytorch_model.bin
tokenizer config file saved in outputs/checkpoint-1874/tokenizer_config.json
Special tokens file saved in outputs/checkpoint-1874/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 15000
  Batch size = 32
Saving model checkpoint to outputs/checkpoint-2811
Configuration saved in outputs/checkpoint-2811/config.json
Model weights saved i

early stopping required metric_for_best_model, but did not find eval_loss so early stopping is disabled
loading configuration file config.json from cache at /home/jovyan/.cache/huggingface/hub/models--simecek--DNADebertaK6_Fruitfly/snapshots/3aafc682f807513dadea16f3e8c5838d56f486f9/config.json
Model config DebertaConfig {
  "_name_or_path": "simecek/DNADebertaK6_Fruitfly",
  "architectures": [
    "DebertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-07,
  "max_position_embeddings": 512,
  "max_relative_positions": -1,
  "model_type": "deberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 6,
  "pad_token_id": 0,
  "pooler_dropout": 0,
  "pooler_hidden_act": "gelu",
  "pooler_hidden_size": 768,
  "pos_att_type": null,
  "position_biased_input": true,
  "relative_attention": false,
  "torch_dtype": "float32",
  "tr

Epoch,Training Loss,Validation Loss,Accuracy,F1,Recall,Precision,Rocauc 0 Roc Auc,Rocauc 1 Roc Auc,Pr Auc
0,0.6207,0.325504,0.8736,0.876595,0.897986,0.856198,0.944432,0.944432,0.93836
1,0.3041,0.262938,0.8956,0.893744,0.87825,0.909794,0.960817,0.960817,0.958782
2,0.2529,0.23417,0.9096,0.910115,0.915455,0.904837,0.966808,0.966808,0.964169
3,0.227,0.220139,0.9134,0.912176,0.899587,0.925123,0.971261,0.971261,0.968458
4,0.2024,0.214711,0.919467,0.919057,0.914522,0.923636,0.974632,0.974632,0.971443
5,0.1846,0.233772,0.9158,0.912017,0.872916,0.954784,0.977221,0.977221,0.973996
6,0.1587,0.207611,0.925933,0.925049,0.914255,0.9361,0.977231,0.977231,0.974264
7,0.1385,0.232605,0.921933,0.919668,0.893853,0.947019,0.977467,0.977467,0.97451
8,0.1259,0.274683,0.918733,0.915459,0.880117,0.953757,0.97761,0.97761,0.975202
9,0.1086,0.268684,0.922933,0.921446,0.904121,0.939449,0.976374,0.976374,0.974241


***** Running Evaluation *****
  Num examples = 15000
  Batch size = 32
Saving model checkpoint to outputs/checkpoint-937
Configuration saved in outputs/checkpoint-937/config.json
Model weights saved in outputs/checkpoint-937/pytorch_model.bin
tokenizer config file saved in outputs/checkpoint-937/tokenizer_config.json
Special tokens file saved in outputs/checkpoint-937/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 15000
  Batch size = 32
Saving model checkpoint to outputs/checkpoint-1874
Configuration saved in outputs/checkpoint-1874/config.json
Model weights saved in outputs/checkpoint-1874/pytorch_model.bin
tokenizer config file saved in outputs/checkpoint-1874/tokenizer_config.json
Special tokens file saved in outputs/checkpoint-1874/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 15000
  Batch size = 32
Saving model checkpoint to outputs/checkpoint-2811
Configuration saved in outputs/checkpoint-2811/config.json
Model weights saved i

early stopping required metric_for_best_model, but did not find eval_loss so early stopping is disabled


demo_human_or_worm


  0%|          | 0/100000 [00:00<?, ?ex/s]

  0%|          | 0/100 [00:00<?, ?ba/s]

  0%|          | 0/100 [00:00<?, ?ba/s]

loading configuration file config.json from cache at /home/jovyan/.cache/huggingface/hub/models--simecek--DNADebertaK6_Fruitfly/snapshots/3aafc682f807513dadea16f3e8c5838d56f486f9/config.json
Model config DebertaConfig {
  "_name_or_path": "simecek/DNADebertaK6_Fruitfly",
  "architectures": [
    "DebertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-07,
  "max_position_embeddings": 512,
  "max_relative_positions": -1,
  "model_type": "deberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 6,
  "pad_token_id": 0,
  "pooler_dropout": 0,
  "pooler_hidden_act": "gelu",
  "pooler_hidden_size": 768,
  "pos_att_type": null,
  "position_biased_input": true,
  "relative_attention": false,
  "torch_dtype": "float32",
  "transformers_version": "4.24.0",
  "type_vocab_size": 0,
  "vocab_size": 4101
}

loading weights file pyto

Epoch,Training Loss,Validation Loss,Accuracy,F1,Recall,Precision,Rocauc 0 Roc Auc,Rocauc 1 Roc Auc,Pr Auc
0,0.6429,0.215421,0.930867,0.928841,0.90252,0.956743,0.981681,0.981681,0.983224
1,0.1877,0.14255,0.949467,0.948963,0.939725,0.958384,0.989264,0.989264,0.989899
2,0.1291,0.120158,0.956133,0.956481,0.964262,0.948826,0.99215,0.99215,0.992649
3,0.1075,0.108833,0.9592,0.959287,0.961462,0.957122,0.99334,0.99334,0.993764
4,0.0907,0.106786,0.962733,0.962304,0.95146,0.973397,0.994065,0.994065,0.994434
5,0.0775,0.104513,0.961733,0.962032,0.969729,0.954456,0.993999,0.993999,0.994389
6,0.0647,0.131376,0.960267,0.959741,0.947326,0.972485,0.99319,0.99319,0.993641
7,0.054,0.1527,0.954133,0.952558,0.921056,0.986292,0.993245,0.993245,0.99385
8,0.047,0.136166,0.962067,0.961891,0.957594,0.966227,0.992977,0.992977,0.993222
9,0.0368,0.164856,0.958667,0.958924,0.965062,0.952864,0.992548,0.992548,0.992902


***** Running Evaluation *****
  Num examples = 15000
  Batch size = 32
Saving model checkpoint to outputs/checkpoint-937
Configuration saved in outputs/checkpoint-937/config.json
Model weights saved in outputs/checkpoint-937/pytorch_model.bin
tokenizer config file saved in outputs/checkpoint-937/tokenizer_config.json
Special tokens file saved in outputs/checkpoint-937/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 15000
  Batch size = 32
Saving model checkpoint to outputs/checkpoint-1874
Configuration saved in outputs/checkpoint-1874/config.json
Model weights saved in outputs/checkpoint-1874/pytorch_model.bin
tokenizer config file saved in outputs/checkpoint-1874/tokenizer_config.json
Special tokens file saved in outputs/checkpoint-1874/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 15000
  Batch size = 32
Saving model checkpoint to outputs/checkpoint-2811
Configuration saved in outputs/checkpoint-2811/config.json
Model weights saved i

early stopping required metric_for_best_model, but did not find eval_loss so early stopping is disabled
loading configuration file config.json from cache at /home/jovyan/.cache/huggingface/hub/models--simecek--DNADebertaK6_Fruitfly/snapshots/3aafc682f807513dadea16f3e8c5838d56f486f9/config.json
Model config DebertaConfig {
  "_name_or_path": "simecek/DNADebertaK6_Fruitfly",
  "architectures": [
    "DebertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-07,
  "max_position_embeddings": 512,
  "max_relative_positions": -1,
  "model_type": "deberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 6,
  "pad_token_id": 0,
  "pooler_dropout": 0,
  "pooler_hidden_act": "gelu",
  "pooler_hidden_size": 768,
  "pos_att_type": null,
  "position_biased_input": true,
  "relative_attention": false,
  "torch_dtype": "float32",
  "tr

Epoch,Training Loss,Validation Loss,Accuracy,F1,Recall,Precision,Rocauc 0 Roc Auc,Rocauc 1 Roc Auc,Pr Auc
0,0.6407,0.220134,0.9314,0.92967,0.906921,0.953589,0.980463,0.980463,0.982533
1,0.1892,0.151869,0.9482,0.946865,0.92319,0.971786,0.989365,0.989365,0.990173
2,0.1296,0.1161,0.957,0.957174,0.961195,0.953187,0.992233,0.992233,0.992817
3,0.1046,0.1063,0.9624,0.962132,0.955461,0.968898,0.993541,0.993541,0.993951
4,0.094,0.121842,0.962067,0.961619,0.950527,0.972973,0.993783,0.993783,0.994287
5,0.0798,0.11727,0.959733,0.958984,0.941592,0.977031,0.993347,0.993347,0.993936
6,0.0683,0.112773,0.961667,0.961448,0.956127,0.966828,0.993776,0.993776,0.994094
7,0.0555,0.132312,0.962133,0.961946,0.957328,0.966608,0.99342,0.99342,0.99382
8,0.0457,0.147588,0.960533,0.95994,0.945859,0.974447,0.993527,0.993527,0.993928


***** Running Evaluation *****
  Num examples = 15000
  Batch size = 32
Saving model checkpoint to outputs/checkpoint-937
Configuration saved in outputs/checkpoint-937/config.json
Model weights saved in outputs/checkpoint-937/pytorch_model.bin
tokenizer config file saved in outputs/checkpoint-937/tokenizer_config.json
Special tokens file saved in outputs/checkpoint-937/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 15000
  Batch size = 32
Saving model checkpoint to outputs/checkpoint-1874
Configuration saved in outputs/checkpoint-1874/config.json
Model weights saved in outputs/checkpoint-1874/pytorch_model.bin
tokenizer config file saved in outputs/checkpoint-1874/tokenizer_config.json
Special tokens file saved in outputs/checkpoint-1874/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 15000
  Batch size = 32
Saving model checkpoint to outputs/checkpoint-2811
Configuration saved in outputs/checkpoint-2811/config.json
Model weights saved i

early stopping required metric_for_best_model, but did not find eval_loss so early stopping is disabled
loading configuration file config.json from cache at /home/jovyan/.cache/huggingface/hub/models--simecek--DNADebertaK6_Fruitfly/snapshots/3aafc682f807513dadea16f3e8c5838d56f486f9/config.json
Model config DebertaConfig {
  "_name_or_path": "simecek/DNADebertaK6_Fruitfly",
  "architectures": [
    "DebertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-07,
  "max_position_embeddings": 512,
  "max_relative_positions": -1,
  "model_type": "deberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 6,
  "pad_token_id": 0,
  "pooler_dropout": 0,
  "pooler_hidden_act": "gelu",
  "pooler_hidden_size": 768,
  "pos_att_type": null,
  "position_biased_input": true,
  "relative_attention": false,
  "torch_dtype": "float32",
  "tr

Epoch,Training Loss,Validation Loss,Accuracy,F1,Recall,Precision,Rocauc 0 Roc Auc,Rocauc 1 Roc Auc,Pr Auc
0,0.6194,0.239922,0.924467,0.924562,0.925857,0.923271,0.975236,0.975236,0.976048
1,0.2019,0.14599,0.949267,0.948431,0.933191,0.964177,0.988502,0.988502,0.989489
2,0.1323,0.118514,0.956333,0.95612,0.951594,0.960689,0.991545,0.991545,0.992234
3,0.11,0.116343,0.958067,0.958621,0.971596,0.945988,0.993064,0.993064,0.993583
4,0.0917,0.116634,0.960733,0.961114,0.970663,0.951752,0.993571,0.993571,0.993981
5,0.082,0.104252,0.963733,0.963534,0.958394,0.968729,0.993924,0.993924,0.994322
6,0.0691,0.117111,0.964067,0.963774,0.956127,0.971545,0.993718,0.993718,0.994217
7,0.0565,0.121745,0.960267,0.960618,0.969329,0.952063,0.993416,0.993416,0.993764
8,0.0483,0.161201,0.959267,0.958658,0.944659,0.973077,0.992304,0.992304,0.993224
9,0.0392,0.125952,0.960267,0.960304,0.961328,0.959281,0.992521,0.992521,0.992775


***** Running Evaluation *****
  Num examples = 15000
  Batch size = 32
Saving model checkpoint to outputs/checkpoint-937
Configuration saved in outputs/checkpoint-937/config.json
Model weights saved in outputs/checkpoint-937/pytorch_model.bin
tokenizer config file saved in outputs/checkpoint-937/tokenizer_config.json
Special tokens file saved in outputs/checkpoint-937/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 15000
  Batch size = 32
Saving model checkpoint to outputs/checkpoint-1874
Configuration saved in outputs/checkpoint-1874/config.json
Model weights saved in outputs/checkpoint-1874/pytorch_model.bin
tokenizer config file saved in outputs/checkpoint-1874/tokenizer_config.json
Special tokens file saved in outputs/checkpoint-1874/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 15000
  Batch size = 32
Saving model checkpoint to outputs/checkpoint-2811
Configuration saved in outputs/checkpoint-2811/config.json
Model weights saved i

early stopping required metric_for_best_model, but did not find eval_loss so early stopping is disabled
loading configuration file config.json from cache at /home/jovyan/.cache/huggingface/hub/models--simecek--DNADebertaK6_Fruitfly/snapshots/3aafc682f807513dadea16f3e8c5838d56f486f9/config.json
Model config DebertaConfig {
  "_name_or_path": "simecek/DNADebertaK6_Fruitfly",
  "architectures": [
    "DebertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-07,
  "max_position_embeddings": 512,
  "max_relative_positions": -1,
  "model_type": "deberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 6,
  "pad_token_id": 0,
  "pooler_dropout": 0,
  "pooler_hidden_act": "gelu",
  "pooler_hidden_size": 768,
  "pos_att_type": null,
  "position_biased_input": true,
  "relative_attention": false,
  "torch_dtype": "float32",
  "tr

Epoch,Training Loss,Validation Loss,Accuracy,F1,Recall,Precision,Rocauc 0 Roc Auc,Rocauc 1 Roc Auc,Pr Auc
0,0.6355,0.225962,0.926933,0.927436,0.933991,0.920973,0.978013,0.978013,0.979138
1,0.1957,0.143091,0.95,0.949624,0.942659,0.956692,0.988539,0.988539,0.989522
2,0.1353,0.122428,0.9548,0.95523,0.964529,0.946109,0.991259,0.991259,0.991826
3,0.1076,0.109566,0.961533,0.961163,0.952127,0.970372,0.993241,0.993241,0.993676
4,0.093,0.102677,0.962867,0.96267,0.957728,0.967664,0.993684,0.993684,0.994065
5,0.0783,0.105661,0.9618,0.962061,0.968796,0.955418,0.993917,0.993917,0.994209
6,0.0684,0.13127,0.9634,0.963417,0.963995,0.96284,0.993938,0.993938,0.994184
7,0.0573,0.13383,0.960933,0.961125,0.965995,0.956304,0.993072,0.993072,0.993152
8,0.0478,0.141131,0.9612,0.961122,0.959328,0.962923,0.993149,0.993149,0.993423
9,0.0409,0.156979,0.959267,0.959726,0.970796,0.948905,0.992991,0.992991,0.993301


***** Running Evaluation *****
  Num examples = 15000
  Batch size = 32
Saving model checkpoint to outputs/checkpoint-937
Configuration saved in outputs/checkpoint-937/config.json
Model weights saved in outputs/checkpoint-937/pytorch_model.bin
tokenizer config file saved in outputs/checkpoint-937/tokenizer_config.json
Special tokens file saved in outputs/checkpoint-937/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 15000
  Batch size = 32
Saving model checkpoint to outputs/checkpoint-1874
Configuration saved in outputs/checkpoint-1874/config.json
Model weights saved in outputs/checkpoint-1874/pytorch_model.bin
tokenizer config file saved in outputs/checkpoint-1874/tokenizer_config.json
Special tokens file saved in outputs/checkpoint-1874/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 15000
  Batch size = 32
Saving model checkpoint to outputs/checkpoint-2811
Configuration saved in outputs/checkpoint-2811/config.json
Model weights saved i

early stopping required metric_for_best_model, but did not find eval_loss so early stopping is disabled
loading configuration file config.json from cache at /home/jovyan/.cache/huggingface/hub/models--simecek--DNADebertaK6_Fruitfly/snapshots/3aafc682f807513dadea16f3e8c5838d56f486f9/config.json
Model config DebertaConfig {
  "_name_or_path": "simecek/DNADebertaK6_Fruitfly",
  "architectures": [
    "DebertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-07,
  "max_position_embeddings": 512,
  "max_relative_positions": -1,
  "model_type": "deberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 6,
  "pad_token_id": 0,
  "pooler_dropout": 0,
  "pooler_hidden_act": "gelu",
  "pooler_hidden_size": 768,
  "pos_att_type": null,
  "position_biased_input": true,
  "relative_attention": false,
  "torch_dtype": "float32",
  "tr

Epoch,Training Loss,Validation Loss,Accuracy,F1,Recall,Precision,Rocauc 0 Roc Auc,Rocauc 1 Roc Auc,Pr Auc
0,0.6445,0.224111,0.924467,0.921259,0.883851,0.961974,0.98058,0.98058,0.982464
1,0.1854,0.153542,0.9458,0.944258,0.918256,0.971775,0.988971,0.988971,0.989628
2,0.1273,0.114404,0.9598,0.959473,0.95186,0.967209,0.992157,0.992157,0.992447
3,0.1063,0.107754,0.961333,0.961308,0.960795,0.961821,0.993413,0.993413,0.993793
4,0.0905,0.105966,0.9612,0.961313,0.964262,0.958383,0.993322,0.993322,0.99377
5,0.0794,0.111265,0.9626,0.962766,0.967196,0.958377,0.99381,0.99381,0.994285
6,0.0655,0.120035,0.962467,0.962167,0.954661,0.969791,0.993748,0.993748,0.994131
7,0.0551,0.123328,0.959867,0.960499,0.975997,0.945485,0.993202,0.993202,0.993543
8,0.0447,0.151023,0.960933,0.960379,0.94706,0.974078,0.993318,0.993318,0.993853
9,0.0384,0.142963,0.961667,0.961531,0.958261,0.964823,0.99329,0.99329,0.993611


***** Running Evaluation *****
  Num examples = 15000
  Batch size = 32
Saving model checkpoint to outputs/checkpoint-937
Configuration saved in outputs/checkpoint-937/config.json
Model weights saved in outputs/checkpoint-937/pytorch_model.bin
tokenizer config file saved in outputs/checkpoint-937/tokenizer_config.json
Special tokens file saved in outputs/checkpoint-937/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 15000
  Batch size = 32
Saving model checkpoint to outputs/checkpoint-1874
Configuration saved in outputs/checkpoint-1874/config.json
Model weights saved in outputs/checkpoint-1874/pytorch_model.bin
tokenizer config file saved in outputs/checkpoint-1874/tokenizer_config.json
Special tokens file saved in outputs/checkpoint-1874/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 15000
  Batch size = 32
Saving model checkpoint to outputs/checkpoint-2811
Configuration saved in outputs/checkpoint-2811/config.json
Model weights saved i

early stopping required metric_for_best_model, but did not find eval_loss so early stopping is disabled


human_enhancers_cohn


  0%|          | 0/27791 [00:00<?, ?ex/s]

  0%|          | 0/28 [00:00<?, ?ba/s]

  0%|          | 0/28 [00:00<?, ?ba/s]

loading configuration file config.json from cache at /home/jovyan/.cache/huggingface/hub/models--simecek--DNADebertaK6_Fruitfly/snapshots/3aafc682f807513dadea16f3e8c5838d56f486f9/config.json
Model config DebertaConfig {
  "_name_or_path": "simecek/DNADebertaK6_Fruitfly",
  "architectures": [
    "DebertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-07,
  "max_position_embeddings": 512,
  "max_relative_positions": -1,
  "model_type": "deberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 6,
  "pad_token_id": 0,
  "pooler_dropout": 0,
  "pooler_hidden_act": "gelu",
  "pooler_hidden_size": 768,
  "pos_att_type": null,
  "position_biased_input": true,
  "relative_attention": false,
  "torch_dtype": "float32",
  "transformers_version": "4.24.0",
  "type_vocab_size": 0,
  "vocab_size": 4101
}

loading weights file pyto

Epoch,Training Loss,Validation Loss,Accuracy,F1,Recall,Precision,Rocauc 0 Roc Auc,Rocauc 1 Roc Auc,Pr Auc
1,No log,0.621762,0.662269,0.649228,0.620476,0.680773,0.729653,0.729653,0.734542
2,0.636800,0.57971,0.69561,0.734907,0.837619,0.654633,0.785121,0.785121,0.785881
3,0.636800,0.543244,0.723435,0.709645,0.670952,0.753073,0.804551,0.804551,0.804379
4,0.545800,0.535621,0.725594,0.742458,0.785238,0.704099,0.809431,0.809431,0.807602
5,0.545800,0.537358,0.727752,0.714609,0.676667,0.757059,0.814192,0.814192,0.815135
6,0.505400,0.555676,0.717678,0.747262,0.828571,0.680485,0.814277,0.814277,0.81486
7,0.505400,0.566483,0.728232,0.728232,0.722857,0.733688,0.813637,0.813637,0.816963
8,0.465500,0.574008,0.712401,0.690101,0.635714,0.754664,0.804341,0.804341,0.810328
9,0.465500,0.604864,0.718398,0.735466,0.777143,0.698033,0.800882,0.800882,0.805049


***** Running Evaluation *****
  Num examples = 4169
  Batch size = 32
Saving model checkpoint to outputs/checkpoint-261
Configuration saved in outputs/checkpoint-261/config.json
Model weights saved in outputs/checkpoint-261/pytorch_model.bin
tokenizer config file saved in outputs/checkpoint-261/tokenizer_config.json
Special tokens file saved in outputs/checkpoint-261/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 4169
  Batch size = 32
Saving model checkpoint to outputs/checkpoint-522
Configuration saved in outputs/checkpoint-522/config.json
Model weights saved in outputs/checkpoint-522/pytorch_model.bin
tokenizer config file saved in outputs/checkpoint-522/tokenizer_config.json
Special tokens file saved in outputs/checkpoint-522/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 4169
  Batch size = 32
Saving model checkpoint to outputs/checkpoint-783
Configuration saved in outputs/checkpoint-783/config.json
Model weights saved in outputs/

early stopping required metric_for_best_model, but did not find eval_loss so early stopping is disabled
loading configuration file config.json from cache at /home/jovyan/.cache/huggingface/hub/models--simecek--DNADebertaK6_Fruitfly/snapshots/3aafc682f807513dadea16f3e8c5838d56f486f9/config.json
Model config DebertaConfig {
  "_name_or_path": "simecek/DNADebertaK6_Fruitfly",
  "architectures": [
    "DebertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-07,
  "max_position_embeddings": 512,
  "max_relative_positions": -1,
  "model_type": "deberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 6,
  "pad_token_id": 0,
  "pooler_dropout": 0,
  "pooler_hidden_act": "gelu",
  "pooler_hidden_size": 768,
  "pos_att_type": null,
  "position_biased_input": true,
  "relative_attention": false,
  "torch_dtype": "float32",
  "tr

Epoch,Training Loss,Validation Loss,Accuracy,F1,Recall,Precision,Rocauc 0 Roc Auc,Rocauc 1 Roc Auc,Pr Auc
1,No log,0.628999,0.663708,0.636222,0.58381,0.698974,0.734106,0.734106,0.738179
2,0.635800,0.555131,0.715519,0.710449,0.692857,0.728958,0.793794,0.793794,0.789777
3,0.635800,0.54481,0.719837,0.697879,0.642381,0.763873,0.806376,0.806376,0.803653
4,0.542500,0.53931,0.724154,0.718275,0.698095,0.739657,0.805115,0.805115,0.807136
5,0.542500,0.53378,0.735908,0.731398,0.71381,0.749875,0.813072,0.813072,0.812405
6,0.500300,0.54458,0.729671,0.754091,0.822857,0.695932,0.8187,0.8187,0.815878
7,0.500300,0.548472,0.723675,0.698902,0.636667,0.774623,0.81154,0.81154,0.8125
8,0.460300,0.563326,0.727752,0.716037,0.681429,0.754349,0.811165,0.811165,0.810649
9,0.460300,0.568579,0.733269,0.737364,0.743333,0.73149,0.807543,0.807543,0.806004
10,0.409500,0.618629,0.718398,0.703535,0.663333,0.748925,0.794917,0.794917,0.790879


***** Running Evaluation *****
  Num examples = 4169
  Batch size = 32
Saving model checkpoint to outputs/checkpoint-261
Configuration saved in outputs/checkpoint-261/config.json
Model weights saved in outputs/checkpoint-261/pytorch_model.bin
tokenizer config file saved in outputs/checkpoint-261/tokenizer_config.json
Special tokens file saved in outputs/checkpoint-261/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 4169
  Batch size = 32
Saving model checkpoint to outputs/checkpoint-522
Configuration saved in outputs/checkpoint-522/config.json
Model weights saved in outputs/checkpoint-522/pytorch_model.bin
tokenizer config file saved in outputs/checkpoint-522/tokenizer_config.json
Special tokens file saved in outputs/checkpoint-522/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 4169
  Batch size = 32
Saving model checkpoint to outputs/checkpoint-783
Configuration saved in outputs/checkpoint-783/config.json
Model weights saved in outputs/

early stopping required metric_for_best_model, but did not find eval_loss so early stopping is disabled
loading configuration file config.json from cache at /home/jovyan/.cache/huggingface/hub/models--simecek--DNADebertaK6_Fruitfly/snapshots/3aafc682f807513dadea16f3e8c5838d56f486f9/config.json
Model config DebertaConfig {
  "_name_or_path": "simecek/DNADebertaK6_Fruitfly",
  "architectures": [
    "DebertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-07,
  "max_position_embeddings": 512,
  "max_relative_positions": -1,
  "model_type": "deberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 6,
  "pad_token_id": 0,
  "pooler_dropout": 0,
  "pooler_hidden_act": "gelu",
  "pooler_hidden_size": 768,
  "pos_att_type": null,
  "position_biased_input": true,
  "relative_attention": false,
  "torch_dtype": "float32",
  "tr

Epoch,Training Loss,Validation Loss,Accuracy,F1,Recall,Precision,Rocauc 0 Roc Auc,Rocauc 1 Roc Auc,Pr Auc
1,No log,0.629157,0.655073,0.630524,0.584286,0.68471,0.724955,0.724955,0.726371
2,0.639700,0.567011,0.703766,0.714187,0.734762,0.694732,0.780946,0.780946,0.782946
3,0.639700,0.547895,0.71456,0.739834,0.805714,0.683913,0.801413,0.801413,0.802515
4,0.543800,0.536333,0.725594,0.727619,0.727619,0.727619,0.810063,0.810063,0.810051
5,0.543800,0.531435,0.733269,0.739944,0.753333,0.727022,0.812562,0.812562,0.812671
6,0.509600,0.546205,0.723915,0.744279,0.797619,0.697626,0.812878,0.812878,0.81334
7,0.509600,0.529515,0.726793,0.722804,0.707143,0.739174,0.814755,0.814755,0.815558
8,0.469100,0.563435,0.71408,0.746706,0.836667,0.674213,0.810184,0.810184,0.812889
9,0.469100,0.580916,0.719357,0.747082,0.822857,0.684086,0.807403,0.807403,0.809161
10,0.410900,0.638354,0.702327,0.740539,0.843333,0.660082,0.800056,0.800056,0.803816


***** Running Evaluation *****
  Num examples = 4169
  Batch size = 32
Saving model checkpoint to outputs/checkpoint-261
Configuration saved in outputs/checkpoint-261/config.json
Model weights saved in outputs/checkpoint-261/pytorch_model.bin
tokenizer config file saved in outputs/checkpoint-261/tokenizer_config.json
Special tokens file saved in outputs/checkpoint-261/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 4169
  Batch size = 32
Saving model checkpoint to outputs/checkpoint-522
Configuration saved in outputs/checkpoint-522/config.json
Model weights saved in outputs/checkpoint-522/pytorch_model.bin
tokenizer config file saved in outputs/checkpoint-522/tokenizer_config.json
Special tokens file saved in outputs/checkpoint-522/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 4169
  Batch size = 32
Saving model checkpoint to outputs/checkpoint-783
Configuration saved in outputs/checkpoint-783/config.json
Model weights saved in outputs/

early stopping required metric_for_best_model, but did not find eval_loss so early stopping is disabled
loading configuration file config.json from cache at /home/jovyan/.cache/huggingface/hub/models--simecek--DNADebertaK6_Fruitfly/snapshots/3aafc682f807513dadea16f3e8c5838d56f486f9/config.json
Model config DebertaConfig {
  "_name_or_path": "simecek/DNADebertaK6_Fruitfly",
  "architectures": [
    "DebertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-07,
  "max_position_embeddings": 512,
  "max_relative_positions": -1,
  "model_type": "deberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 6,
  "pad_token_id": 0,
  "pooler_dropout": 0,
  "pooler_hidden_act": "gelu",
  "pooler_hidden_size": 768,
  "pos_att_type": null,
  "position_biased_input": true,
  "relative_attention": false,
  "torch_dtype": "float32",
  "tr

Epoch,Training Loss,Validation Loss,Accuracy,F1,Recall,Precision,Rocauc 0 Roc Auc,Rocauc 1 Roc Auc,Pr Auc
1,No log,0.633739,0.65987,0.647614,0.620476,0.677235,0.721929,0.721929,0.725756
2,0.638600,0.557656,0.712161,0.711122,0.703333,0.719085,0.790831,0.790831,0.789988
3,0.638600,0.551359,0.717918,0.749147,0.83619,0.678516,0.805662,0.805662,0.803944
4,0.541800,0.530373,0.730391,0.740894,0.765238,0.718052,0.813472,0.813472,0.812645
5,0.541800,0.537466,0.725834,0.752437,0.827143,0.690107,0.813525,0.813525,0.811698
6,0.505000,0.537762,0.723915,0.745072,0.800952,0.69648,0.815,0.815,0.817355
7,0.505000,0.548366,0.735908,0.73742,0.73619,0.738653,0.815318,0.815318,0.816934
8,0.456100,0.581195,0.720796,0.720327,0.71381,0.726964,0.803522,0.803522,0.805419
9,0.456100,0.593675,0.723435,0.732545,0.751905,0.714156,0.802442,0.802442,0.800004


***** Running Evaluation *****
  Num examples = 4169
  Batch size = 32
Saving model checkpoint to outputs/checkpoint-261
Configuration saved in outputs/checkpoint-261/config.json
Model weights saved in outputs/checkpoint-261/pytorch_model.bin
tokenizer config file saved in outputs/checkpoint-261/tokenizer_config.json
Special tokens file saved in outputs/checkpoint-261/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 4169
  Batch size = 32
Saving model checkpoint to outputs/checkpoint-522
Configuration saved in outputs/checkpoint-522/config.json
Model weights saved in outputs/checkpoint-522/pytorch_model.bin
tokenizer config file saved in outputs/checkpoint-522/tokenizer_config.json
Special tokens file saved in outputs/checkpoint-522/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 4169
  Batch size = 32
Saving model checkpoint to outputs/checkpoint-783
Configuration saved in outputs/checkpoint-783/config.json
Model weights saved in outputs/

early stopping required metric_for_best_model, but did not find eval_loss so early stopping is disabled
loading configuration file config.json from cache at /home/jovyan/.cache/huggingface/hub/models--simecek--DNADebertaK6_Fruitfly/snapshots/3aafc682f807513dadea16f3e8c5838d56f486f9/config.json
Model config DebertaConfig {
  "_name_or_path": "simecek/DNADebertaK6_Fruitfly",
  "architectures": [
    "DebertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-07,
  "max_position_embeddings": 512,
  "max_relative_positions": -1,
  "model_type": "deberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 6,
  "pad_token_id": 0,
  "pooler_dropout": 0,
  "pooler_hidden_act": "gelu",
  "pooler_hidden_size": 768,
  "pos_att_type": null,
  "position_biased_input": true,
  "relative_attention": false,
  "torch_dtype": "float32",
  "tr

Epoch,Training Loss,Validation Loss,Accuracy,F1,Recall,Precision,Rocauc 0 Roc Auc,Rocauc 1 Roc Auc,Pr Auc
1,No log,0.62117,0.665387,0.662309,0.651429,0.67356,0.730785,0.730785,0.738024
2,0.636900,0.595075,0.67858,0.735596,0.887619,0.628032,0.784015,0.784015,0.783512
3,0.636900,0.550493,0.719357,0.707061,0.672381,0.745512,0.798895,0.798895,0.798898
4,0.546300,0.550223,0.716959,0.746998,0.829524,0.679407,0.80944,0.80944,0.806813
5,0.546300,0.586965,0.708083,0.753494,0.885714,0.655622,0.812277,0.812277,0.811467
6,0.506500,0.552828,0.720317,0.694605,0.631429,0.771828,0.813877,0.813877,0.814987
7,0.506500,0.564251,0.721276,0.691614,0.620476,0.781175,0.8152,0.8152,0.816799
8,0.467600,0.613835,0.704485,0.745033,0.857143,0.658858,0.810677,0.810677,0.809389
9,0.467600,0.615047,0.717678,0.744741,0.817619,0.683791,0.80517,0.80517,0.804611


***** Running Evaluation *****
  Num examples = 4169
  Batch size = 32
Saving model checkpoint to outputs/checkpoint-261
Configuration saved in outputs/checkpoint-261/config.json
Model weights saved in outputs/checkpoint-261/pytorch_model.bin
tokenizer config file saved in outputs/checkpoint-261/tokenizer_config.json
Special tokens file saved in outputs/checkpoint-261/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 4169
  Batch size = 32
Saving model checkpoint to outputs/checkpoint-522
Configuration saved in outputs/checkpoint-522/config.json
Model weights saved in outputs/checkpoint-522/pytorch_model.bin
tokenizer config file saved in outputs/checkpoint-522/tokenizer_config.json
Special tokens file saved in outputs/checkpoint-522/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 4169
  Batch size = 32
Saving model checkpoint to outputs/checkpoint-783
Configuration saved in outputs/checkpoint-783/config.json
Model weights saved in outputs/

early stopping required metric_for_best_model, but did not find eval_loss so early stopping is disabled


human_enhancers_ensembl


  0%|          | 0/123872 [00:00<?, ?ex/s]

  0%|          | 0/124 [00:00<?, ?ba/s]

  0%|          | 0/124 [00:00<?, ?ba/s]

loading configuration file config.json from cache at /home/jovyan/.cache/huggingface/hub/models--simecek--DNADebertaK6_Fruitfly/snapshots/3aafc682f807513dadea16f3e8c5838d56f486f9/config.json
Model config DebertaConfig {
  "_name_or_path": "simecek/DNADebertaK6_Fruitfly",
  "architectures": [
    "DebertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-07,
  "max_position_embeddings": 512,
  "max_relative_positions": -1,
  "model_type": "deberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 6,
  "pad_token_id": 0,
  "pooler_dropout": 0,
  "pooler_hidden_act": "gelu",
  "pooler_hidden_size": 768,
  "pos_att_type": null,
  "position_biased_input": true,
  "relative_attention": false,
  "torch_dtype": "float32",
  "transformers_version": "4.24.0",
  "type_vocab_size": 0,
  "vocab_size": 4101
}

loading weights file pyto

Epoch,Training Loss,Validation Loss,Accuracy,F1,Recall,Precision,Rocauc 0 Roc Auc,Rocauc 1 Roc Auc,Pr Auc
0,0.6052,0.5094,0.756095,0.762474,0.778301,0.747278,0.83576,0.83576,0.831953
1,0.4785,0.455495,0.785964,0.784713,0.775519,0.794127,0.869926,0.869926,0.8699
2,0.4338,0.414407,0.80846,0.804954,0.785791,0.825076,0.893502,0.893502,0.895071
3,0.3839,0.390669,0.819493,0.815451,0.792853,0.839375,0.905532,0.905532,0.90843
4,0.3527,0.346433,0.846187,0.850163,0.867537,0.83347,0.927271,0.927271,0.929408


***** Running Evaluation *****
  Num examples = 18581
  Batch size = 32
Saving model checkpoint to outputs/checkpoint-1161
Configuration saved in outputs/checkpoint-1161/config.json
Model weights saved in outputs/checkpoint-1161/pytorch_model.bin
tokenizer config file saved in outputs/checkpoint-1161/tokenizer_config.json
Special tokens file saved in outputs/checkpoint-1161/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 18581
  Batch size = 32
Saving model checkpoint to outputs/checkpoint-2322
Configuration saved in outputs/checkpoint-2322/config.json
Model weights saved in outputs/checkpoint-2322/pytorch_model.bin
tokenizer config file saved in outputs/checkpoint-2322/tokenizer_config.json
Special tokens file saved in outputs/checkpoint-2322/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 18581
  Batch size = 32
Saving model checkpoint to outputs/checkpoint-3483
Configuration saved in outputs/checkpoint-3483/config.json
Model weights sa

## Outputs

In [None]:
# Tabulate the collected results. `outputs` is defined in an earlier cell that
# is not visible here -- presumably one record per training run; confirm
# against the training loop.
outputs_df = pd.DataFrame(data=outputs)

# Bare last expression so the notebook renders the frame as the cell output.
outputs_df

In [None]:
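# Optional per-dataset summary (disabled): mean and standard error of accuracy, F1 and train runtime.
# Assumes outputs_df has 'dataset', 'accuracy', 'f1' and 'train_runtime' columns.
# outputs_df.groupby('dataset').agg({'accuracy' : ['mean', 'sem'], 'f1' : ['mean','sem'], 'train_runtime': ['mean', 'sem']})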

In [None]:
# Persist the results table as CSV at OUTPUT_PATH (set in the parameters cell).
# index=False keeps the DataFrame's row index out of the file.
outputs_df.to_csv(path_or_buf=OUTPUT_PATH, index=False)