In [1]:
# PARAMETERS
# Every tunable value of this run is defined in this one cell.

# --- data and naming ---
K = 8                                                   # k-mer length (vocabulary is built from all 4**K k-mers)
DATASET = "simecek/Human_DNA_v0_K8tokenized_stride1"    # Hugging Face dataset id that gets loaded
HF_MODEL_NAME = "DNADebertaK8"                          # Hub model id the checkpoints are pushed to

# --- model and masking ---
HIDDEN_LAYERS = 6       # number of hidden layers in the DeBERTa config
MLM_P = 0.15            # masking probability handed to the collator

# --- optimisation ---
LR = 5e-5               # learning rate
WD = 0.1                # weight decay
EPOCHS = 15             # number of training epochs
BATCH_SIZE = 16         # per-device batch size (train and eval)
ACCUMULATION = 4        # gradient accumulation steps per optimizer update

# The custom collator script has to be present next to this notebook;
# uncomment the line below to download it.
#!wget http://raw.githubusercontent.com/ML-Bioinfo-CEITEC/cDNA-pretraining/main/experiments/custom_masking/custom_collator.py

## 0) Loading

In [10]:
# Log in to the Hugging Face Hub from inside the notebook (renders a login widget).
# NOTE(review): the execution count of this cell (In [10]) is higher than that of
# the training cells below, i.e. it was last run out of order. On a fresh kernel
# run it here, before the Trainer is created, since training pushes to the Hub.
from huggingface_hub import notebook_login

notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [2]:
import os
from getpass import getpass

# The Comet API key is a secret and must never be written into the notebook.
# Provide it through the COMET_API_KEY environment variable; when it is not
# set, ask for it interactively (getpass does not echo or store the value in
# the notebook).
# NOTE(review): a key was previously hardcoded in this cell -- treat that key
# as compromised and rotate it in the Comet account settings.
if not os.environ.get("COMET_API_KEY"):
    os.environ["COMET_API_KEY"] = getpass("Comet API key: ")

# comet_ml is imported only after the key is in the environment,
# keeping the original ordering of this cell.
import comet_ml

# Comet init
comet_ml.init(project_name="Training_20220701", api_key=os.environ["COMET_API_KEY"])

COMET INFO: Comet API key is valid
COMET INFO: Comet API key saved in /home/jovyan/.comet.config


In [3]:
from datasets import load_dataset

# Load the dataset named in the parameters cell; the displayed result is a
# DatasetDict with `train` and `test` splits.
datasets = load_dataset(path=DATASET)

# Switch the output format in place so that indexing yields torch tensors.
datasets.set_format(type="torch")

# Last expression: show the split names, features and row counts.
datasets

Using custom data configuration simecek--Human_DNA_v0_K8tokenized_stride1-d827c651d3e73e53
Reusing dataset parquet (/home/jovyan/.cache/huggingface/datasets/simecek___parquet/simecek--Human_DNA_v0_K8tokenized_stride1-d827c651d3e73e53/0.0.0/0b6d5799bb726b24ad7fc7be720c170d8e497f575d02d47537de9a5bac074901)


  0%|          | 0/2 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 5146887
    })
    test: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 571887
    })
})

In [4]:
from transformers import AutoTokenizer, AutoModelForMaskedLM
from itertools import product

# Start from the pretrained DNA_bert_6 tokenizer and extend it with K-mers.
tokenizer = AutoTokenizer.from_pretrained("armheb/DNA_bert_6")

# Enumerate every K-mer over the nucleotide alphabet (4**K strings),
# in the order produced by itertools.product.
alphabet = ('A', 'C', 'T', 'G')
vocab = [''.join(kmer) for kmer in product(alphabet, repeat=K)]

# Last expression: add_tokens returns how many tokens were newly added.
tokenizer.add_tokens(vocab)

65536

## 1) Training

In [5]:
from transformers import DebertaConfig, DebertaForMaskedLM, TrainingArguments, Trainer

# Size the embedding matrix with len(tokenizer): it is the documented size of
# the full vocabulary *including* tokens added via add_tokens, for both slow
# and fast tokenizers. `tokenizer.vocab` only includes added tokens on fast
# tokenizers, so relying on it could silently produce a too-small vocab_size.
model_config = DebertaConfig(
    vocab_size=len(tokenizer),
    max_position_embeddings=512,        # maximum sequence length the model supports
    num_hidden_layers=HIDDEN_LAYERS,
)

# Last expression: display the resulting configuration.
model_config

DebertaConfig {
  "attention_probs_dropout_prob": 0.1,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-07,
  "max_position_embeddings": 512,
  "max_relative_positions": -1,
  "model_type": "deberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 6,
  "pad_token_id": 0,
  "pooler_dropout": 0,
  "pooler_hidden_act": "gelu",
  "pooler_hidden_size": 768,
  "pos_att_type": null,
  "position_biased_input": true,
  "relative_attention": false,
  "transformers_version": "4.19.2",
  "type_vocab_size": 0,
  "vocab_size": 69637
}

In [6]:
from custom_collator import WideCollator

# Masking collator from the downloaded custom_collator.py script.
# NOTE(review): presumably `area=K` makes the mask cover K neighbouring tokens
# (overlapping k-mers at stride 1) -- confirm against custom_collator.py.
data_collator = WideCollator(
    area=K,
    tokenizer=tokenizer,
    mlm=True,
    mlm_probability=MLM_P,
    mask_fully=True,
)

# Freshly initialised (not pretrained) DeBERTa masked-LM built from the config.
model = DebertaForMaskedLM(config=model_config)

# Last expression: parameter count in whole millions.
sum(param.numel() for param in model.parameters()) // 1_000_000

97

In [7]:
# One interval drives logging, evaluation and checkpointing.
# load_best_model_at_end requires the save interval to be a multiple of the
# evaluation interval, so all three are tied to the same constant.
STEPS_PER_CHECKPOINT = 20000

training_args = TrainingArguments(
    output_dir='./model',                       # directory where model checkpoints are written
    overwrite_output_dir=True,
    num_train_epochs=EPOCHS,                    # number of training epochs
    per_device_train_batch_size=BATCH_SIZE,     # training batch size per device
    gradient_accumulation_steps=ACCUMULATION,   # batches accumulated before each weight update
    per_device_eval_batch_size=BATCH_SIZE,      # evaluation batch size per device
    evaluation_strategy="steps",                # evaluate every `eval_steps` optimizer steps
    eval_steps=STEPS_PER_CHECKPOINT,            # explicit; previously inherited from logging_steps
    logging_steps=STEPS_PER_CHECKPOINT,         # log the training loss every 20000 steps
    save_steps=STEPS_PER_CHECKPOINT,            # save a checkpoint every 20000 steps
    fp16=True,                                  # mixed-precision training
    load_best_model_at_end=True,                # reload the best checkpoint (by loss) when training ends
    save_total_limit=1,                         # cap on the number of checkpoints kept on disk
    push_to_hub=True,
    hub_model_id=HF_MODEL_NAME,
    hub_strategy="every_save",                  # push to the Hub on every checkpoint save
    learning_rate=LR,
    weight_decay=WD
)

In [8]:
# Build the Trainer from the model, arguments, collator and dataset splits.
# The tokenizer was extended with 4**K k-mer tokens in this notebook, so it is
# passed in as well: the Trainer then saves it next to every checkpoint (and
# pushes it to the Hub), which keeps the published model usable on its own.
# Batching is unaffected because an explicit data_collator is supplied.
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=datasets['train'],
    eval_dataset=datasets['test'],
    tokenizer=tokenizer,
)

Cloning https://huggingface.co/simecek/DNADebertaK8 into local empty directory.
Using amp half precision backend


In [9]:
# Start the training run. The recorded run below was stopped manually
# (KeyboardInterrupt), so the output holds no evaluation table.
trainer.train()

***** Running training *****
  Num examples = 5146887
  Num Epochs = 15
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 64
  Gradient Accumulation steps = 4
  Total optimization steps = 1206300
COMET INFO: Couldn't find a Git repository in '/home/jovyan' nor in any parent directory. You can override where Comet is looking for a Git Patch by setting the configuration `COMET_GIT_DIRECTORY`
COMET INFO: Experiment is live on comet.ml https://www.comet.ml/simecek/training-20220701/9e7e7d58925c43718c868127bdda5ffe

Automatic Comet.ml online logging enabled


Step,Training Loss,Validation Loss


KeyboardInterrupt: 

COMET INFO: ---------------------------
COMET INFO: Comet.ml Experiment Summary
COMET INFO: ---------------------------
COMET INFO:   Data:
COMET INFO:     display_summary_level : 1
COMET INFO:     url                   : https://www.comet.ml/simecek/training-20220701/9e7e7d58925c43718c868127bdda5ffe
COMET INFO:   Metrics [count] (min, max):
COMET INFO:     loss [19] : (2.73148512840271, 2.8329038619995117)
COMET INFO:   Others:
COMET INFO:     Created from : transformers
COMET INFO:   Parameters:
COMET INFO:     args/_n_gpu                             : 1
COMET INFO:     args/_no_sync_in_gradient_accumulation  : True
COMET INFO:     args/_setup_devices                     : cuda:0
COMET INFO:     args/adafactor                          : False
COMET INFO:     args/adam_beta1                         : 0.9
COMET INFO:     args/adam_beta2                         : 0.999
COMET INFO:     args/adam_epsilon                       : 1e-08
COMET INFO:     args/auto_find_batch_size              

In [11]:
# Leftover utility cell: torch is imported only for the optional call below.
import torch 
# Uncomment to release cached, unused GPU memory held by PyTorch's allocator
# (e.g. after interrupting training).
#torch.cuda.empty_cache() 