In [2]:
# PARAMETERS

DATASET = "Vlasta/Human_DNA_v0_SentencepieceTokenized_vocab10k"
HF_MODEL_NAME = "DNADebertaSentencepiece10k"

LR = 5e-05
WD = 0.1

BATCH_SIZE = 16
ACCUMULATION = 4
HIDDEN_LAYERS = 6
MLM_P = 0.15
EPOCHS = 15

## 0) Loading

In [3]:
from huggingface_hub import notebook_login

notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [4]:
import os

os.environ['COMET_API_KEY'] = "uckBYzSuTQ0uUKVA42536dD7Q"

import comet_ml

# Commet Init
comet_ml.init(project_name="Training_20220728", api_key="uckBYzSuTQ0uUKVA42536dD7Q")

COMET INFO: Comet API key is valid
COMET INFO: Comet API key saved in /home/jovyan/.comet.config


In [5]:
from datasets import load_dataset
datasets = load_dataset(DATASET)
datasets.set_format("torch")
datasets

Using custom data configuration Vlasta--Human_DNA_v0_SentencepieceTokenized_vocab10k-440a782332c6860b
Reusing dataset parquet (/home/jovyan/.cache/huggingface/datasets/Vlasta___parquet/Vlasta--Human_DNA_v0_SentencepieceTokenized_vocab10k-440a782332c6860b/0.0.0/0b6d5799bb726b24ad7fc7be720c170d8e497f575d02d47537de9a5bac074901)


  0%|          | 0/2 [00:00<?, ?it/s]

DatasetDict({
    test: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 98416
    })
    train: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 885806
    })
})

In [6]:
from transformers import AutoTokenizer, AutoModelForMaskedLM

tokenizer = AutoTokenizer.from_pretrained("Vlasta/DNA_Sentencepiece_vocab_10000_max_tokenlen_45")

## 1) Training

In [7]:
from transformers import DebertaConfig, DebertaForMaskedLM, TrainingArguments, Trainer, DataCollatorForLanguageModeling

model_config = DebertaConfig(vocab_size=len(tokenizer.vocab), max_position_embeddings=512, num_hidden_layers=HIDDEN_LAYERS)
model_config

DebertaConfig {
  "attention_probs_dropout_prob": 0.1,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-07,
  "max_position_embeddings": 512,
  "max_relative_positions": -1,
  "model_type": "deberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 6,
  "pad_token_id": 0,
  "pooler_dropout": 0,
  "pooler_hidden_act": "gelu",
  "pooler_hidden_size": 768,
  "pos_att_type": null,
  "position_biased_input": true,
  "relative_attention": false,
  "transformers_version": "4.19.2",
  "type_vocab_size": 0,
  "vocab_size": 10000
}

In [8]:
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=True, mlm_probability=MLM_P)

model = DebertaForMaskedLM(config=model_config)
sum(p.numel() for p in model.parameters()) // 10**6

51

In [9]:
tokenizer.set_truncation_and_padding

<bound method PreTrainedTokenizerFast.set_truncation_and_padding of PreTrainedTokenizerFast(name_or_path='Vlasta/DNA_Sentencepiece_vocab_10000_max_tokenlen_45', vocab_size=9999, model_max_len=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'})>

In [10]:
training_args = TrainingArguments(
    output_dir='./model',          # output directory to where save model checkpoint
    evaluation_strategy="steps",    # evaluate each `logging_steps` steps
    overwrite_output_dir=True,      
    num_train_epochs=EPOCHS,            # number of training epochs, feel free to tweak
    per_device_train_batch_size=BATCH_SIZE, # the training batch size, put it as high as your GPU memory fits
    gradient_accumulation_steps=ACCUMULATION,  # accumulating the gradients before updating the weights
    per_device_eval_batch_size=BATCH_SIZE,  # evaluation batch size
    logging_steps=5000,             # evaluate, log and save model checkpoints every 1000 step
    save_steps=5000,
    fp16=True,
    load_best_model_at_end=True,  # whether to load the best model (in terms of loss) at the end of training
    save_total_limit=1,           # whether you don't have much space so you let only 5 model weights saved in the disk
    push_to_hub=True,
    hub_model_id=HF_MODEL_NAME,
    hub_strategy="every_save",
    learning_rate=LR,
    weight_decay=WD
)

In [11]:
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=datasets['train'],
    eval_dataset=datasets['test'],
)

Cloning https://huggingface.co/Vlasta/DNADebertaSentencepiece10k into local empty directory.
Using amp half precision backend


In [None]:
trainer.train()

***** Running training *****
  Num examples = 885806
  Num Epochs = 15
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 64
  Gradient Accumulation steps = 4
  Total optimization steps = 207600
COMET INFO: Experiment is live on comet.ml https://www.comet.com/simecek/training-20220728/d8584313bb964aa9a40d50417f83bdc7

Automatic Comet.ml online logging enabled


Step,Training Loss,Validation Loss
5000,7.1504,7.060384


***** Running Evaluation *****
  Num examples = 98416
  Batch size = 16
Saving model checkpoint to ./model/checkpoint-5000
Configuration saved in ./model/checkpoint-5000/config.json
Model weights saved in ./model/checkpoint-5000/pytorch_model.bin


In [None]:
trainer.evaluate()

In [None]:
trainer.push_to_hub()