In [None]:
!pip install transformers datasets comet-ml

In [2]:
# Run parameters -- edit these to repeat the experiment for another organism.
PARAM_ORG = "Fruitfly"  # organism label; used as the prefix of the Hub model id
PARAM_DATASET = "davidcechak/Fruitfly_DNA_v0_DNABert6tokenized"  # dataset id given to load_dataset
STEPS = 25_000  # training step budget

In [1]:
import os
from getpass import getpass

# Never store the Comet API key in the notebook: take it from the environment
# and, if it is not set there, ask for it interactively (input is not echoed).
if not os.environ.get("COMET_API_KEY"):
    os.environ["COMET_API_KEY"] = getpass("Comet API key: ")

import comet_ml

# Comet init
comet_ml.init(
    project_name="Experiment_DNA_Organisms",
    api_key=os.environ["COMET_API_KEY"],
)

COMET INFO: Comet API key is valid
COMET INFO: Comet API key saved in /root/.comet.config


In [3]:
import torch

# Show which CUDA device (index 0) this runtime provides.
torch.cuda.get_device_name(device=0)

'Tesla V100-SXM2-16GB'

In [4]:
from datasets import load_dataset

# Fetch the pre-tokenized dataset named in the parameters cell.
datasets = load_dataset(path=PARAM_DATASET)
# Have every split return torch tensors instead of Python lists.
datasets.set_format(type="torch")
datasets

Downloading:   0%|          | 0.00/1.05k [00:00<?, ?B/s]

Using custom data configuration davidcechak--Fruitfly_DNA_v0_DNABert6tokenized-fafb8f7c130558e6


Downloading and preparing dataset None/None (download: 35.25 MiB, generated: 130.09 MiB, post-processed: Unknown size, total: 165.34 MiB) to /root/.cache/huggingface/datasets/davidcechak___parquet/davidcechak--Fruitfly_DNA_v0_DNABert6tokenized-fafb8f7c130558e6/0.0.0/0b6d5799bb726b24ad7fc7be720c170d8e497f575d02d47537de9a5bac074901...


Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/33.3M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/3.71M [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]

Dataset parquet downloaded and prepared to /root/.cache/huggingface/datasets/davidcechak___parquet/davidcechak--Fruitfly_DNA_v0_DNABert6tokenized-fafb8f7c130558e6/0.0.0/0b6d5799bb726b24ad7fc7be720c170d8e497f575d02d47537de9a5bac074901. Subsequent calls will reuse this data.


  0%|          | 0/2 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 39807
    })
    test: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 4423
    })
})

In [5]:
from transformers import (
    AutoTokenizer,
    AutoModelForMaskedLM,
    AutoModelForSequenceClassification,
    DataCollatorForLanguageModeling,
)

# Tokenizer taken from the armheb/DNA_bert_6 checkpoint.
tokenizer = AutoTokenizer.from_pretrained("armheb/DNA_bert_6")

# Sanity check: display the tokens behind the first six vocabulary ids.
tokenizer.decode(list(range(6)))

Downloading:   0%|          | 0.00/40.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.08k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/28.0k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/112 [00:00<?, ?B/s]

'[PAD] [UNK] [CLS] [SEP] [MASK] AAAAAA'

## 1) Training

In [6]:
from transformers import DebertaConfig, DebertaForMaskedLM, TrainingArguments, Trainer

# DeBERTa configuration: 6 hidden layers, 512 positions, vocabulary sized to
# the tokenizer; every other field keeps the library default.
model_config = DebertaConfig(
    num_hidden_layers=6,
    max_position_embeddings=512,
    vocab_size=len(tokenizer.vocab),
)
model_config

DebertaConfig {
  "attention_probs_dropout_prob": 0.1,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-07,
  "max_position_embeddings": 512,
  "max_relative_positions": -1,
  "model_type": "deberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 6,
  "pad_token_id": 0,
  "pooler_dropout": 0,
  "pooler_hidden_act": "gelu",
  "pooler_hidden_size": 768,
  "pos_att_type": null,
  "position_biased_input": true,
  "relative_attention": false,
  "transformers_version": "4.19.3",
  "type_vocab_size": 0,
  "vocab_size": 4101
}

In [7]:
# Masked-language-modelling collator that masks with probability 0.2.
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=True,
    mlm_probability=0.2,
)

# Freshly initialised (not pretrained) model built from the config above.
model = DebertaForMaskedLM(config=model_config)

# Parameter count in whole millions.
sum(param.numel() for param in model.parameters()) // 1_000_000

46

In [8]:
# Log in to the Hugging Face Hub from inside the notebook; the training
# arguments further down set push_to_hub=True, so credentials are needed.
from huggingface_hub import notebook_login

notebook_login()

Login successful
Your token has been saved to /root/.huggingface/token
[1m[31mAuthenticated through git-credential store but this isn't the helper defined on your machine.
You might have to re-authenticate when pushing to the Hugging Face Hub. Run the following command in your terminal in case you want to set this credential helper as the default

git config --global credential.helper store[0m


In [9]:
training_args = TrainingArguments(
    output_dir="./model",             # directory where checkpoints are written
    overwrite_output_dir=True,
    max_steps=STEPS,                  # step budget from the parameters cell (overrides num_train_epochs)
    per_device_train_batch_size=32,   # training batch size; raise it as far as GPU memory allows
    gradient_accumulation_steps=2,    # accumulate 2 batches per update -> effective batch of 64
    per_device_eval_batch_size=32,    # evaluation batch size
    evaluation_strategy="steps",      # evaluate every `eval_steps` steps
    eval_steps=5000,                  # explicit; previously inherited implicitly from logging_steps
    logging_steps=5000,               # log the training loss every 5000 steps
    save_steps=5000,                  # checkpoint every 5000 steps (kept equal to eval_steps for load_best_model_at_end)
    fp16=True,                        # mixed-precision training
    load_best_model_at_end=True,      # reload the checkpoint with the best eval loss when training ends
    save_total_limit=3,               # keep at most 3 checkpoints on disk
    push_to_hub=True,
    hub_model_id=f"{PARAM_ORG}DNADeberta",
    hub_strategy="every_save",        # push to the Hub at every checkpoint save
)

In [10]:
# Wire model, arguments, MLM collator and the two dataset splits together.
trainer = Trainer(
    args=training_args,
    model=model,
    train_dataset=datasets["train"],
    eval_dataset=datasets["test"],
    data_collator=data_collator,
)

Cloning https://huggingface.co/simecek/FruitflyDNADeberta into local empty directory.
max_steps is given, it will override any value given in num_train_epochs
Using amp half precision backend


In [11]:
# Run training; the returned TrainOutput is shown as the cell result.
train_output = trainer.train()
train_output

***** Running training *****
  Num examples = 39807
  Num Epochs = 41
  Instantaneous batch size per device = 32
  Total train batch size (w. parallel, distributed & accumulation) = 64
  Gradient Accumulation steps = 2
  Total optimization steps = 25000
COMET ERROR: Failed to calculate active processors count. Fall back to default CPU count 1
COMET INFO: Couldn't find a Git repository in '/content' nor in any parent directory. You can override where Comet is looking for a Git Patch by setting the configuration `COMET_GIT_DIRECTORY`
COMET INFO: Experiment is live on comet.ml https://www.comet.ml/simecek/experiment-dna-organisms/4296283634204a97922d9de33f606633

Automatic Comet.ml online logging enabled


Step,Training Loss,Validation Loss
5000,7.4963,7.470438
10000,7.4505,7.452871
15000,7.4399,7.44567
20000,7.3946,7.353472
25000,7.3354,7.337749


***** Running Evaluation *****
  Num examples = 4423
  Batch size = 32
Saving model checkpoint to ./model/checkpoint-5000
Configuration saved in ./model/checkpoint-5000/config.json
Model weights saved in ./model/checkpoint-5000/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 4423
  Batch size = 32
Saving model checkpoint to ./model/checkpoint-10000
Configuration saved in ./model/checkpoint-10000/config.json
Model weights saved in ./model/checkpoint-10000/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 4423
  Batch size = 32
Saving model checkpoint to ./model/checkpoint-15000
Configuration saved in ./model/checkpoint-15000/config.json
Model weights saved in ./model/checkpoint-15000/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 4423
  Batch size = 32
Saving model checkpoint to ./model/checkpoint-20000
Configuration saved in ./model/checkpoint-20000/config.json
Model weights saved in ./model/checkpoint-20000/pytorch_model.bin
Deletin

TrainOutput(global_step=25000, training_loss=7.4233246875, metrics={'train_runtime': 14543.4421, 'train_samples_per_second': 110.015, 'train_steps_per_second': 1.719, 'total_flos': 2.1194003737939968e+17, 'train_loss': 7.4233246875, 'epoch': 40.19})

In [None]:
# Ask PyTorch to empty its CUDA memory cache now that training has finished.
import torch 
torch.cuda.empty_cache() 