In [1]:
# SOURCES
# https://srijithr.gitlab.io/post/comet/
# https://huggingface.co/blog/ray-tune

In [2]:
import os

import comet_ml

# Read the key from the COMET_API_KEY environment variable so the real secret
# never has to be pasted into (and saved with) the notebook. The placeholder is
# only used when the variable is not set.
api_key = os.environ.get("COMET_API_KEY", "YOUR API KEY")
project_name = "CDNA_BERT"
comet_ml.init(project_name=project_name, api_key=api_key)

COMET INFO: Comet API key is valid
COMET INFO: Comet API key saved in /home/jovyan/.comet.config


In [3]:
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForMaskedLM, AutoModelForSequenceClassification, DataCollatorForLanguageModeling, TextDataset
from transformers import DistilBertConfig, DistilBertForMaskedLM
from transformers import TrainingArguments, Trainer

# Load the pretrained tokenizer from the `armheb/DNA_bert_6` checkpoint.
tokenizer = AutoTokenizer.from_pretrained("armheb/DNA_bert_6")

# DistilBERT configuration: 3 hidden layers, vocabulary sized to the tokenizer.
model_config = DistilBertConfig(
    vocab_size=len(tokenizer.vocab),
    max_position_embeddings=512,
    num_hidden_layers=3,
)

# Build the masked-LM model from the config only (no pretrained weights are loaded).
model = DistilBertForMaskedLM(config=model_config)
model.init_weights()

# Collator for masked-language-model training with a masking probability of 0.2.
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=True,
    mlm_probability=0.2,
)


# COMET HYPERPARAM TUNING

In [14]:
def run_train(LEARNING_RATE, EPOCHS, WEIGHT_DECAY, callbacks):
    """Train and evaluate one hyperparameter configuration (the function to optimize).

    Args:
        LEARNING_RATE: Value passed to `TrainingArguments(learning_rate=...)`.
        EPOCHS: Value passed to `TrainingArguments(num_train_epochs=...)`.
        WEIGHT_DECAY: Value passed to `TrainingArguments(weight_decay=...)`.
        callbacks: List of `TrainerCallback` instances handed to the `Trainer`.

    Returns:
        A `(train_loss, eval_loss)` tuple: the training loss reported by
        `trainer.train()` and the `eval_loss` entry of `trainer.evaluate()`.
    """
    training_args = TrainingArguments(
        output_dir='./model',
        overwrite_output_dir=True,
        evaluation_strategy="steps",
        save_strategy="steps",
        learning_rate=LEARNING_RATE,
        weight_decay=WEIGHT_DECAY,
        push_to_hub=False,
        per_device_train_batch_size=64,
        num_train_epochs=EPOCHS,
        save_total_limit=1,
        load_best_model_at_end=True,
        logging_steps=1000,
        save_steps=5000,
        fp16=True,
        # Turn off the Trainer's built-in reporting integrations. Otherwise the
        # stock Comet integration opens its own experiment next to the one the
        # caller supplies through `callbacks`.
        report_to="none",
    )

    # Only 2% of each split is used, for a faster hyperopt demonstration.
    train_dset = load_dataset("simecek/Human_DNA_v0_DNABert6tokenized", split='train[:2%]')
    test_dset = load_dataset("simecek/Human_DNA_v0_DNABert6tokenized", split='test[:2%]')

    # Build a new model from the shared configuration for every call. Training
    # the module-level `model` directly would make each trial continue from the
    # weights left behind by the previous one, so trials would not be comparable.
    trial_model = type(model)(model.config)

    trainer = Trainer(
        model=trial_model,
        args=training_args,
        data_collator=data_collator,
        train_dataset=train_dset,
        eval_dataset=test_dset,
        callbacks=callbacks,
    )
    train_loss = trainer.train().training_loss
    eval_loss = trainer.evaluate()['eval_loss']
    return train_loss, eval_loss

In [6]:
from comet_ml import Optimizer, ExistingExperiment
# Documentation on how to set up the optimizer: https://www.comet.ml/docs/python-sdk/introduction-optimizer/

# Search definition handed to the Comet Optimizer.
config = {
    # Search strategy: the Bayes algorithm.
    "algorithm": "bayes",

    # Hyperparameter search space, in Comet's Vizier-inspired format.
    "parameters": {
        "LEARNING_RATE": {
            "type": "discrete",
            "values": [5e-5, 5e-4, 5e-3, 5e-2],
        },
        "WEIGHT_DECAY": {
            "type": "float",
            "min": 0.0,
            "max": 0.5,
            "scalingType": "uniform",
        },
        "EPOCHS": {
            "type": "discrete",
            "values": [1, 2, 3],
        },
    },

    # Objective: minimize the metric logged under the name "valid_loss".
    "spec": {
        "metric": "valid_loss",
        "objective": "minimize",
        "seed": 1,
    },
}
opt = Optimizer(config, project_name=project_name)

COMET INFO: COMET_OPTIMIZER_ID=0b9aa2a6df384f95ab93e457ee70a6d8
COMET INFO: Using optimizer config: {'algorithm': 'bayes', 'configSpaceSize': 'infinite', 'endTime': None, 'id': '0b9aa2a6df384f95ab93e457ee70a6d8', 'lastUpdateTime': None, 'maxCombo': 0, 'name': '0b9aa2a6df384f95ab93e457ee70a6d8', 'parameters': {'EPOCHS': {'type': 'discrete', 'values': [1, 2, 3]}, 'LEARNING_RATE': {'type': 'discrete', 'values': [5e-05, 0.0005, 0.005, 0.05]}, 'WEIGHT_DECAY': {'max': 0.5, 'min': 0.0, 'scalingType': 'uniform', 'type': 'float'}}, 'predictor': None, 'spec': {'gridSize': 10, 'maxCombo': 0, 'metric': 'valid_loss', 'minSampleSize': 100, 'objective': 'minimize', 'retryAssignLimit': 0, 'retryLimit': 1000, 'seed': 1}, 'startTime': 44497338185, 'state': {'mode': None, 'seed': None, 'sequence': [], 'sequence_i': 0, 'sequence_pid': None, 'sequence_retry': 0, 'sequence_retry_count': 0}, 'status': 'running', 'suggestion_count': 0, 'trials': 1, 'version': '2.0.1'}


In [20]:
import functools
import importlib.util
import json
import numbers
import os
import sys
import tempfile
from pathlib import Path
from transformers import TrainerCallback

from transformers.utils import logging

logger = logging.get_logger(__name__)

# Comet is usable only when the `comet_ml` package can be located and the
# COMET_MODE environment variable is not set to "DISABLED" (case-insensitive).
_has_comet = not (
    importlib.util.find_spec("comet_ml") is None
    or os.getenv("COMET_MODE", "").upper() == "DISABLED"
)

class CustomCometCallback(TrainerCallback):
    """
    A [`TrainerCallback`] that sends the logs to [Comet ML](https://www.comet.ml/site/).

    The experiment to log into is supplied by the caller (e.g. one yielded by a Comet `Optimizer`) instead of
    being created by the callback, so the training logs end up in that same experiment.

    Args:
        experiment: The Comet experiment used when `COMET_MODE` is "ONLINE" (the default).
    """

    def __init__(self, experiment):
        if not _has_comet:
            raise RuntimeError("CometCallback requires comet-ml to be installed. Run `pip install comet-ml`.")
        self._initialized = False
        self._log_assets = False
        self.experiment = experiment
        # Experiment selected by `setup`; stays None until `setup` has picked one.
        self._active_experiment = None

    def _get_experiment(self):
        """Return the experiment selected in `setup`, or the global Comet experiment if none was selected."""
        if self._active_experiment is not None:
            return self._active_experiment
        # Fallback for the cases where `setup` selected no experiment (e.g. an unrecognised COMET_MODE).
        return comet_ml.config.get_global_experiment()

    def setup(self, args, state, model):
        """
        Setup the optional Comet.ml integration.
        Environment:
            COMET_MODE (`str`, *optional*):
                Whether to log to the experiment passed to the constructor ("ONLINE"), to a newly created offline
                experiment ("OFFLINE"), or to disable Comet logging ("DISABLED"). Defaults to "ONLINE".
            COMET_PROJECT_NAME (`str`, *optional*):
                Comet project name used for the offline experiment. Defaults to "huggingface".
            COMET_OFFLINE_DIRECTORY (`str`, *optional*):
                Folder to use for saving offline experiments when `COMET_MODE` is "OFFLINE". Defaults to "./".
            COMET_LOG_ASSETS (`str`, *optional*):
                Whether or not to log training assets (the content of `args.output_dir`) to Comet. Enabled by
                "TRUE" or "1". Defaults to "FALSE".
        For a number of configurable items in the environment, see
        [here](https://www.comet.ml/docs/python-sdk/advanced/#comet-configuration-variables).
        """
        self._initialized = True
        log_assets = os.getenv("COMET_LOG_ASSETS", "FALSE").upper()
        if log_assets in {"TRUE", "1"}:
            self._log_assets = True
        if state.is_world_process_zero:
            comet_mode = os.getenv("COMET_MODE", "ONLINE").upper()
            experiment = None
            if comet_mode == "ONLINE":
                # Reuse the caller's experiment rather than creating a new one.
                experiment = self.experiment
                experiment.log_other("Created from", "transformers")
                logger.info("Automatic Comet.ml online logging enabled")
            elif comet_mode == "OFFLINE":
                experiment_kwargs = {
                    "project_name": os.getenv("COMET_PROJECT_NAME", "huggingface"),
                    "offline_directory": os.getenv("COMET_OFFLINE_DIRECTORY", "./"),
                }
                experiment = comet_ml.OfflineExperiment(**experiment_kwargs)
                experiment.log_other("Created from", "transformers")
                logger.info("Automatic Comet.ml offline logging enabled; use `comet upload` when finished")
            # Remember the choice so that on_log / on_train_end write to this very experiment.
            self._active_experiment = experiment
            if experiment is not None:
                experiment._set_model_graph(model, framework="transformers")
                experiment._log_parameters(args, prefix="args/", framework="transformers")
                if hasattr(model, "config"):
                    experiment._log_parameters(model.config, prefix="config/", framework="transformers")

    def on_train_begin(self, args, state, control, model=None, **kwargs):
        """Run `setup` once, on the first training start."""
        if not self._initialized:
            self.setup(args, state, model)

    def on_log(self, args, state, control, model=None, logs=None, **kwargs):
        """Forward the Trainer's `logs` to the experiment (main process only)."""
        if not self._initialized:
            self.setup(args, state, model)
        if state.is_world_process_zero:
            experiment = self._get_experiment()
            if experiment is not None:
                experiment._log_metrics(logs, step=state.global_step, epoch=state.epoch, framework="transformers")

    def on_train_end(self, args, state, control, **kwargs):
        """Optionally upload `args.output_dir`; the experiment is left open for the caller to end."""
        if self._initialized and state.is_world_process_zero:
            experiment = self._get_experiment()
            if experiment is not None:
                if self._log_assets is True:
                    logger.info("Logging checkpoints. This may take time.")
                    experiment.log_asset_folder(
                        args.output_dir, recursive=True, log_file_name=True, step=state.global_step
                    )
                # experiment.end() is intentionally not called here: the caller owns the experiment.
                
                
# cb = CustomCometCallback()

In [21]:
# Run one training per hyperparameter suggestion produced by the Comet optimizer.
for comet_experiment in opt.get_experiments():
    # TODO: HF creates its own experiment, which is a mess, see
    #       https://github.com/huggingface/transformers/blob/v4.20.0/src/transformers/integrations.py#L664
    # TODO: logs go into another experiment
    # TODO: set trainer do_eval to True
    comet_experiment.add_tag('hyperopt experiment')

    try:
        train_loss, eval_loss = run_train(
            LEARNING_RATE=comet_experiment.get_parameter('LEARNING_RATE'),
            EPOCHS=comet_experiment.get_parameter('EPOCHS'),
            WEIGHT_DECAY=comet_experiment.get_parameter('WEIGHT_DECAY'),
            callbacks=[CustomCometCallback(comet_experiment)],
            # Add your hyperparams
        )

        comet_experiment.log_metric('train_loss', train_loss)
        # 'valid_loss' is the metric name the optimizer config asks to minimize.
        comet_experiment.log_metric('valid_loss', eval_loss)
    finally:
        # Always close the experiment, even when training fails or is interrupted,
        # so that no experiment is left open.
        comet_experiment.end()
    

COMET INFO: ---------------------------
COMET INFO: Comet.ml Experiment Summary
COMET INFO: ---------------------------
COMET INFO:   Data:
COMET INFO:     display_summary_level : 1
COMET INFO:     url                   : https://www.comet.ml/vlasta/cdna-bert/3e31881da9c34f68b4dc07b4810c70f6
COMET INFO:   Others:
COMET INFO:     Created from : transformers
COMET INFO:   Parameters:
COMET INFO:     args/_n_gpu                             : 1
COMET INFO:     args/_no_sync_in_gradient_accumulation  : True
COMET INFO:     args/_setup_devices                     : cuda:0
COMET INFO:     args/adafactor                          : False
COMET INFO:     args/adam_beta1                         : 0.9
COMET INFO:     args/adam_beta2                         : 0.999
COMET INFO:     args/adam_epsilon                       : 1e-08
COMET INFO:     args/auto_find_batch_size               : False
COMET INFO:     args/bf16                               : False
COMET INFO:     args/bf16_full_eval          

Step,Training Loss,Validation Loss




Training completed. Do not forget to share your model on huggingface.co/models =)


COMET INFO: ---------------------------
COMET INFO: Comet.ml Experiment Summary
COMET INFO: ---------------------------
COMET INFO:   Data:
COMET INFO:     display_summary_level : 1
COMET INFO:     url                   : https://www.comet.ml/vlasta/cdna-bert/8d4a78941d09487fbc42f3234c5789a4
COMET INFO:   Metrics [count] (min, max):
COMET INFO:     epoch                    : 1.0
COMET INFO:     loss [27]                : (7.985511302947998, 8.048103332519531)
COMET INFO:     total_flos               : 1153441439769600.0
COMET INFO:     train_loss               : 8.020317219447026
COMET INFO:     train_runtime            : 69.7678
COMET INFO:     train_samples_per_second : 246.174
COMET INFO:     train_steps_per_second   : 3.856
COMET INFO:   Others:
COMET INFO:     Created from : transformers
COMET INFO:   Parameters:
COMET INFO:     args/_n_gpu                             : 1
COMET INFO:     args/_no

COMET INFO: Experiment is live on comet.ml https://www.comet.ml/vlasta/cdna-bert/638264e17cf74da4a965ec7b487ae3df

using `logging_steps` to initialize `eval_steps` to 1000
PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
Using custom data configuration simecek--Human_DNA_v0_DNABert6tokenized-9a684042f2db6cd1
Reusing dataset parquet (/home/jovyan/.cache/huggingface/datasets/simecek___parquet/simecek--Human_DNA_v0_DNABert6tokenized-9a684042f2db6cd1/0.0.0/0b6d5799bb726b24ad7fc7be720c170d8e497f575d02d47537de9a5bac074901)
Using custom data configuration simecek--Human_DNA_v0_DNABert6tokenized-9a684042f2db6cd1
Reusing dataset parquet (/home/jovyan/.cache/huggingface/datasets/simecek___parquet/simecek--Human_DNA_v0_DNABert6tokenized-9a684042f2db6cd1/

Step,Training Loss,Validation Loss


KeyboardInterrupt: 

# HF HYPERPARAM TUNING

In [None]:
def model_init():
    """Return a new model built from `model_config`.

    `Trainer.hyperparameter_search` needs a `model_init` callable so that it can
    re-create the model at the start of every trial.
    """
    return DistilBertForMaskedLM(config=model_config)


# learning_rate / weight_decay / num_train_epochs are not set here: the names
# LEARNING_RATE, WEIGHT_DECAY and EPOCHS only exist as parameters of `run_train`,
# so using them at notebook scope raises NameError.
# NOTE(review): the search is expected to supply per-trial values from its search
# space; pass `hp_space=...` to `hyperparameter_search` to control which
# hyperparameters are tuned and over which ranges.
training_args = TrainingArguments(
    output_dir='./model',
    overwrite_output_dir=True,
    evaluation_strategy="steps",
    save_strategy="steps",
    push_to_hub=False,
    per_device_train_batch_size=64,
    save_total_limit=1,
    load_best_model_at_end=True,
    logging_steps=1000,
    save_steps=5000,
    fp16=True,
)

# Only 2% of each split is used, for a faster hyperopt demonstration.
train_dset = load_dataset("simecek/Human_DNA_v0_DNABert6tokenized", split='train[:2%]')
test_dset = load_dataset("simecek/Human_DNA_v0_DNABert6tokenized", split='test[:2%]')

trainer = Trainer(
    model_init=model_init,  # hyperparameter_search requires model_init instead of model
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_dset,
    eval_dataset=test_dset,
)

trainer.hyperparameter_search(
    # No compute_metrics is configured, so the default objective is the
    # evaluation loss, which has to be minimized.
    direction="minimize",
    backend="ray",
    n_trials=10,  # number of trials
)
