# SETUP

In [1]:
import comet_ml
import os

# Credentials must not live in the notebook: the Comet API key is read from the
# COMET_API_KEY environment variable. When the variable is unset the previous
# empty-string value is passed on unchanged.
api_key = os.environ.get("COMET_API_KEY", "")
project_name = "CDNA_BERT"
comet_ml.init(project_name=project_name, api_key=api_key)

COMET INFO: Comet API key is valid
COMET INFO: Comet API key saved in /home/jovyan/.comet.config


In [2]:
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForMaskedLM, AutoModelForSequenceClassification, DataCollatorForLanguageModeling, TextDataset
from transformers import DistilBertConfig, DistilBertForMaskedLM
from transformers import TrainingArguments, Trainer
from transformers import DebertaConfig, DebertaForMaskedLM


# Tokenizer shared by the whole notebook; objective() sizes the model's
# vocabulary from it and the MLM data collator uses it for masking.
DNA_TOKENIZER_CHECKPOINT = "armheb/DNA_bert_6"
tokenizer = AutoTokenizer.from_pretrained(DNA_TOKENIZER_CHECKPOINT)


# OPTUNA HYPERPARAM TUNING

In [3]:
# Install in a terminal and confirm
# !conda install -n YOUR_CONDA_ENV optuna
# !conda install -n YOUR_CONDA_ENV -c plotly plotly=5.8.2
# !conda install -n YOUR_CONDA_ENV "jupyterlab>=3" "ipywidgets>=7.6"
# !conda install -n YOUR_CONDA_ENV scikit-learn -c conda-forge


In [4]:
from transformers import TrainerCallback
# Optional part: callback for pruning of experiments

class PruningLogCallback(TrainerCallback):
    """Trainer callback that feeds evaluation losses to an Optuna trial.

    After each evaluation the ``eval_loss`` metric is reported to the trial,
    using a running count of evaluations (0, 1, 2, ...) as the Optuna step.
    If the trial then says it should be pruned, ``optuna.TrialPruned`` is
    raised, which aborts the ongoing training run.

    Args:
        trial: Object exposing ``report(value, step)`` and ``should_prune()``
            (the ``trial`` argument handed to the Optuna objective).
    """

    def __init__(self, trial):
        # Number of evaluations reported so far; used as the Optuna step index.
        self.step = 0
        self.trial = trial

    def on_evaluate(self, args, state, control, metrics=None, **kwargs):
        """Report the latest ``eval_loss`` and prune the trial if requested.

        Raises:
            optuna.TrialPruned: When ``trial.should_prune()`` returns True.
        """
        # Imported here so this cell does not silently depend on a later cell
        # having executed `import optuna` first.
        import optuna

        # Evaluations that produced no `eval_loss` cannot be reported; skip
        # them instead of failing with a KeyError.
        if not metrics or 'eval_loss' not in metrics:
            return

        eval_loss = metrics['eval_loss']
        current_step = self.step
        self.step += 1
        self.trial.report(eval_loss, current_step)

        if self.trial.should_prune():
            raise optuna.TrialPruned()


In [9]:
# How to define search spaces
# https://optuna.readthedocs.io/en/stable/tutorial/10_key_features/002_configurations.html
# https://optuna.readthedocs.io/en/stable/reference/generated/optuna.trial.Trial.html#optuna.trial.Trial.suggest_float


# Number of training sequences between two logging events; objective() divides
# it by the suggested batch size to obtain the Trainer's `logging_steps`.
log_once_every_x_sequences = 64000

def objective(trial):
    """Optuna objective: train one DeBERTa masked-LM and return its eval loss.

    The trial suggests the number of hidden layers, learning rate, weight
    decay, MLM masking probability and (effective) batch size. A freshly
    initialised model is trained for one epoch on the tokenized human-DNA
    dataset, with `PruningLogCallback` reporting intermediate evaluation
    losses so that the study's pruner can stop the trial early.

    Args:
        trial: The `optuna.trial.Trial` supplying the hyperparameter values.

    Returns:
        The `eval_loss` measured on the test split after training (the value
        the study minimises).

    Raises:
        optuna.TrialPruned: Propagated from the callback when the trial is pruned.
    """
    num_train_epochs = 1
    # Largest batch placed on a device at once; bigger suggested batch sizes
    # are reached through gradient accumulation.
    max_per_device_batch_size = 64

    hidden_layers = trial.suggest_int('num_hidden_layers', low=1, high=12, step=1)
    learning_rate = trial.suggest_float('learning_rate', low=1e-5, high=1e-1, log=True)
    weight_decay = trial.suggest_float('weight_decay', low=0, high=0.3)
    mlm_probability = trial.suggest_float('mlm_probability', low=0.05, high=0.5, step=0.05)
    batch_size = trial.suggest_categorical('batch_size', [8, 16, 32, 64, 128, 256, 512])

    # Step counts must be integers: use floor division (the previous `/`
    # produced floats such as 250.0). Computed from the suggested batch size
    # *before* it is capped below. `eval_steps` is initialised from this value
    # as well, so it also sets how often the pruning callback fires.
    logging_steps = max(1, log_once_every_x_sequences // batch_size)
    if batch_size <= max_per_device_batch_size:
        accumulation_steps = 1
    else:
        accumulation_steps = batch_size // max_per_device_batch_size
        batch_size = max_per_device_batch_size

    model_config = DebertaConfig(
        vocab_size=len(tokenizer.vocab),
        max_position_embeddings=512,
        num_hidden_layers=hidden_layers,
    )
    model = DebertaForMaskedLM(config=model_config)
    # NOTE(review): the constructor presumably already initialises the
    # weights; kept to preserve the original behaviour.
    model.init_weights()

    training_args = TrainingArguments(
        output_dir='./model',
        overwrite_output_dir=True,
        evaluation_strategy="steps",
        save_strategy="steps",
        learning_rate=learning_rate,
        weight_decay=weight_decay,
        push_to_hub=False,
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=8,
        gradient_accumulation_steps=accumulation_steps,
        num_train_epochs=num_train_epochs,
        save_total_limit=1,
        # load_best_model_at_end=True,
        logging_steps=logging_steps,
        # save_steps=5000,
        fp16=True,
        # warmup_steps=1000,
    )

    # The full splits are used. For a faster hyperopt demonstration, swap in
    # the 2% slices below instead.
    # train_dset = load_dataset("simecek/Human_DNA_v0_DNABert6tokenized", split='train[:2%]')
    # test_dset = load_dataset("simecek/Human_DNA_v0_DNABert6tokenized", split='test[:2%]')
    train_dset = load_dataset("simecek/Human_DNA_v0_DNABert6tokenized", split='train')
    test_dset = load_dataset("simecek/Human_DNA_v0_DNABert6tokenized", split='test')

    data_collator = DataCollatorForLanguageModeling(
        tokenizer=tokenizer, mlm=True, mlm_probability=mlm_probability
    )
    trainer = Trainer(
        model=model,
        args=training_args,
        data_collator=data_collator,
        train_dataset=train_dset,
        eval_dataset=test_dset,
        callbacks=[PruningLogCallback(trial)],
    )

    trainer.train()

    # Optimizing for validation loss
    return trainer.evaluate()['eval_loss']



In [None]:
import optuna
import logging
import sys
# pruner doc https://optuna.readthedocs.io/en/stable/reference/generated/optuna.pruners.MedianPruner.html#optuna.pruners.MedianPruner
# Mirror Optuna's log messages to stdout. The handler is attached only once:
# unconditionally calling addHandler() stacked a new handler on every re-run of
# this cell, so each message was printed several times.
optuna_logger = optuna.logging.get_logger("optuna")
stdout_handler_attached = any(
    isinstance(handler, logging.StreamHandler)
    and getattr(handler, "stream", None) is sys.stdout
    for handler in optuna_logger.handlers
)
if not stdout_handler_attached:
    optuna_logger.addHandler(logging.StreamHandler(sys.stdout))

study = optuna.create_study(
    study_name="my_first_hyperparameter_search",
    direction='minimize',
    # n_startup_trials=5 is the default
    pruner=optuna.pruners.MedianPruner(n_startup_trials=3, n_warmup_steps=0),
)
# n_trials dictates the total number of runs (one hyperparam combination = one run)
study.optimize(func=objective, n_trials=20)  # n_trials=10
print(study.best_value)
print(study.best_params)
print(study.best_trial)


[32m[I 2022-06-21 17:05:39,790][0m A new study created in memory with name: my_first_hyperparameter_search[0m


A new study created in memory with name: my_first_hyperparameter_search
A new study created in memory with name: my_first_hyperparameter_search
A new study created in memory with name: my_first_hyperparameter_search


using `logging_steps` to initialize `eval_steps` to 250.0
PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
Using custom data configuration simecek--Human_DNA_v0_DNABert6tokenized-9a684042f2db6cd1
Reusing dataset parquet (/home/jovyan/.cache/huggingface/datasets/simecek___parquet/simecek--Human_DNA_v0_DNABert6tokenized-9a684042f2db6cd1/0.0.0/0b6d5799bb726b24ad7fc7be720c170d8e497f575d02d47537de9a5bac074901)
Using custom data configuration simecek--Human_DNA_v0_DNABert6tokenized-9a684042f2db6cd1
Reusing dataset parquet (/home/jovyan/.cache/huggingface/datasets/simecek___parquet/simecek--Human_DNA_v0_DNABert6tokenized-9a684042f2db6cd1/0.0.0/0b6d5799bb726b24ad7fc7be720c170d8e497f575d02d47537de9a5bac074901)
Using amp half precision backend
***** Run

Step,Training Loss,Validation Loss


In [None]:
# Show the hyperparameter values of the study's best trial.
print(study.best_params)


In [None]:
import plotly.io as pio
# Make "iframe" the default Plotly renderer for the figures shown below.
pio.renderers.default = "iframe"

In [None]:
# Parallel-coordinate plot of the study restricted to the tuned hyperparameters.
hyperparameters = [
    'num_hidden_layers',
    'learning_rate',
    'weight_decay',
    'mlm_probability',
    'batch_size',
]
fig = optuna.visualization.plot_parallel_coordinate(study, params=hyperparameters)
fig.show()

In [None]:
# Plot Optuna's hyperparameter-importance estimate for the finished study.
fig = optuna.visualization.plot_param_importances(study)
fig.show()