# SETUP

In [1]:
#!pip install transformers datasets comet_ml optuna scikit-learn kaleido
#!wget http://raw.githubusercontent.com/ML-Bioinfo-CEITEC/cDNA-pretraining/main/experiments/custom_masking/custom_collator.py

In [2]:
# PARAMETERS

K = 7
DATASET = "simecek/Human_DNA_v0_K7tokenized_stride1"

BATCH_SIZE = 32
MAX_BATCH_SIZE = 16
HIDDEN_LAYERS = 6
MLM_P = 0.15

log_once_every_x_sequences = 64000
THINNING = 10 # percent

In [3]:
import random
import string
run_id = ''.join(random.choice(string.ascii_uppercase + string.digits) for _ in range(5))
run_id

'KXY6L'

In [4]:
import comet_ml
import os

api_key = "uckBYzSuTQ0uUKVA42536dD7Q"
project_name="Hyperopt"

os.environ['COMET_API_KEY'] = api_key
comet_ml.init(project_name=project_name, api_key=api_key)

COMET INFO: Comet API key is valid
COMET INFO: Comet API key saved in /root/.comet.config


In [5]:
from datasets import load_dataset, ReadInstruction
from transformers import AutoTokenizer, AutoModelForMaskedLM
from transformers import TrainingArguments, Trainer
from transformers import DebertaConfig, DebertaForMaskedLM
from itertools import product

tokenizer = AutoTokenizer.from_pretrained("armheb/DNA_bert_6")

alphabet = ('A', 'C', 'T', 'G')
vocab = list(map(''.join, product(alphabet, repeat=K)))

tokenizer.add_tokens(vocab)

Downloading:   0%|          | 0.00/40.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.08k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/28.0k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/112 [00:00<?, ?B/s]

16384

# OPTUNA HYPERPARAM TUNING

In [6]:
from custom_collator import WideCollator
from transformers import TrainerCallback
#Optional part for pruning of experiments

class PruningLogCallback(TrainerCallback):
    def __init__(self, trial):
        self.step = 0
        self.trial = trial
    
    def on_evaluate(self, args, state, control, metrics, **kwargs):
        eval_loss = metrics['eval_loss']
        current_step = self.step
        self.step = self.step+1
        self.trial.report(eval_loss, current_step)
        
        if(self.trial.should_prune()):
            raise optuna.TrialPruned()

crashes = []
#getting only X% of datasets for faster hyperopt demonstration
train_dset = load_dataset(DATASET, split=f'train[:{THINNING}%]')
test_dset = load_dataset(DATASET, split=f'test[:{THINNING}%]')
mask_area = K

train_dset, test_dset 

Downloading:   0%|          | 0.00/1.05k [00:00<?, ?B/s]

Using custom data configuration simecek--Human_DNA_v0_K7tokenized_stride1-1775a81e1ff2a82a


Downloading and preparing dataset None/None (download: 4.87 GiB, generated: 16.43 GiB, post-processed: Unknown size, total: 21.29 GiB) to /root/.cache/huggingface/datasets/simecek___parquet/simecek--Human_DNA_v0_K7tokenized_stride1-1775a81e1ff2a82a/0.0.0/7328ef7ee03eaf3f86ae40594d46a1cec86161704e02dd19f232d81eee72ade8...


Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/131M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/131M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/131M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/131M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/147M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/147M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/146M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/148M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/147M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/147M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/147M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/147M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/147M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/147M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/147M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/148M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/147M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/147M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/147M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/147M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/147M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/147M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/147M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/147M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/147M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/147M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/147M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/147M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/147M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/146M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/147M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/147M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/146M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/148M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/146M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/147M [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]

0 tables [00:00, ? tables/s]

0 tables [00:00, ? tables/s]

Dataset parquet downloaded and prepared to /root/.cache/huggingface/datasets/simecek___parquet/simecek--Human_DNA_v0_K7tokenized_stride1-1775a81e1ff2a82a/0.0.0/7328ef7ee03eaf3f86ae40594d46a1cec86161704e02dd19f232d81eee72ade8. Subsequent calls will reuse this data.


Using custom data configuration simecek--Human_DNA_v0_K7tokenized_stride1-1775a81e1ff2a82a
Reusing dataset parquet (/root/.cache/huggingface/datasets/simecek___parquet/simecek--Human_DNA_v0_K7tokenized_stride1-1775a81e1ff2a82a/0.0.0/7328ef7ee03eaf3f86ae40594d46a1cec86161704e02dd19f232d81eee72ade8)


(Dataset({
     features: ['input_ids', 'token_type_ids', 'attention_mask'],
     num_rows: 514741
 }), Dataset({
     features: ['input_ids', 'token_type_ids', 'attention_mask'],
     num_rows: 57194
 }))

In [7]:
# How to define search spaces
# https://optuna.readthedocs.io/en/stable/tutorial/10_key_features/002_configurations.html
# https://optuna.readthedocs.io/en/stable/reference/generated/optuna.trial.Trial.html#optuna.trial.Trial.suggest_float

def objective(trial):
    num_train_epochs = 1
    hidden_layers = HIDDEN_LAYERS
    learning_rate = trial.suggest_float('learning_rate', low=1e-5, high=1e-1, log=True)
    weight_decay = trial.suggest_float('weight_decay', low=0, high=0.3)
    mlm_probability = MLM_P
    batch_size = BATCH_SIZE
    
    logging_steps = int(log_once_every_x_sequences/batch_size)
    if batch_size <= MAX_BATCH_SIZE:
        accumulation_steps = 1 
    else:
        accumulation_steps = batch_size / MAX_BATCH_SIZE
        batch_size = MAX_BATCH_SIZE
    
    model_config = DebertaConfig(vocab_size=len(tokenizer.vocab), max_position_embeddings=512, num_hidden_layers=hidden_layers)
    model = DebertaForMaskedLM(config=model_config)
    model.init_weights()
    
    training_args = TrainingArguments(
            output_dir='./model',
            overwrite_output_dir=True,
            evaluation_strategy = "steps",
            save_strategy = "steps",
            learning_rate=learning_rate,
            weight_decay=weight_decay, 
            push_to_hub=False,
            per_device_train_batch_size=batch_size,
            per_device_eval_batch_size=batch_size,
            gradient_accumulation_steps=accumulation_steps,
            num_train_epochs=num_train_epochs,
            save_total_limit=1,
            # load_best_model_at_end=True,
            logging_steps=logging_steps,       
            # save_steps=5000,
            fp16=True,
            # warmup_steps=1000,
    )    

    data_collator = WideCollator(area=mask_area, tokenizer=tokenizer, mlm=True, mlm_probability=mlm_probability, mask_fully=True)

    trainer = Trainer(
            model=model,
            args=training_args,
            data_collator=data_collator,
            train_dataset=train_dset,
            eval_dataset=test_dset,
            callbacks=[PruningLogCallback(trial)],
    )
    
    try:
        train_loss = trainer.train().training_loss
        eval_loss = trainer.evaluate()['eval_loss']
    except Exception as e:
        crashes.append({'exception':e, 'trial':trial.number})
        raise optuna.TrialPruned()
    
    #Optimizing for validation loss
    return eval_loss



In [None]:
import optuna
import logging
import sys

# pruner doc https://optuna.readthedocs.io/en/stable/reference/generated/optuna.pruners.MedianPruner.html#optuna.pruners.MedianPruner
optuna.logging.get_logger("optuna").addHandler(logging.StreamHandler(sys.stdout))

study=optuna.create_study(
    study_name=f"{run_id}_hyperparameter_search", 
    direction='minimize', 
    pruner=optuna.pruners.MedianPruner(n_startup_trials=3, n_warmup_steps=0) #n_startup_trials=5 as a default
)
# n_trials dictates the total number of runs (one hyperparam combination = one run)
study.optimize(func=objective, n_trials=30)
print(study.best_value)
print(study.best_params)
print(study.best_trial)


[32m[I 2022-06-30 23:21:34,214][0m A new study created in memory with name: KXY6L_hyperparameter_search[0m


A new study created in memory with name: KXY6L_hyperparameter_search


Using cuda_amp half precision backend
***** Running training *****
  Num examples = 514741
  Num Epochs = 1
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 32.0
  Gradient Accumulation steps = 2.0
  Total optimization steps = 16086
COMET ERROR: Failed to calculate active processors count. Fall back to default CPU count 1
COMET INFO: Couldn't find a Git repository in '/content' nor in any parent directory. You can override where Comet is looking for a Git Patch by setting the configuration `COMET_GIT_DIRECTORY`
COMET INFO: Experiment is live on comet.ml https://www.comet.ml/simecek/hyperopt/25c1fa7a57fb43ebadef8be50d324f07

Automatic Comet.ml online logging enabled


Step,Training Loss,Validation Loss


Saving model checkpoint to ./model/checkpoint-500
Configuration saved in ./model/checkpoint-500/config.json
Model weights saved in ./model/checkpoint-500/pytorch_model.bin
Saving model checkpoint to ./model/checkpoint-1000
Configuration saved in ./model/checkpoint-1000/config.json
Model weights saved in ./model/checkpoint-1000/pytorch_model.bin
Deleting older checkpoint [model/checkpoint-500] due to args.save_total_limit


In [None]:
print(study.best_params)


{'learning_rate': 0.00033358137370955484, 'weight_decay': 0.23163923816605497}


In [None]:
print(crashes)

[{'exception': TrialPruned(), 'trial': 3}, {'exception': TrialPruned(), 'trial': 4}, {'exception': TrialPruned(), 'trial': 6}, {'exception': TrialPruned(), 'trial': 7}]


In [None]:
import pickle
#file = open(f"{run_id}_HYPEROPT_study.pkl",'rb')
#study = pickle.load(file)
#file.close()

In [None]:
import plotly.io as pio
pio.renderers.default = "colab"
# pio.renderers.default = "iframe"

In [None]:
print(study.best_params)

{'learning_rate': 0.00033358137370955484, 'weight_decay': 0.23163923816605497}


In [None]:
optuna.visualization.plot_optimization_history(study)

In [None]:
hyperparameters = ['learning_rate', 'weight_decay']
fig = optuna.visualization.plot_parallel_coordinate(study, params=hyperparameters)
fig.show()
fig.write_image(f"{run_id}_parallel_coordinate.png")

In [None]:
fig = optuna.visualization.plot_param_importances(study)
fig.show()
fig.write_image(f'{run_id}_param_importances.png')

In [None]:
def save_object(obj, filename):
    with open(filename, 'wb') as outp:
        pickle.dump(obj, outp, pickle.HIGHEST_PROTOCOL)


save_object(study, f'{run_id}_HYPEROPT_study.pkl')