# Optimization for adapter batch size

In [16]:
suffix = "3x3" # multilabel classification

In [17]:
# !pip install adapter-transformers
# !pip install torch
# !pip install pandas
# !pip install keras
# !pip install datasets
# !pip install tensorflow
# !pip install sklearn
# !pip install wandb

In [18]:
import torch
import pickle
data_path = "/home/lieberze/DP/Thesis/05_model_training/data/512_bp_for_encoding/NEW/All_100k.txt"       
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# https://huggingface.co/docs/tokenizers/python/latest/api/reference.html#module-tokenizers.processors
path = "/home/lieberze/DP/Thesis/02_tokenizery_new_data/02_ByteLevelBPE/All_genomes_sample/All_512/5000/"

from transformers import RobertaTokenizer
tokenizer = RobertaTokenizer.from_pretrained(path)
tokenizer.vocab_size

file /home/lieberze/DP/Thesis/02_tokenizery_new_data/02_ByteLevelBPE/All_genomes_sample/All_512/5000/config.json not found


5000

In [19]:
print(tokenizer)

PreTrainedTokenizer(name_or_path='/home/lieberze/DP/Thesis/02_tokenizery_new_data/02_ByteLevelBPE/All_genomes_sample/All_512/5000/', vocab_size=5000, model_max_len=1000000000000000019884624838656, is_fast=False, padding_side='right', special_tokens={'bos_token': AddedToken("<s>", rstrip=False, lstrip=False, single_word=False, normalized=True), 'eos_token': AddedToken("</s>", rstrip=False, lstrip=False, single_word=False, normalized=True), 'unk_token': AddedToken("<unk>", rstrip=False, lstrip=False, single_word=False, normalized=True), 'sep_token': AddedToken("</s>", rstrip=False, lstrip=False, single_word=False, normalized=True), 'pad_token': AddedToken("<pad>", rstrip=False, lstrip=False, single_word=False, normalized=True), 'cls_token': AddedToken("<s>", rstrip=False, lstrip=False, single_word=False, normalized=True), 'mask_token': AddedToken("<mask>", rstrip=False, lstrip=True, single_word=False, normalized=True)})


In [20]:
id2label = {id:label for id, label in enumerate(["exon", "intron", "intergenic"])}
label2id = {label:id for id,label in id2label.items()}
id2label, label2id

({0: 'exon', 1: 'intron', 2: 'intergenic'},
 {'exon': 0, 'intron': 1, 'intergenic': 2})

In [21]:
import torch
import pandas as pd
from torch.utils.data import Dataset, DataLoader
from keras.preprocessing.sequence import pad_sequences
from torch.nn.utils.rnn import pad_sequence
from datasets import load_dataset

# defining the Dataset class
# there is also a method set_format (columns, ...)
class data_set(Dataset):
    def __init__(self, data, labels, tokenizer):
        self.data = data
        self.labels = labels
        self.tokenizer = tokenizer
        
    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, index):
        seq = self.data[index]
        lab = self.labels[index]
        lab_id = label2id[lab]
        tokenized = tokenizer(seq, max_length=128, padding="max_length", truncation=True)   
        tokenized_with_label = tokenized
        tokenized_with_label["labels"] = lab_id # possible arguments are: input_ids, attention_mask, labels
        return tokenized_with_label
    
df = pd.read_csv(data_path, sep="\t", names=['type','sequence'])
dataset = data_set(df["sequence"],df["type"], tokenizer)

In [22]:
from sklearn.model_selection import train_test_split

df_train, df_eval = train_test_split(df, test_size=0.15, random_state=42, stratify=df["type"])
df_train, df_holdout = train_test_split(df_train, test_size=0.1, random_state=42, stratify=df_train["type"])

df_train = df_train.reset_index()
df_eval = df_eval.reset_index()
df_holdout = df_holdout.reset_index()

data_set_train = data_set(df_train["sequence"],df_train["type"], tokenizer)
data_set_eval = data_set(df_eval["sequence"],df_eval["type"], tokenizer)
# don't touch :)
holdout_test = data_set(df_holdout["sequence"],df_holdout["type"], tokenizer)

In [23]:
import os
Transformers = "/home/lieberze/DP/Thesis/05_model_training/"
TransformerName = os.path.abspath(os.path.join(Transformers, "roberta-trained-new-tokenizer_params_4")) 
# the correct way to predict with a trained model is prediction = model(tokenized_sequence_to_classify)

SaveToFolder = "adapter_optimize"
LabelNames = ["exon", "intron", "intergenic"]
num_labels = len(LabelNames)

In [28]:
from transformers import AdapterTrainer, AutoModelWithHeads #TrainingsArguments
from transformers.training_args import TrainingArguments
import numpy as np
import pandas as pd
import wandb
    # https://docs.adapterhub.ml/training.html
    # https://discuss.huggingface.co/t/keyerror-loss-while-training-qna/4111
    # https://huggingface.co/docs/transformers/main_classes/trainer
    
number_of_epochs = 5
num_GPU = 3
WANTED_eval_data_points_ratio = 0.1

def TrainAdapter(ModelName, SaveToFolder, AdapterName, LR, WR, AE, TBS):
    wandb.init(project="adapter_optimize")
               
    model = AutoModelWithHeads.from_pretrained(ModelName)
    adapter_name = AdapterName
    model.add_adapter(adapter_name)
    model.add_classification_head(adapter_name, num_labels=num_labels, id2label = id2label) #, multilabel=False)
    model.train_adapter(adapter_name) # inicializace
    
    # train_batch_size = eval_batch_size = 64
    total_optimization_steps = round(len(df_train)/TBS/num_GPU)*number_of_epochs
    eval_steps_in_one_run = round(len(df_eval)/(number_of_epochs*TBS)/num_GPU) 
    eval_steps_to_set=int(round(total_optimization_steps*WANTED_eval_data_points_ratio, 0))
    print("total_optimization_steps:",total_optimization_steps,
          "\nhow many 'eval_steps' to set:",eval_steps_to_set,
          "\nhow many steps in each evaluation stop:",eval_steps_in_one_run,
          "\nhence in total:", eval_steps_to_set*eval_steps_in_one_run, "steps for the whole evaluation" )

    training_args =  TrainingArguments(
        learning_rate=LR,
        num_train_epochs=number_of_epochs,
        report_to="wandb",
        output_dir = SaveToFolder,
        label_names = LabelNames,
        eval_steps = eval_steps_to_set, 
        evaluation_strategy="steps",
        per_device_train_batch_size=TBS, # poladit (32, 64, 128)
        per_device_eval_batch_size=TBS,
        do_eval=True,
        logging_steps=total_optimization_steps*0.05, 
        warmup_ratio = WR,
        adam_epsilon=AE,
        weight_decay=0.0005,
        save_steps=10_000,
        save_total_limit=2,
        # prediction_loss_only # rozhodne ne!
         # The next line is important to ensure the dataset labels are properly passed to the model
        remove_unused_columns=False,
        seed=1,
    )

    from sklearn import metrics
    from sklearn.metrics import accuracy_score, f1_score
    from transformers import EvalPrediction
    
    def compute_metrics(p: EvalPrediction):
        logits, labels = p
        preds = np.argmax(logits, axis=-1)
        acc = accuracy_score(labels, preds)
        f1_weighted = f1_score(labels, preds, average='weighted')
        
        report = metrics.classification_report(labels, preds, digits=2, output_dict=True, zero_division=0)
        df_report = pd.DataFrame(report).transpose()
        return {"acc": acc, "f1_weighted": f1_weighted, "report":df_report}

    model.metrics = ['f1_weighted'] #optimizer=opt, loss=loss,

    trainer = AdapterTrainer(
            model=model,
            args=training_args,
            train_dataset=data_set_train,
            eval_dataset=data_set_eval,
            compute_metrics = compute_metrics,
        )
    
    # first train model, save results
    result = trainer.train()
    
    # evaluation metrics, manually, because (for unknown reason) the model itself doesn't return them
    y_pr = data_set_eval.labels
    y_pr = [label2id[i] for i in y_pr]
    loss = trainer.compute_metrics([trainer.predict(data_set_eval).predictions[1], y_pr])
    Metrics = trainer.evaluate()
    
    print(loss, Metrics)
    !mkdir -p $SaveToFolder/$AdapterName
    
    # saving results
    with open(f"{SaveToFolder}/{AdapterName}/metrics_{suffix}.pkl", "wb") as f:
        pickle.dump({"loss":loss, "metrics":Metrics, "result":result}, f)

    trainer.save_model(SaveToFolder)
    wandb.finish()

In [None]:
# TRY:
choices = [[0.0003, 0.08, 1e-08, 4],
           [0.0003, 0.08, 1e-08, 16],
           [0.0003, 0.05, 1e-08, 32],
           [0.0003, 0.05, 1e-08, 64],]
            # [0.0003, 0.05, 1e-08, 128]] # killed job (too big for memory)

for i,choice in enumerate(choices):
    LR, WR, AE, TBS = choice
    TrainAdapter(TransformerName, SaveToFolder, f"adapter_batchsize_{TBS}", LR, WR, AE, TBS)

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mliebelife[0m (use `wandb login --relogin` to force relogin)
[34m[1mwandb[0m: wandb version 0.12.11 is available!  To upgrade, please run:
[34m[1mwandb[0m:  $ pip install wandb --upgrade


Some weights of the model checkpoint at /home/lieberze/DP/Thesis/05_model_training/roberta-trained-new-tokenizer_params_4 were not used when initializing RobertaModelWithHeads: ['lm_head.dense.weight', 'lm_head.bias', 'lm_head.dense.bias', 'lm_head.layer_norm.bias', 'lm_head.layer_norm.weight']
- This IS expected if you are initializing RobertaModelWithHeads from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModelWithHeads from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaModelWithHeads were not initialized from the model checkpoint at /home/lieberze/DP/Thesis/05_model_training/roberta-trained-new-tokenizer_params_4 and are newly initialized: ['roberta.pooler.dense.bias', 

total_optimization_steps: 2215 
how many 'eval_steps' to set: 222 
how many steps in each evaluation stop: 16 
hence in total: 3552 steps for the whole evaluation


***** Running training *****
  Num examples = 85000
  Num Epochs = 5
  Instantaneous batch size per device = 64
  Total train batch size (w. parallel, distributed & accumulation) = 192
  Gradient Accumulation steps = 1
  Total optimization steps = 2215
Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"


Step,Training Loss,Validation Loss


In [25]:
batch_sizes = [i[-1] for i in choices]
batch_sizes

[4, 16, 32, 64]

This may or may not be necessary

In [None]:
# model.save_adapter("./adapter-sequence-types/", adapter_name) # saving only the adapter

## Loading the adapter + transformer and prediction

In [None]:
from transformers import TextClassificationPipeline
import pickle
from sklearn import metrics

for batchsize in batch_sizes[]:
    print(f"processing batchsize {batchsize}")
    adapter_batchsize = batchsize
    model = AutoModelWithHeads.from_pretrained(TransformerName)
    adapter1 = model.load_adapter(f"{SaveToFolder}/adapter_batchsize_{adapter_batchsize}")
    model.active_adapters = adapter1
    
    sequences = list(df_holdout.sequence)
    true_labels = list(df_holdout.type)

    classifier = TextClassificationPipeline(model=model, tokenizer=tokenizer)#, device=training_args.device.index)

    ## this takes a lot of time to compute so it is better to save the output for future use
    pred_labels = classifier(sequences)
    with open(f"{SaveToFolder}/adapter_batchsize_{adapter_batchsize}/pred_labels_{suffix}.pk", 'wb') as f:
        pickle.dump(pred_labels, f)

    pred_lab = [i["label"] for i in pred_labels]
    pred_score = [i["score"] for i in pred_labels]

    print(pred_labels[:50])
    
    report = metrics.classification_report(true_labels, pred_lab, digits=2, output_dict=True, zero_division=0)
    df_report = pd.DataFrame(report)
    df_report.to_pickle(f"{SaveToFolder}/adapter_batchsize_{adapter_batchsize}/report_{suffix}.pkl")

processing batchsize 16


Some weights of the model checkpoint at /home/lieberze/DP/Thesis/05_model_training/roberta-trained-new-tokenizer_params_4 were not used when initializing RobertaModelWithHeads: ['lm_head.layer_norm.bias', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.dense.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaModelWithHeads from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModelWithHeads from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaModelWithHeads were not initialized from the model checkpoint at /home/lieberze/DP/Thesis/05_model_training/roberta-trained-new-tokenizer_params_4 and are newly initialized: ['roberta.pooler.dense.weight'

[{'label': 'intron', 'score': 0.43355241417884827}, {'label': 'intron', 'score': 0.4161914587020874}, {'label': 'exon', 'score': 0.9289034008979797}, {'label': 'intergenic', 'score': 0.37669631838798523}, {'label': 'intergenic', 'score': 0.4883517920970917}, {'label': 'intron', 'score': 0.5470800399780273}, {'label': 'intron', 'score': 0.4596070349216461}, {'label': 'intergenic', 'score': 0.5114262700080872}, {'label': 'intron', 'score': 0.4685747027397156}, {'label': 'intron', 'score': 0.41981571912765503}, {'label': 'intergenic', 'score': 0.4568425118923187}, {'label': 'exon', 'score': 0.3957570791244507}, {'label': 'exon', 'score': 0.6212294101715088}, {'label': 'exon', 'score': 0.9940525889396667}, {'label': 'intron', 'score': 0.520086407661438}, {'label': 'intergenic', 'score': 0.5287533402442932}, {'label': 'intron', 'score': 0.4231276512145996}, {'label': 'exon', 'score': 0.45736563205718994}, {'label': 'intergenic', 'score': 0.41922834515571594}, {'label': 'intron', 'score': 0.

Some weights of the model checkpoint at /home/lieberze/DP/Thesis/05_model_training/roberta-trained-new-tokenizer_params_4 were not used when initializing RobertaModelWithHeads: ['lm_head.layer_norm.bias', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.dense.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaModelWithHeads from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModelWithHeads from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaModelWithHeads were not initialized from the model checkpoint at /home/lieberze/DP/Thesis/05_model_training/roberta-trained-new-tokenizer_params_4 and are newly initialized: ['roberta.pooler.dense.weight'

## Load all classification reports for batch optimization

In [42]:
for batchsize in batch_sizes:
    print(f"adapter with batch size: {batchsize}")
    with open(f"{SaveToFolder}/adapter_batchsize_{batchsize}/report_{suffix}.pkl", 'rb') as f:
        x = pickle.load(f)
    print(x)

adapter with batch size: 4
                  exon   intergenic       intron  accuracy    macro avg  \
precision     0.760353     0.522990     0.476809  0.581176     0.586717   
recall        0.696278     0.593118     0.453583  0.581176     0.580993   
f1-score      0.726906     0.555851     0.464906  0.581176     0.582554   
support    2848.000000  2819.000000  2833.000000  0.581176  8500.000000   

           weighted avg  
precision      0.587129  
recall         0.581176  
f1-score       0.582853  
support     8500.000000  
adapter with batch size: 16
                  exon   intergenic       intron  accuracy    macro avg  \
precision     0.757104     0.518725     0.457915  0.570118     0.577915   
recall        0.682935     0.560128     0.466643  0.570118     0.569902   
f1-score      0.718110     0.538632     0.462238  0.570118     0.572993   
support    2848.000000  2819.000000  2833.000000  0.570118  8500.000000   

           weighted avg  
precision      0.578328  
recall     