In [2]:
# !pip install adapter-transformers
# !pip install torch
# !pip install pandas
# !pip install keras
# !pip install datasets
# !pip install tensorflow
# !pip install sklearn
# !pip install wandb

In [3]:
import torch
data_path = "/home/lieberze/DP/Thesis/model_training/data/512_bp_for_encoding/NEW/All_200k_tail_for_tokenizer.txt"       
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [4]:
# https://huggingface.co/docs/tokenizers/python/latest/api/reference.html#module-tokenizers.processors
from tokenizers import ByteLevelBPETokenizer
from tokenizers import Tokenizer

path = "/home/lieberze/DP/Thesis/tokenizery_new_data/02_ByteLevelBPE/All_genomes_sample/All_512/5000/"
tokenizer = ByteLevelBPETokenizer(
    path + "vocab.json",
    path + "merges.txt",
)

# https://huggingface.co/docs/transformers/preprocessing
from tokenizers.processors import BertProcessing
tokenizer._tokenizer.post_processor = BertProcessing(
    ("</s>", tokenizer.token_to_id("</s>")),
    ("<s>", tokenizer.token_to_id("<s>"))
)

tokenizer.save("byte-level-bpe.tokenizer.json", pretty=True)

from transformers import RobertaTokenizer
tokenizer = RobertaTokenizer.from_pretrained(path)
tokenizer.vocab_size

file /home/lieberze/DP/Thesis/tokenizery_new_data/02_ByteLevelBPE/All_genomes_sample/All_512/5000/config.json not found


5000

In [5]:
id2label = {id:label for id, label in enumerate(["exon", "other"])}
label2id = {label:id for id,label in id2label.items()}
id2label, label2id

({0: 'exon', 1: 'other'}, {'exon': 0, 'other': 1})

In [6]:
# a = "exon"
# label2id[a]

In [7]:
import torch
import pandas as pd
from torch.utils.data import Dataset, DataLoader
from keras.preprocessing.sequence import pad_sequences
from torch.nn.utils.rnn import pad_sequence
from datasets import load_dataset

# defining the Dataset class
class data_set(Dataset):
    def __init__(self, data, labels, tokenizer):
        self.data = data
        self.labels = labels
        self.tokenizer = tokenizer
        
    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, index):
        seq = self.data[index]
        lab = self.labels[index]
        lab_id = label2id[lab]
        tokenized = tokenizer(seq, max_length=128, padding="max_length", truncation=True)   
        tokenized_with_label = tokenized
        tokenized_with_label["labels"] = lab_id
        return tokenized_with_label
    
df = pd.read_csv(data_path, sep="\t", names=['type','sequence'])
subs = {
        "exon": "exon", 
        "intron": "other",
        "intergenic": "other"
        }
y = [subs.get(item) for item in df["type"]]
df["type"] = y
dataset = data_set(df["sequence"],df["type"], tokenizer)

dataloader = DataLoader(dataset, batch_size=4, shuffle=True) #, collate_fn=collate_tokenize) #, collate_fn=lambda x: x )
data = next(iter(dataloader))
# data

In [7]:
# for epoch in range(2):
#     print("\n==============================\n")
#     print("Epoch = " + str(epoch))
#     for (batch_idx, batch) in enumerate(dataloader):
#         print("\nBatch = " + str(batch_idx))
#         print(batch)
#         break

In [8]:
from sklearn.model_selection import train_test_split

df_train, df_eval = train_test_split(df, test_size=0.25, random_state=42, stratify=df["type"])
df_train, df_holdout = train_test_split(df_train, test_size=0.1, random_state=42, stratify=df_train["type"])

df_train = df_train.reset_index()
df_eval = df_eval.reset_index()
df_holdout = df_holdout.reset_index()

data_set_train = data_set(df_train["sequence"],df_train["type"], tokenizer)
data_set_eval = data_set(df_eval["sequence"],df_eval["type"], tokenizer)
# nesahat :)
holdout_test = data_set(df_holdout["sequence"],df_holdout["type"], tokenizer)

In [9]:
eval_size = len(df_eval)
number_of_epochs = 6
WANTED_eval_data_points_ratio = 0.1
    # proportionally to the whole dataset size. e.g. 0.1 == 10% of all optimization
    # steps is going to have an evaluation datapoint (loss)

train_batch_size = eval_batch_size = 64
total_optimization_steps = len(df_train)/train_batch_size
eval_steps_in_one_run = eval_size/(number_of_epochs*train_batch_size)
eval_steps_to_set=int(round(total_optimization_steps*WANTED_eval_data_points_ratio, 0))

print("total_optimization_steps:",total_optimization_steps,
      "\nhow many 'eval_steps' to set:",eval_steps_to_set,
      "\nhow many steps in each evaluation stop:",eval_steps_in_one_run,
      "\nhence in total:", eval_steps_to_set*eval_steps_in_one_run, "steps for the whole evaluation" )

total_optimization_steps: 2109.375 
how many 'eval_steps' to set: 211 
how many steps in each evaluation stop: 130.20833333333334 
hence in total: 27473.958333333336 steps for the whole evaluation


In [10]:
from transformers import AdapterTrainer, AutoModelWithHeads #TrainingsArguments
from transformers.training_args import TrainingArguments
# https://docs.adapterhub.ml/training.html
# https://discuss.huggingface.co/t/keyerror-loss-while-training-qna/4111
# https://huggingface.co/docs/transformers/main_classes/trainer

model = AutoModelWithHeads.from_pretrained('roberta-trained-new-tokenizer_params_1')
adapter_name = "LR_3e-4_NEW_200k_new_tokenizer"
model.add_adapter(adapter_name)
model.add_classification_head(adapter_name, num_labels=2, id2label = id2label) #, multilabel=False)
model.train_adapter(adapter_name) # inicializace

training_args =  TrainingArguments(
    learning_rate=3e-4,
    num_train_epochs=number_of_epochs,
    report_to="wandb",
    output_dir = "adapter_dir_NEW",
    label_names = ["exon", "other"],
    eval_steps = eval_steps_to_set, 
    evaluation_strategy="steps",
    per_device_train_batch_size=train_batch_size,
    per_device_eval_batch_size=eval_batch_size,
)

from sklearn.metrics import accuracy_score
from transformers import EvalPrediction
def compute_acc(p: EvalPrediction):
    preds, labels = p
    acc = accuracy_score(labels, preds)
    return {"acc": acc}

model.metrics=['accuracy'] #optimizer=opt, loss=loss,

trainer = AdapterTrainer(
        model=model,
        args=training_args,
        train_dataset=data_set_train,
        eval_dataset=data_set_eval,
        compute_metrics = compute_acc,
        # tokenizer=tokenizer, # data uz jsou ztokenizovana, netreba
        # collator netreba
    )

Some weights of the model checkpoint at roberta-trained-new-tokenizer_params_1 were not used when initializing RobertaModelWithHeads: ['lm_head.dense.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.weight', 'lm_head.bias', 'lm_head.layer_norm.bias']
- This IS expected if you are initializing RobertaModelWithHeads from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModelWithHeads from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaModelWithHeads were not initialized from the model checkpoint at roberta-trained-new-tokenizer_params_1 and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream t

In [11]:
# for batch in trainer.get_train_dataloader():
#     break
# batch = {k: v.cuda() for k, v in batch.items()}
# outputs = trainer.model(**batch)
# batch, outputs

In [12]:
trainer.train()

***** Running training *****
  Num examples = 135000
  Num Epochs = 6
  Instantaneous batch size per device = 64
  Total train batch size (w. parallel, distributed & accumulation) = 192
  Gradient Accumulation steps = 1
  Total optimization steps = 4224
Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"
Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mliebelife[0m (use `wandb login --relogin` to force relogin)


RuntimeError: Caught RuntimeError in replica 0 on device 0.
Original Traceback (most recent call last):
  File "/home/lieberze/.conda/envs/dp/lib/python3.6/site-packages/torch/nn/parallel/parallel_apply.py", line 61, in _worker
    output = module(*input, **kwargs)
  File "/home/lieberze/.conda/envs/dp/lib/python3.6/site-packages/torch/nn/modules/module.py", line 1102, in _call_impl
    return forward_call(*input, **kwargs)
  File "/home/lieberze/.conda/envs/dp/lib/python3.6/site-packages/transformers/models/roberta/modeling_roberta.py", line 964, in forward
    adapter_names=adapter_names,
  File "/home/lieberze/.conda/envs/dp/lib/python3.6/site-packages/torch/nn/modules/module.py", line 1102, in _call_impl
    return forward_call(*input, **kwargs)
  File "/home/lieberze/.conda/envs/dp/lib/python3.6/site-packages/transformers/models/roberta/modeling_roberta.py", line 888, in forward
    **kwargs,
  File "/home/lieberze/.conda/envs/dp/lib/python3.6/site-packages/torch/nn/modules/module.py", line 1102, in _call_impl
    return forward_call(*input, **kwargs)
  File "/home/lieberze/.conda/envs/dp/lib/python3.6/site-packages/transformers/models/roberta/modeling_roberta.py", line 550, in forward
    **kwargs,
  File "/home/lieberze/.conda/envs/dp/lib/python3.6/site-packages/torch/nn/modules/module.py", line 1102, in _call_impl
    return forward_call(*input, **kwargs)
  File "/home/lieberze/.conda/envs/dp/lib/python3.6/site-packages/transformers/models/roberta/modeling_roberta.py", line 469, in forward
    self.feed_forward_chunk, self.chunk_size_feed_forward, self.seq_len_dim, attention_output, **kwargs
  File "/home/lieberze/.conda/envs/dp/lib/python3.6/site-packages/transformers/modeling_utils.py", line 2338, in apply_chunking_to_forward
    return forward_fn(*input_tensors, **kwargs)
  File "/home/lieberze/.conda/envs/dp/lib/python3.6/site-packages/transformers/models/roberta/modeling_roberta.py", line 480, in feed_forward_chunk
    intermediate_output = self.intermediate(attention_output)
  File "/home/lieberze/.conda/envs/dp/lib/python3.6/site-packages/torch/nn/modules/module.py", line 1102, in _call_impl
    return forward_call(*input, **kwargs)
  File "/home/lieberze/.conda/envs/dp/lib/python3.6/site-packages/transformers/models/roberta/modeling_roberta.py", line 377, in forward
    hidden_states = self.intermediate_act_fn(hidden_states)
  File "/home/lieberze/.conda/envs/dp/lib/python3.6/site-packages/torch/nn/functional.py", line 1556, in gelu
    return torch._C._nn.gelu(input)
RuntimeError: CUDA out of memory. Tried to allocate 96.00 MiB (GPU 0; 10.76 GiB total capacity; 1.92 GiB already allocated; 20.56 MiB free; 1.93 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF


In [None]:
trainer.save_model("adapter-sequence-types-NEW")

In [None]:
# model.save_adapter("./adapter-sequence-types/", adapter_name) # jen adapter

In [None]:
# pak mu dam holdout a udelam confusion matrix, model.predict. vyvazeny data
# list s predikcema, true values a pak conf_matice

In [None]:
model = AutoModelWithHeads.from_pretrained("roberta-trained-new-tokenizer_params_1")
tokenizer = tokenizer

adapter1 = model.load_adapter("adapter-sequence-types-NEW/LR_3e-4_NEW_200k_new_tokenizer")

# model.active_adapters = ac.Parallel(adapter1)
model.active_adapters = adapter1

# input_ids = tokenizer(a, return_tensors="pt")
# print("STS-B adapter output:", output1)
# print("MRPC adapter output:", bool(torch.argmax(output1[0]).item()))

In [None]:
# wandb.finish()

In [None]:
# import transformers.adapters.composition as ac
# model = AutoModelWithHeads.from_pretrained('roberta-trained')
# model.add_adapter("adapter-sequence-types")
# model.active_adapters = ac.Stack("adapter-sequence-types")

In [None]:
# model.to("cuda:0")
# trainer.evaluate()

training_args.device.indexhttps://github.com/NielsRogge/Transformers-Tutorials/blob/master/VisionTransformer/Fine_tuning_the_Vision_Transformer_on_CIFAR_10_with_the_%F0%9F%A4%97_Trainer.ipynb

hlavne konec - evaluace a confusion matrix

In [None]:
from transformers import TextClassificationPipeline
classifier = TextClassificationPipeline(model=model, tokenizer=tokenizer, device=training_args.device.index)
# classifier(a)

In [None]:
# model.to(torch.device("cuda:0"))

In [None]:
sequences = list(df_holdout.sequence)
true_labels = list(df_holdout.type)
pred_labels = classifier(sequences)

In [None]:
# pred_labels = []
# for sequence in sequences:
#     pred_label = classifier(sequence)
#     pred_labels.append(pred_label)

In [None]:
pred_lab = [i["label"] for i in pred_labels]
pred_score = [i["score"] for i in pred_labels]

In [None]:
pred_labels[:50]

tady je videt, ze jakmile je to exon, tak si je hodne jistej => nejspis se uci podle delky :/

In [None]:
import matplotlib.pyplot as plt
from sklearn import metrics
import seaborn as sns

In [None]:
id2label

In [None]:
cm = metrics.confusion_matrix(true_labels, pred_lab, normalize='true')

# confusion matrix
plt.figure(figsize=(8, 6))
df_cm = pd.DataFrame(cm)
df_cm.columns = ['exon', 'other']
df_cm.index = ['exon', 'other']
plt.title('Confusion Matrix, normalized', size=16)
sns.heatmap(df_cm, annot=True, cmap='Blues')

plt.savefig('200k-NEW-all_yellow.png')
plt.show()

In [None]:
report = metrics.classification_report(true_labels, pred_lab, digits=2, output_dict=True, zero_division=0)

In [None]:
df_report = pd.DataFrame(report).transpose()
df_report
# SHOW THE DIFFERENCES
# exon, intron, intergenic = df_report["f1-score"].exon, df_report["f1-score"].other
# exon_vs_rest_ratio = exon/(intron + intergenic)
# exon_vs_rest_f1_score = exon, intron + intergenic, exon_vs_rest_ratio
print(df_report)

In [None]:
# exon_vs_rest_f1_score # (exon, intron+intergenic, their ratio)
# print("ratio of exon vs rest success:", exon_vs_rest_f1_score[2]) # zamerujeme se na uspech predikce exonu

u nestejne dlouhych - horsi nez ML modely = 0.77 (mely 0.81 az 0.83), ale ted, po uprave delek je to lepsi 0.86 (zkouseno na 20k vzorku)))))

0.72 pro natrenovane na kratkych, ale adapter na 100k dlouhych
0.77 pro natrenovane na kratkych a adapter taky na nich

In [None]:
import wandb
wandb.login()
wandb.finish()

In [None]:
# outputs = trainer.predict(test_ds) # test_dataset

In [None]:
# from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

# y_true = outputs.label_ids
# y_pred = outputs.predictions.argmax(1)

# labels = train_ds.features['label'].names
# cm = confusion_matrix(y_true, y_pred)
# disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=labels)
# disp.plot(xticks_rotation=45)

In [None]:
# the correct way to predict with a trained model is prediction = model(tokenized_sequence_to_classify)

In [None]:
# output1 = model(**input_ids)
# torch.argmax(output1[0])

In [None]:
# model.load_adapter('roberta-trained', "./adapter-sequence-types/")

In [None]:
# import transformers.adapters.composition as ac

In [None]:
# model.load_adapter("adapter-sequence-types")
# model.set_active_adapters("adapter-sequence-types")
# model.predict() # pozor! asi potreba male batche, test set (holdout)
# asi bude potreba pushnout data na gpu rucne:
# model.to("cuda")

In [None]:
# #initializing the model
# model = MLP().to(device)