In [1]:
import os

os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"] = '5'

In [2]:
%load_ext autoreload
%autoreload 2

In [3]:
import sys
sys.path.insert(1, '../../src')

import torch
import evaluate
import mlflow
import numpy as np
import pandas as pd
from torch import nn
from datasets import load_dataset, Dataset
from transformers import (
    AutoTokenizer,
    AutoModelForTokenClassification,
    TrainingArguments,
    Trainer,
    DataCollatorForTokenClassification,
    DataCollatorForLanguageModeling,
    AutoModelForMaskedLM
)

from TextProcessor import TextProcessor
from DatasetProcessor import DatasetProcessor


In [4]:
mlflow.set_tracking_uri("http://mlflow:5000")
tracking_uri = mlflow.get_tracking_uri()
print("Current tracking uri: {}".format(tracking_uri))

os.environ["MLFLOW_EXPERIMENT_NAME"] = "NER"
# os.environ["HF_MLFLOW_LOG_ARTIFACTS"] = "True"

# os.environ["MLFLOW_FLATTEN_PARAMS"] = "True"

# os.environ["MLFLOW_TRACKING_URI"] = tracking_uri

Current tracking uri: http://mlflow:5000


## Train MLM

In [5]:
DATA_PATH = '../../dataset/RuMedPrimeData.tsv'

df = pd.read_csv(DATA_PATH, sep='\t')
df = df[['symptoms', 'anamnesis']]

In [6]:
texts = []
texts.extend(df['symptoms'].tolist())
texts.extend(df['anamnesis'].tolist())
len(texts)

15250

In [7]:
dataset = Dataset.from_dict({'text': texts})
dataset = dataset.train_test_split(test_size=0.05)
dataset

DatasetDict({
    train: Dataset({
        features: ['text'],
        num_rows: 14487
    })
    test: Dataset({
        features: ['text'],
        num_rows: 763
    })
})

In [8]:
model_name = "alexyalunin/RuBioRoBERTa"
tokenizer = AutoTokenizer.from_pretrained(model_name, add_prefix_space=True)

In [9]:
def tokenize_function(examples):
    result = tokenizer(examples["text"],
                    #    truncation=True, max_length=512, padding=True
                       )
    if tokenizer.is_fast:
        result["word_ids"] = [result.word_ids(i) for i in range(len(result["input_ids"]))]
    return result


# Use batched=True to activate fast multithreading!
tokenized_datasets = dataset.map(
    tokenize_function, batched=True, remove_columns=["text"]
)
tokenized_datasets

Map:   0%|          | 0/14487 [00:00<?, ? examples/s]

Map:   0%|          | 0/763 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'word_ids'],
        num_rows: 14487
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'word_ids'],
        num_rows: 763
    })
})

In [10]:
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm_probability=0.15)

In [11]:
chunk_size = 512

def group_texts(examples):
    # Concatenate all texts
    concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()}
    # Compute length of concatenated texts
    total_length = len(concatenated_examples[list(examples.keys())[0]])
    # We drop the last chunk if it's smaller than chunk_size
    total_length = (total_length // chunk_size) * chunk_size
    # Split by chunks of max_len
    result = {
        k: [t[i : i + chunk_size] for i in range(0, total_length, chunk_size)]
        for k, t in concatenated_examples.items()
    }
    # Create a new labels column
    result["labels"] = result["input_ids"].copy()
    return result

In [12]:
lm_datasets = tokenized_datasets.map(group_texts, batched=True)
lm_datasets

Map:   0%|          | 0/14487 [00:00<?, ? examples/s]

Map:   0%|          | 0/763 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'word_ids', 'labels'],
        num_rows: 2453
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'word_ids', 'labels'],
        num_rows: 117
    })
})

In [13]:
samples = [lm_datasets["train"][i] for i in range(2)]
for sample in samples:
    _ = sample.pop("word_ids")

for chunk in data_collator(samples)["input_ids"]:
    print(f"\n'>>> {tokenizer.decode(chunk)}'")

You're using a RobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.



'>>> <s> на перебои в работе сердца, од<mask><mask>анного характера при<mask>, общую слабость.</s><s> Жал<mask> на ноющие боли внизу живота в течение 5-ти дней. УЗИ ОМТ от<mask>ДАТА<mask>: Множественная миома матки малых размеров, субму<mask>озынй рост<mask><mask> узлов. Наличие жидкостных образований в правом яичнике<mask>19.7 и 23<mask>9 мм).</s><s> Считает себя больной с *ДАТА<mask> (55 лет),<mask> после перенес<mask> стресса появилось дрожание правой руке. Изменился почерк.</s><s> больна в течении года.<mask>рт: единичные очаги дисциркуляции киста эпифиза.</s><s> Симптомы<mask><mask>ивируют 7 дней<mask> без<mask> с чем-либо.Ела суши. Не обследована</s><s> больна винк<mask>го времени( привела родственница) стала странно себя вести. мрт: нейродегенерация церебр<mask> имкроангиопат<mask><mask> участки г растворилсяоз<mask> трансформации глубинных отделов правой гемисферы конв<mask><mask>альных дополнение теменных долей с обеих сторон в исходе перенесенных онм<mask>( сосудистых повреж

In [14]:
import collections
import numpy as np

wwm_probability = 0.2


def whole_word_masking_data_collator(features):
    for feature in features:
        word_ids = feature.pop("word_ids")

        # Create a map between words and corresponding token indices
        mapping = collections.defaultdict(list)
        current_word_index = -1
        current_word = None
        for idx, word_id in enumerate(word_ids):
            if word_id is not None:
                if word_id != current_word:
                    current_word = word_id
                    current_word_index += 1
                mapping[current_word_index].append(idx)

        # Randomly mask words
        mask = np.random.binomial(1, wwm_probability, (len(mapping),))
        input_ids = feature["input_ids"]
        labels = feature["labels"]
        new_labels = [-100] * len(labels)
        for word_id in np.where(mask)[0]:
            word_id = word_id.item()
            for idx in mapping[word_id]:
                new_labels[idx] = labels[idx]
                input_ids[idx] = tokenizer.mask_token_id
        feature["labels"] = new_labels

    return data_collator(features)

In [15]:
samples = [lm_datasets["train"][i] for i in range(2)]
batch = whole_word_masking_data_collator(samples)

for chunk in batch["input_ids"]:
    print(f"\n'>>> {tokenizer.decode(chunk)}'")


'>>> <s> на перебои<mask><mask><mask>, одышку<mask><mask> характера<mask> ФН, общую слабость<mask></s><s><mask><mask> на ноющие боли внизу живота в течение<mask>-ти<mask>. УЗИ ОМТ от *ДАТА*: Множественная<mask>ома<mask>ки малых размеров,<mask><mask><mask><mask><mask><mask> рост<mask> из узлов. Наличие жидкостных образований<mask> правом яич<mask> (19.<mask> и 23.9<mask>).</s><s> Считает<mask> больной<mask><mask>ДАТА* (55 лет), когда после<mask><mask><mask><mask> дрожание<mask> руке<mask><mask>ился<mask>.</s><s><mask> в<mask><mask> = мрт: единичные<mask>аги дис<mask>куляции киста<mask>ифиза.</s><s> Симптомы рецидивируют<mask><mask>, Реж связи<mask> чем<mask>либо.Ела суши. Не обследована</s><s> больна в<mask><mask><mask> времени<mask> привела<mask><mask>)<mask> странно себя вести<mask> мрт<mask><mask>родег<mask>ция цер<mask>ральная<mask><mask><mask><mask><mask><mask><mask>. участки<mask><mask><mask><mask> трансформации глубинных отделов правой гемисферы конвекситальных отделов<mask><mas

In [16]:
class LoRALayer(nn.Module):
    """Wraps a linear layer with LoRA-like adapter. Wraps an existing OPT linear layer"""
    def __init__(self, module: nn.Linear, rank: int):
        super().__init__()
        self.module = module
        self.adapter = nn.Sequential(
            nn.Linear(module.in_features, rank, bias=False),
            nn.Linear(rank, module.out_features, bias=False)
        )
        nn.init.kaiming_uniform_(self.adapter[0].weight, a=5 ** 0.5)
        nn.init.zeros_(self.adapter[1].weight)

        self.adapter.to(module.weight.device)

    def forward(self, input):
        # Apply self.module and LoRA adapter, return the sum (base module outputs + adapter outputs)
        return self.module(input) + self.adapter(input)
    

model = AutoModelForMaskedLM.from_pretrained(
    model_name,
    device_map={'': torch.cuda.current_device()},
    cache_dir='.cache',
    )

for param in model.roberta.parameters():
    param.requires_grad = False

lora_rank = 128
for name, module in model.roberta.named_modules():
    if 'RobertaSelfAttention' in repr(type(module)):
        module.query = LoRALayer(module.query, rank=lora_rank)
        module.key = LoRALayer(module.key, rank=lora_rank)
        module.value = LoRALayer(module.value, rank=lora_rank)

Some weights of RobertaForMaskedLM were not initialized from the model checkpoint at alexyalunin/RuBioRoBERTa and are newly initialized: ['lm_head.decoder.bias', 'lm_head.decoder.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [17]:
cur_run_id = 2
num_train_epochs = 30
batch_size = 16

name = "RuBioRoBERTa-LoRA-MLM"
run_name = f'{name}-{cur_run_id:02}'
output_dir = f'./logs/{run_name}'


training_args = TrainingArguments(
    output_dir=output_dir,
    logging_dir='./logs',
    evaluation_strategy="epoch",
    learning_rate=5e-4,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=num_train_epochs,
    weight_decay=0.01,
    logging_strategy='steps',
    logging_steps=1,
    save_strategy='epoch',
    load_best_model_at_end=True,
    report_to="mlflow",
    run_name=run_name
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=lm_datasets["train"],
    eval_dataset=lm_datasets["test"],
    data_collator=data_collator,
)

trainer.train()
mlflow.end_run()



Epoch,Training Loss,Validation Loss
1,1.2777,1.243954
2,0.9152,1.158979
3,1.0068,1.041717
4,1.0498,1.031428
5,1.0381,1.03832
6,1.0568,1.014725
7,1.1477,0.984178
8,1.0604,0.989031
9,1.1368,0.989673
10,1.072,0.960911


In [19]:
path_to_model = f'../../models/{run_name}.pt'
torch.save(model, path_to_model)