# Fine-tune masked ruBERT-tiny2

Загружаем библиотеки

In [1]:
import pandas as pd
import numpy as np
from transformers import AutoTokenizer, DataCollatorWithPadding, TrainingArguments, Trainer, AutoModel
from datasets import Dataset
import evaluate

Загружаем отобранные предложения. Предложения отбираются в скрипте precorrect.py

In [22]:
with open('CorrectDN.json', encoding="utf-8") as f:
    sentences = pd.read_json(f, orient='records')
sentences.tail()

Unnamed: 0,text,label
1042,"В Саратове при пожаре погиб мужчина, спасшийся...",1
1043,Лаврентьев: США приказали оппозиции в Сирии на...,1
1044,Лаврентьев рассказал о контактах спецслужб Рос...,1
1045,В Ростове-на-Дону арестовали двух украинских а...,1
1046,Посол США обвинил Орбана в пророссийской внешн...,1


Собираем dataset

In [23]:
raw_ds = Dataset.from_pandas(sentences, preserve_index=False)
raw_ds = raw_ds.class_encode_column('label')
raw_ds.features

Stringifying the column:   0%|          | 0/1047 [00:00<?, ? examples/s]

Casting to class labels:   0%|          | 0/1047 [00:00<?, ? examples/s]

{'text': Value(dtype='string', id=None),
 'label': ClassLabel(names=['1'], id=None)}

Делим dataset на train и test (80:20)

In [24]:
raw_ds = raw_ds.train_test_split(test_size=0.2, shuffle=True)

In [25]:
from transformers import AutoModelForMaskedLM, AutoTokenizer

model_checkpoint = "cointegrated/rubert-tiny2"
# model_checkpoint = "DeepPavlov/rubert-base-cased"
model = AutoModelForMaskedLM.from_pretrained(model_checkpoint)
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

In [26]:
text = "После начала российской военной [MASK] на Украине западные страны усилили санкционное давление на Москву"

In [47]:
import torch

inputs = tokenizer(text, return_tensors="pt")
token_logits = model(**inputs).logits
#print (inputs)
# Find the location of [MASK] and extract its logits
mask_token_index = torch.where(inputs["input_ids"] == tokenizer.mask_token_id)[1]
# print (mask_token_index)
mask_token_logits = token_logits[0, mask_token_index, :]

# Pick the [MASK] candidates with the highest logits
top_5_tokens = torch.topk(mask_token_logits, 5, dim=1).indices[0].tolist()

word = "спецоперации"
iii = tokenizer(word, return_tensors="pt")
moscow_token = iii["input_ids"][0][1].item()
print(tokenizer.mask_token_id)
print (moscow_token)

probs = torch.nn.functional.softmax(mask_token_logits, dim=1)
#print (probs[0][29571])
#print (probs[0][19534])
print (probs[0][moscow_token])

top_5_prob = torch.topk(probs, 5, dim=1)
val = top_5_prob.values[0].tolist()
ind = top_5_prob.indices[0].tolist()
for i in range(len(val)):
    print (f"'>>> {tokenizer.decode([ind[i]])} ({ind[i]})  >>>>: {val[i]}")
print ("-----------------")



# for i in range(len(val_all)):
#     if ind[i] == 17627:
#         print (f"'>>> {tokenizer.decode([ind_all[i]])}  >>>>: {val_all[i]}")

#for token in top_5_tokens:
#    print(f"'>>> {text.replace(tokenizer.mask_token, tokenizer.decode([token]))}' {token}")

4
61407
tensor(7.1256e-07, grad_fn=<SelectBackward0>)
'>>> службы (10969)  >>>>: 0.10826621949672699
'>>> операции (11394)  >>>>: 0.07997772842645645
'>>> политики (17480)  >>>>: 0.0775214359164238
'>>> силы (11885)  >>>>: 0.06181337684392929
'>>> кампании (22938)  >>>>: 0.046161700040102005
-----------------


Tokenized dataset

In [28]:
def tokenize_function(examples):
    result = tokenizer(examples["text"])
    if tokenizer.is_fast:
        result["word_ids"] = [result.word_ids(i) for i in range(len(result["input_ids"]))]
    return result

# Use batched=True to activate fast multithreading!
tokenized_datasets = raw_ds.map(
    tokenize_function, batched=True, remove_columns=["text", "label"]
)
tokenized_datasets

Map:   0%|          | 0/837 [00:00<?, ? examples/s]

Map:   0%|          | 0/210 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'word_ids'],
        num_rows: 837
    })
    test: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'word_ids'],
        num_rows: 210
    })
})

Chank size (Parameter!!!)

In [29]:
chunk_size = 128

In [30]:
def group_texts(examples):
    # Concatenate all texts
    concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()}
    # Compute length of concatenated texts
    total_length = len(concatenated_examples[list(examples.keys())[0]])
    # We drop the last chunk if it's smaller than chunk_size
    total_length = (total_length // chunk_size) * chunk_size
    # Split by chunks of max_len
    result = {
        k: [t[i : i + chunk_size] for i in range(0, total_length, chunk_size)]
        for k, t in concatenated_examples.items()
    }
    # Create a new labels column
    result["labels"] = result["input_ids"].copy()
    return result

In [48]:
lm_datasets = tokenized_datasets.map(group_texts, batched=True)

print (lm_datasets["test"])
print (lm_datasets["test"]["input_ids"])
print (lm_datasets["test"]["token_type_ids"])
print (lm_datasets["test"]["attention_mask"])
print (lm_datasets["test"]["word_ids"])
print (lm_datasets["test"]["labels"])

Map:   0%|          | 0/837 [00:00<?, ? examples/s]

Map:   0%|          | 0/210 [00:00<?, ? examples/s]

Dataset({
    features: ['input_ids', 'token_type_ids', 'attention_mask', 'word_ids', 'labels'],
    num_rows: 371
})
[[2, 1041, 62861, 626, 75498, 46795, 40258, 6585, 310, 29539, 35266, 17, 64055, 9722, 49103, 36698, 41254, 16, 665, 28045, 887, 1, 31721, 31016, 18, 30650, 20252, 6680, 30813, 314, 4859, 71360, 3933, 548, 16813, 62861, 603, 75498, 46795, 40258, 6585, 6, 299, 59522, 2462, 6, 16, 29887, 769, 37251, 16, 29819, 31721, 31016, 314, 47470, 21179, 18, 7936, 33792, 16, 1046, 22333, 329, 66121, 16, 52230, 13787, 31772, 733, 77776, 37218, 16, 59818, 47089, 29292, 18, 30451, 33334, 47717, 27805, 30499, 6258, 18, 1041, 13164, 16, 52770, 314, 19298, 34047, 16, 31116, 68296, 34809, 16, 314, 4016, 778, 17, 650, 83660, 39393, 2749, 32035, 26601, 1464, 49791, 18, 3, 2, 65469, 721, 39116, 19209, 78030, 603, 32067, 314, 35638, 68663, 700, 329, 69025, 1854, 35746, 18, 42834], [1302, 47626, 320, 17597, 11866, 65469, 721, 39116, 19209, 78030, 603, 32067, 314, 35638, 56433, 764, 17, 32477, 314

Процент маскирования = 15% (Parameter!!!)

In [49]:
from transformers import DataCollatorForLanguageModeling

data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm_probability=0.15)

whole word masking, процент маскирования = 15% (Parameter!!!)

In [50]:
import collections
import numpy as np

from transformers import default_data_collator

wwm_probability = 0.15


def whole_word_masking_data_collator(features):
    for feature in features:
        word_ids = feature.pop("word_ids")

        # Create a map between words and corresponding token indices
        mapping = collections.defaultdict(list)
        current_word_index = -1
        current_word = None
        for idx, word_id in enumerate(word_ids):
            if word_id is not None:
                if word_id != current_word:
                    current_word = word_id
                    current_word_index += 1
                mapping[current_word_index].append(idx)

        # Randomly mask words
        mask = np.random.binomial(1, wwm_probability, (len(mapping),))
        input_ids = feature["input_ids"]
        labels = feature["labels"]
        new_labels = [-100] * len(labels)
        for word_id in np.where(mask)[0]:
            word_id = word_id.item()
            for idx in mapping[word_id]:
                new_labels[idx] = labels[idx]
                input_ids[idx] = tokenizer.mask_token_id
        feature["labels"] = new_labels

    return default_data_collator(features)

Маскирование всего набора за раз

In [51]:
def insert_random_mask(batch):
    features = [dict(zip(batch, t)) for t in zip(*batch.values())]
    masked_inputs = data_collator(features)
    # Create a new "masked" column for each column in the dataset
    return {"masked_" + k: v.numpy() for k, v in masked_inputs.items()}

In [52]:
lm_datasets = lm_datasets.remove_columns(["word_ids"])
eval_dataset = lm_datasets["test"].map(
    insert_random_mask,
    batched=True,
    remove_columns=lm_datasets["test"].column_names,
)
eval_dataset = eval_dataset.rename_columns(
    {
        "masked_input_ids": "input_ids",
        "masked_attention_mask": "attention_mask",
        "masked_labels": "labels",
        "masked_token_type_ids": "token_type_ids"
    }
)
eval_dataset
print (eval_dataset["input_ids"])
print (eval_dataset["attention_mask"])
print (eval_dataset["labels"])
print (eval_dataset["token_type_ids"])

Map:   0%|          | 0/371 [00:00<?, ? examples/s]

[[2, 1041, 62861, 626, 75498, 46795, 40258, 6585, 310, 4, 4, 17, 64055, 4, 49103, 36698, 41254, 16, 665, 28045, 4, 1, 31721, 31016, 4, 30650, 20252, 6680, 30813, 314, 4859, 71360, 3933, 4, 16813, 62861, 603, 75498, 4, 40258, 6585, 6, 299, 59522, 2462, 6, 16, 29887, 769, 37251, 16, 29819, 31721, 31016, 314, 47470, 21179, 4, 7936, 33792, 4, 1046, 22333, 329, 4, 16, 4, 13787, 31772, 733, 77776, 37218, 16, 59818, 47089, 29292, 18, 30451, 33334, 47717, 27805, 30499, 6258, 18, 1041, 13164, 16, 52770, 314, 4, 34047, 16, 31116, 68296, 34809, 4, 314, 4016, 4, 17, 5080, 83660, 4, 2749, 32035, 26601, 1464, 49791, 18, 3, 2, 4, 4, 39116, 19209, 78030, 4, 32067, 314, 35638, 4, 700, 4, 69025, 1854, 35746, 18, 42834], [4, 4, 320, 17597, 11866, 65469, 721, 39116, 4, 78030, 603, 32067, 314, 35638, 56433, 764, 17, 32477, 314, 4, 1251, 21592, 16, 3771, 39334, 47198, 1142, 69025, 811, 16, 314, 2711, 26601, 52444, 1308, 31184, 16, 22502, 12376, 3943, 18, 1499, 4796, 18414, 16, 1079, 17, 26862, 37788, 59228,

inputs_ids содержит маскированные данные (токен маски = 4), labels содержит токены под маской

Batch_size (Parameter!!!)

In [16]:
from torch.utils.data import DataLoader
from transformers import default_data_collator

batch_size = 16
train_dataloader = DataLoader(
    lm_datasets["train"],
    shuffle=True,
    batch_size=batch_size,
    collate_fn=data_collator,
    # collate_fn=whole_word_masking_data_collator
)
eval_dataloader = DataLoader(
    eval_dataset, batch_size=batch_size, collate_fn=default_data_collator
)

Optimizer (Parameter!!!)

In [17]:
from torch.optim import AdamW

optimizer = AdamW(model.parameters(), lr=5e-5)

In [18]:
from accelerate import Accelerator

accelerator = Accelerator()
model, optimizer, train_dataloader, eval_dataloader = accelerator.prepare(
    model, optimizer, train_dataloader, eval_dataloader
)

In [19]:
from transformers import get_scheduler

num_train_epochs = 3
num_update_steps_per_epoch = len(train_dataloader)
num_training_steps = num_train_epochs * num_update_steps_per_epoch

lr_scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps,
)

In [20]:
output_dir = 'fine-train'

In [None]:
from tqdm.auto import tqdm
import torch
import math

progress_bar = tqdm(range(num_training_steps))

for epoch in range(num_train_epochs):
    # Training
    model.train()
    for batch in train_dataloader:
        outputs = model(**batch)
        loss = outputs.loss
        accelerator.backward(loss)

        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        progress_bar.update(1)

    # Evaluation
    model.eval()
    losses = []
    for step, batch in enumerate(eval_dataloader):
        with torch.no_grad():
            outputs = model(**batch)

        loss = outputs.loss
        losses.append(accelerator.gather(loss.repeat(batch_size)))

    losses = torch.cat(losses)
    losses = losses[: len(eval_dataset)]
    try:
        perplexity = math.exp(torch.mean(losses))
    except OverflowError:
        perplexity = float("inf")

    print(f">>> Epoch {epoch}: Perplexity: {perplexity}")

    # Save and upload
    accelerator.wait_for_everyone()
    unwrapped_model = accelerator.unwrap_model(model)
    unwrapped_model.save_pretrained(output_dir, save_function=accelerator.save)
    if accelerator.is_main_process:
        tokenizer.save_pretrained(output_dir)