This notebook contains the best results obtained with XLM-RoBERTa and the demonstration of its work.

In [58]:
pip install datasets wget pymorphy2 accelerate transformers tqdm deep_translator pdfminer-six

Collecting pdfminer-six
  Downloading pdfminer.six-20231228-py3-none-any.whl (5.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.6/5.6 MB[0m [31m17.1 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: pdfminer-six
Successfully installed pdfminer-six-20231228


In [59]:
# imports section

import pandas as pd
import numpy as np
import string
import torch
import random

import sklearn
import wget

import accelerate
from tqdm import tqdm
from torch.utils.data import TensorDataset

from transformers import RobertaConfig, RobertaModel, AutoModelForQuestionAnswering, AutoTokenizer, pipeline, XLMRobertaForQuestionAnswering, TrainingArguments, Trainer
from datasets import load_dataset, Dataset

from sklearn.model_selection import train_test_split
from deep_translator import GoogleTranslator

from pdfminer.high_level import extract_text

In [3]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device

device(type='cuda')

In [5]:
# fixing the random state

seed_val = 0

random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)

# 1. Training an LLM for a context-based question answering task

## 1.1. Initial fine-tuning on the dataset

In [6]:
# the model is multilingual roberta: https://huggingface.co/docs/transformers/model_doc/xlm-roberta
# the logic is similar to https://amitnikhade.medium.com/question-answering-in-association-with-roberta-a11518e70507
# this tutorial is also used here: https://medium.com/@anyuanay/fine-tuning-the-pre-trained-bert-model-in-hugging-face-for-question-answering-8edc76890ce0

model_name = "deepset/roberta-base-squad2" # https://huggingface.co/deepset/xlm-roberta-base-squad2

In [7]:
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = XLMRobertaForQuestionAnswering.from_pretrained(model_name)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/79.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/772 [00:00<?, ?B/s]

You are using a model of type roberta to instantiate a model of type xlm-roberta. This is not supported for all configurations of models and can yield errors.


model.safetensors:   0%|          | 0.00/496M [00:00<?, ?B/s]

In [8]:
xquad_dataset = load_dataset('xquad', 'xquad.ru') # russian subset of xquad

Downloading readme:   0%|          | 0.00/15.6k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/322k [00:00<?, ?B/s]

Generating validation split:   0%|          | 0/1190 [00:00<?, ? examples/s]

In [9]:
xquad_dataset

DatasetDict({
    validation: Dataset({
        features: ['id', 'context', 'question', 'answers'],
        num_rows: 1190
    })
})

In [10]:
data = pd.DataFrame(xquad_dataset['validation'])
data.head(3)

Unnamed: 0,id,context,question,answers
0,56beb4343aeaaa14008c925b,"﻿Защита Пэнтерс уступила всего 308 очков, заня...",Сколько очков уступила защита Пэнтерс?,"{'text': ['308'], 'answer_start': [31]}"
1,56beb4343aeaaa14008c925c,"﻿Защита Пэнтерс уступила всего 308 очков, заня...",Сколько мешков за карьеру было у Джареда Аллена?,"{'text': ['136'], 'answer_start': [495]}"
2,56beb4343aeaaa14008c925d,"﻿Защита Пэнтерс уступила всего 308 очков, заня...",Сколько блокировок записал на свой счет Люк Ки...,"{'text': ['118'], 'answer_start': [826]}"


In [11]:
data.shape

(1190, 4)

In [12]:
batch_size = 32 # 32 or 64 is a recommended size
max_length = 512 # max length of the model input
stride = 256 # 1/4 of max model input length is recommended for starters; needed to keep the information that doesn't fit in the model

In [13]:
def prepare_train_features(examples):
    tokenized_examples = tokenizer(
        examples["question"],
        examples["context"],
        truncation="only_second",  # truncate context, not the question, bc all questions fit into the max length
        max_length=max_length,
        stride=stride,
        return_overflowing_tokens=True, # make sure no data is lost
        return_offsets_mapping=True, # start and end positions
        padding="max_length",
    )
    sample_mapping = tokenized_examples.pop("overflow_to_sample_mapping")
    offset_mapping = tokenized_examples.pop("offset_mapping")

    tokenized_examples["start_positions"] = []
    tokenized_examples["end_positions"] = []

    for i, offsets in enumerate(offset_mapping):
        input_ids = tokenized_examples["input_ids"][i]
        cls_index = input_ids.index(tokenizer.cls_token_id)
        sequence_ids = tokenized_examples.sequence_ids(i)
        sample_index = sample_mapping[i]
        answers = examples["answers"][sample_index]
        if len(answers["answer_start"]) == 0:
            tokenized_examples["start_positions"].append(cls_index)
            tokenized_examples["end_positions"].append(cls_index)
        else:
            start_char = answers["answer_start"][0]
            end_char = start_char + len(answers["text"][0])
            token_start_index = 0
            while sequence_ids[token_start_index] != 1:
                token_start_index += 1
            token_end_index = len(input_ids) - 1
            while sequence_ids[token_end_index] != 1:
                token_end_index -= 1
            if not (offsets[token_start_index][0] <= start_char and offsets[token_end_index][1] >= end_char):
                tokenized_examples["start_positions"].append(cls_index)
                tokenized_examples["end_positions"].append(cls_index)
            else:
                while token_start_index < len(offsets) and offsets[token_start_index][0] <= start_char:
                    token_start_index += 1
                tokenized_examples["start_positions"].append(token_start_index - 1)
                while offsets[token_end_index][1] >= end_char:
                    token_end_index -= 1
                tokenized_examples["end_positions"].append(token_end_index + 1)
    return tokenized_examples

In [14]:
# now splitting the data to train, validation and test sets
# the dataset is small, so I will have to work with small parts of it

xquad_dataset = xquad_dataset['validation'].train_test_split(test_size=0.2) # changed from 0.3
xquad_dataset

DatasetDict({
    train: Dataset({
        features: ['id', 'context', 'question', 'answers'],
        num_rows: 952
    })
    test: Dataset({
        features: ['id', 'context', 'question', 'answers'],
        num_rows: 238
    })
})

In [15]:
train_set = xquad_dataset['train']
train_set

Dataset({
    features: ['id', 'context', 'question', 'answers'],
    num_rows: 952
})

In [16]:
validation_set = xquad_dataset['test']
validation_set

Dataset({
    features: ['id', 'context', 'question', 'answers'],
    num_rows: 238
})

In [17]:
validation_split_set = validation_set.train_test_split(test_size=0.5) # val and test
validation_split_set

DatasetDict({
    train: Dataset({
        features: ['id', 'context', 'question', 'answers'],
        num_rows: 119
    })
    test: Dataset({
        features: ['id', 'context', 'question', 'answers'],
        num_rows: 119
    })
})

In [18]:
val_set = validation_split_set['train']
test_set = validation_split_set['test']

In [19]:
tokenized_train = train_set.map(prepare_train_features,
                                batched=True,
                                remove_columns=train_set.column_names)

Map:   0%|          | 0/952 [00:00<?, ? examples/s]

In [20]:
tokenized_train

Dataset({
    features: ['input_ids', 'attention_mask', 'start_positions', 'end_positions'],
    num_rows: 4183
})

In [21]:
tokenized_val = val_set.map(prepare_train_features,
                                batched=True,
                                remove_columns=val_set.column_names)

Map:   0%|          | 0/119 [00:00<?, ? examples/s]

In [23]:
# training arguments are simpler here, it gave better result
args = TrainingArguments(
    output_dir="finetune-xlm-roberta-base-xquad",
    evaluation_strategy = "epoch",
    num_train_epochs=3, # 3 is the default
    do_train=True, # default is False
    do_eval=True # same here
)

In [24]:
# Trainer (https://huggingface.co/docs/transformers/main_classes/trainer) does all the work
# here default settings are used

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_val,
    tokenizer=tokenizer
)

In [25]:
model.to(device)

XLMRobertaForQuestionAnswering(
  (roberta): XLMRobertaModel(
    (embeddings): XLMRobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): XLMRobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x XLMRobertaLayer(
          (attention): XLMRobertaAttention(
            (self): XLMRobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): XLMRobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=

In [26]:
trainer.train() # takes about 20 minutes on T4

Epoch,Training Loss,Validation Loss
1,1.9306,1.704374
2,1.4013,1.653747
3,1.0172,1.769582


TrainOutput(global_step=1569, training_loss=1.4332521340131303, metrics={'train_runtime': 1366.4597, 'train_samples_per_second': 9.184, 'train_steps_per_second': 1.148, 'total_flos': 3279013000280064.0, 'train_loss': 1.4332521340131303, 'epoch': 3.0})

In [27]:
# evaluation

def evaluate_instance(instance, device):
    context = instance['context']
    question = instance['question']
    given_answer = instance['answers']['text'][0]
    inputs = tokenizer(question, context, return_tensors='pt', max_length=512, truncation=True)
    inputs = {k: v.to(device) for k, v in inputs.items()}
    with torch.no_grad():
        output = model(**inputs)
    start_idx = torch.argmax(output.start_logits)
    end_idx = torch.argmax(output.end_logits)
    predicted_answer = tokenizer.convert_tokens_to_string(tokenizer.convert_ids_to_tokens(inputs['input_ids'][0][start_idx:end_idx + 1]))
    return predicted_answer.lower() == given_answer.lower()

In [28]:
correct_count = 0
total_count = test_set.shape[0] # number of rows of a test set

for i in range(total_count):
    correct_count += evaluate_instance(test_set[i], device)

In [29]:
accuracy = correct_count / total_count
print(f'Accuracy: {accuracy * 100:.2f}%') # even though the losses are low, accuracy is poorer than of RuBERT

Accuracy: 0.00%


## 1.2. Augmentation

In [40]:
def en_ru_translator(text):
    translator = GoogleTranslator(source='en', target='ru')
    return translator.translate(text)

In [35]:
# english dataset

en_xquad_dataset = load_dataset('xquad', 'xquad.en')

Downloading data:   0%|          | 0.00/212k [00:00<?, ?B/s]

Generating validation split:   0%|          | 0/1190 [00:00<?, ? examples/s]

In [36]:
data_en = pd.DataFrame(en_xquad_dataset['validation'])

In [37]:
# reinitializing the russian dataset, because it was changed by splitting
# and here we need it unchanged

xquad_dataset = load_dataset('xquad', 'xquad.ru')

In [38]:
data = pd.DataFrame(xquad_dataset['validation'])

In [39]:
tqdm.pandas() # progress bar

In [41]:
# translation (takes about 10 minutes in total)

data_en.loc[:, 'context'] = data_en.context.progress_apply(en_ru_translator)
data_en.loc[:, 'question'] = data_en.question.progress_apply(en_ru_translator)

100%|██████████| 1190/1190 [04:50<00:00,  4.10it/s]
100%|██████████| 1190/1190 [03:38<00:00,  5.44it/s]


In [42]:
# same for the answers

answers_list = data_en.answers.tolist()
translator = GoogleTranslator(source='en', target='ru')

for answer in answers_list:
  txt = answer['text'][0]
  t_txt = translator.translate(txt)
  answer['text'] = [t_txt]

data_en['answers'] = answers_list

In [43]:
# joining two datasets

augmented_data = pd.concat([data, data_en],
                           ignore_index=True,
                           axis=0)

In [44]:
augmented_dataset = Dataset.from_pandas(augmented_data) # to Dataset

In [45]:
X = augmented_dataset.train_test_split(test_size=0.2)
train = X['train']
validation = X['test']

validation_split = validation.train_test_split(test_size=0.5)
val = validation_split['train']
test = validation_split['test']

In [46]:
tokenized_train = train.map(prepare_train_features,
                            batched=True,
                            remove_columns=train.column_names)
tokenized_val = val.map(prepare_train_features,
                        batched=True,
                        remove_columns=val.column_names)

Map:   0%|          | 0/1904 [00:00<?, ? examples/s]

Map:   0%|          | 0/238 [00:00<?, ? examples/s]

In [47]:
args = TrainingArguments(
    output_dir="finetune-xlm-roberta-base-xquad",
    evaluation_strategy = "epoch",
    num_train_epochs=3, # 3 is the default
    do_train=True, # default is False
    do_eval=True # same here
)

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_val,
    tokenizer=tokenizer
)

In [48]:
model.to(device)

XLMRobertaForQuestionAnswering(
  (roberta): XLMRobertaModel(
    (embeddings): XLMRobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): XLMRobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x XLMRobertaLayer(
          (attention): XLMRobertaAttention(
            (self): XLMRobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): XLMRobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=

In [49]:
trainer.train() # takes a long time (40 minutes approximately)
# does not require any API key

Epoch,Training Loss,Validation Loss
1,2.3675,2.106487
2,2.0019,2.090805
3,1.6701,2.150003


TrainOutput(global_step=2982, training_loss=1.9675954285282005, metrics={'train_runtime': 2601.9328, 'train_samples_per_second': 9.167, 'train_steps_per_second': 1.146, 'total_flos': 6232711538423808.0, 'train_loss': 1.9675954285282005, 'epoch': 3.0})

In [50]:
correct_count = 0
total_count = test.shape[0]

for i in range(total_count):
    correct_count += evaluate_instance(test[i], device)

In [51]:
accuracy = correct_count / total_count
print(f'Accuracy: {accuracy * 100:.2f}%') # a bit of improvement is here

Accuracy: 0.42%


# Search among files

In [52]:
model.eval() # to evaluation mode

XLMRobertaForQuestionAnswering(
  (roberta): XLMRobertaModel(
    (embeddings): XLMRobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): XLMRobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x XLMRobertaLayer(
          (attention): XLMRobertaAttention(
            (self): XLMRobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): XLMRobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=

In [53]:
# loading the samples

sample_doc_1_path = 'https://github.com/NataliyaPovarova/HSE_Thesis/blob/main/sample_doc_1.pdf?raw=true'
sample_doc_2_path = 'https://github.com/NataliyaPovarova/HSE_Thesis/blob/main/sample_doc_2.pdf?raw=true'
sample_doc_3_path = 'https://github.com/NataliyaPovarova/HSE_Thesis/blob/main/sample_doc_3.pdf?raw=true'

sample_doc_1 = wget.download(sample_doc_1_path)
sample_doc_2 = wget.download(sample_doc_2_path)
sample_doc_3 = wget.download(sample_doc_3_path)

In [54]:
# the function to embed the text

def get_embedding(text):
    inputs = tokenizer(text,
                       return_tensors="pt",
                       padding=True,
                       truncation=True,
                       max_length=512)
    outputs = model(**inputs.to(device), output_hidden_states=True)
    # use the mean of last layer hidden states as the embedding
    embeddings = outputs.hidden_states[-1].mean(1)
    return embeddings

In [55]:
# the function, which scanns all the files and looks for the most relevant one

def scanner(query, files_to_scan): # changed from scanner(query, path) -> for local / google drive
  cos = torch.nn.CosineSimilarity(dim=1).to(device) # using cosine similarity to sort the answers
  answers = [] # list to store the name of the document and relevant information in it

  # this section is to reproduce from local / google drive
  # files_in_folder = os.listdir(path)
  # files_to_scan = [file for file in files_in_folder if '.pdf' in file] # make sure we only work with pdfs now
  for f in files_to_scan:
    # text = extract_text(path + f, codec='utf-8') # read a file from drive
    text = extract_text(f, codec='utf-8') # read a file from list
    inputs = tokenizer(query,
                       text,
                       return_tensors='pt',
                       max_length=512,
                       truncation=True) # tokenize the query and the file text
    output = model(**inputs.to(device))
    start_idx = torch.argmax(output.start_logits)
    end_idx = torch.argmax(output.end_logits)
    # find the answer to the query in the document (if any)
    predicted_answer = tokenizer.convert_tokens_to_string(tokenizer.convert_ids_to_tokens(inputs['input_ids'][0][start_idx:end_idx + 1]))
    answers.append([f, predicted_answer]) # save
  # now calculating cosine similarity
  query_embedding = get_embedding(query) # now embedding to calculate similarity
  answers_embeddings = [] # same for the answers
  for answer in answers:
    answers_embeddings.append([answer[0], get_embedding(answer[1])])
  cosine_similarities = [] # calculating cosine similarities
  for ans in answers_embeddings:
    cosine_similarities.append([ans[0], cos(query_embedding, ans[1])])
  result = sorted(cosine_similarities, key=lambda x: x[1], reverse=True) # find the answer with the highest cosine similarity to the query
  return result[0][0] # return the document name

In [60]:
q = 'создание запросов с использованием контекста' # query
# if a function is used from a local drive (as it is supposed to), it gets path to files as an input
# the number of files may be larger, but for demonstration purposes everything is loaded from github
files_to_scan = [sample_doc_1, sample_doc_2, sample_doc_3]
scanner(q, files_to_scan) # returns sample_doc_2.pdf

'sample_doc_2.pdf'