This notebook contains the final experiment with the best result obtained for RuBERT after testing different parameters sets and data augmentation

In [1]:
# install necessary packages

!pip install datasets accelerate transformers tqdm simpletransformers deep_translator gradio pdfminer-six wget

Collecting datasets
  Downloading datasets-2.19.1-py3-none-any.whl (542 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m542.0/542.0 kB[0m [31m7.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting accelerate
  Downloading accelerate-0.30.1-py3-none-any.whl (302 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m302.6/302.6 kB[0m [31m10.2 MB/s[0m eta [36m0:00:00[0m
Collecting simpletransformers
  Downloading simpletransformers-0.70.0-py3-none-any.whl (315 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m315.5/315.5 kB[0m [31m9.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting deep_translator
  Downloading deep_translator-1.11.4-py3-none-any.whl (42 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.3/42.3 kB[0m [31m4.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting gradio
  Downloading gradio-4.31.0-py3-none-any.whl (12.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.3/12.3 MB[0m 

In [2]:
import pandas as pd
import numpy as np
import torch

import sklearn
import wget

from tqdm import tqdm
from torch.utils.data import TensorDataset

from transformers import AutoModelForQuestionAnswering, AutoTokenizer, TrainingArguments, Trainer
from datasets import load_dataset, Dataset

from deep_translator import GoogleTranslator

from pdfminer.high_level import extract_text

In [3]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device

device(type='cuda')

In [4]:
# russian subset of xquad

xquad_dataset = load_dataset('xquad', 'xquad.ru')

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Downloading readme:   0%|          | 0.00/15.6k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/322k [00:00<?, ?B/s]

Generating validation split:   0%|          | 0/1190 [00:00<?, ? examples/s]

In [5]:
xquad_dataset = xquad_dataset['validation'].train_test_split(test_size=0.2) # first split into train and test
train_set = xquad_dataset['train']
validation_set = xquad_dataset['test']
validation_split_set = validation_set.train_test_split(test_size=0.5) # now test to validation and test
val_set = validation_split_set['train']
test_set = validation_split_set['test']

In [6]:
batch_size = 16 # 32 or 64 is a recommended size, 16 provides better results, but is more expensive in computational terms
max_length = 512 # max length of the model input
stride = 128 # 1/4 of max model input length is recommended for starters; needed to keep the information that doesn't fit in the model

In [7]:
# preparing the data

def prepare_train_features(examples):
    tokenized_examples = tokenizer(
        examples["question"],
        examples["context"],
        truncation="only_second",  # truncate context, not the question, bc all questions fit into the max length
        max_length=max_length,
        stride=stride,
        return_overflowing_tokens=True, # make sure no data is lost
        return_offsets_mapping=True, # start and end positions
        padding="max_length",
    )
    sample_mapping = tokenized_examples.pop("overflow_to_sample_mapping")
    offset_mapping = tokenized_examples.pop("offset_mapping")

    tokenized_examples["start_positions"] = []
    tokenized_examples["end_positions"] = []

    for i, offsets in enumerate(offset_mapping):
        input_ids = tokenized_examples["input_ids"][i]
        cls_index = input_ids.index(tokenizer.cls_token_id)
        sequence_ids = tokenized_examples.sequence_ids(i)
        sample_index = sample_mapping[i]
        answers = examples["answers"][sample_index]
        if len(answers["answer_start"]) == 0:
            tokenized_examples["start_positions"].append(cls_index)
            tokenized_examples["end_positions"].append(cls_index)
        else:
            start_char = answers["answer_start"][0]
            end_char = start_char + len(answers["text"][0])
            token_start_index = 0
            while sequence_ids[token_start_index] != 1:
                token_start_index += 1
            token_end_index = len(input_ids) - 1
            while sequence_ids[token_end_index] != 1:
                token_end_index -= 1
            if not (offsets[token_start_index][0] <= start_char and offsets[token_end_index][1] >= end_char):
                tokenized_examples["start_positions"].append(cls_index)
                tokenized_examples["end_positions"].append(cls_index)
            else:
                while token_start_index < len(offsets) and offsets[token_start_index][0] <= start_char:
                    token_start_index += 1
                tokenized_examples["start_positions"].append(token_start_index - 1)
                while offsets[token_end_index][1] >= end_char:
                    token_end_index -= 1
                tokenized_examples["end_positions"].append(token_end_index + 1)
    return tokenized_examples

In [8]:
# evaluation

def evaluate_instance(instance, device):
    context = instance['context']
    question = instance['question']
    given_answer = instance['answers']['text'][0]
    inputs = tokenizer(question, context, return_tensors='pt', max_length=512, truncation=True)
    inputs = {k: v.to(device) for k, v in inputs.items()}
    with torch.no_grad():
        output = model(**inputs)
    start_idx = torch.argmax(output.start_logits)
    end_idx = torch.argmax(output.end_logits)
    predicted_answer = tokenizer.convert_tokens_to_string(tokenizer.convert_ids_to_tokens(inputs['input_ids'][0][start_idx:end_idx + 1]))
    return predicted_answer.lower() == given_answer.lower()

In [9]:
# translation

def en_ru_translator(text):
    translator = GoogleTranslator(source='en', target='ru')
    return translator.translate(text)

# 1. Training an LLM for a context-based question answering task

## 1.1. Initial fine-tuning on the dataset

For this part a similar example was used: https://amitnikhade.medium.com/question-answering-in-association-with-roberta-a11518e70507 <br>
Another useful tutorial was this one: https://medium.com/@anyuanay/fine-tuning-the-pre-trained-bert-model-in-hugging-face-for-question-answering-8edc76890ce0

In [10]:
model_name = 'DeepPavlov/rubert-base-cased-conversational'

In [11]:
tokenizer = AutoTokenizer.from_pretrained(model_name) # new tokenizer
model = AutoModelForQuestionAnswering.from_pretrained(model_name)



tokenizer_config.json:   0%|          | 0.00/24.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/642 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/1.40M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/714M [00:00<?, ?B/s]

Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at DeepPavlov/rubert-base-cased-conversational and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [12]:
tokenized_train = train_set.map(prepare_train_features,
                                batched=True,
                                remove_columns=train_set.column_names)
tokenized_val = val_set.map(prepare_train_features,
                                batched=True,
                                remove_columns=val_set.column_names)

Map:   0%|          | 0/952 [00:00<?, ? examples/s]

Map:   0%|          | 0/119 [00:00<?, ? examples/s]

In [13]:
training_args = TrainingArguments(
    output_dir="./rubert-results",
    evaluation_strategy="epoch",
    learning_rate=5e-06,
    per_device_train_batch_size=8, # smaller batch sizes improve the accuracy
    per_device_eval_batch_size=8, # same here
    num_train_epochs=6, # starts overfitting from here
    weight_decay=0.3, # learning rate scheduling; tried 0, 0.3, 0.5
    warmup_steps=0, # also scheduling; tried 0, 500 and 1000, did not change a lot
    logging_dir='./logs', # for easier debugging
    logging_steps=10,
    fp16=False, # default
    do_train=True, # default is False
    do_eval=True, # same here
    overwrite_output_dir=True # in case there is a saved copy of the model, it needs to be changed for a new one
)

In [14]:
# from here: https://huggingface.co/docs/transformers/main_classes/trainer
# does all the training work

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_val,
    tokenizer=tokenizer)

In [15]:
model.to(device) # training on GPU

BertForQuestionAnswering(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(119547, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, ele

In [16]:
trainer.train() # takes a few monutes, but not too long (faster with A100 GPU)

# requires an API key from wandb, it can be easily obtained in a couple of clicks
# not provided here, bc this notebook is publicly available

<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

 ··········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


Epoch,Training Loss,Validation Loss
1,4.355,4.052419
2,3.5811,3.296562
3,2.4155,2.47396
4,1.972,2.305115
5,1.9442,2.25796
6,1.9859,2.235161


TrainOutput(global_step=732, training_loss=2.9996456233530098, metrics={'train_runtime': 693.6345, 'train_samples_per_second': 8.399, 'train_steps_per_second': 1.055, 'total_flos': 1522314904743936.0, 'train_loss': 2.9996456233530098, 'epoch': 6.0})

In [18]:
correct_count = 0
total_count = test_set.shape[0]

for i in range(total_count):
    correct_count += evaluate_instance(test_set[i], device)

In [19]:
accuracy = correct_count / total_count
print(f'Accuracy: {accuracy * 100:.2f}%') # 26.05%

Accuracy: 20.17%


In [21]:
# saving the model

# torch.save(model.state_dict(), MODEL_PATH) # was saved on Google drive initially, MODEL_PATH was a path on Drive

## 1.2. Augmentation

In [22]:
# augmentation with chinese dataset was extremely poor, so here the english dataset is used

en_xquad_dataset = load_dataset('xquad', 'xquad.en')

Downloading data:   0%|          | 0.00/212k [00:00<?, ?B/s]

Generating validation split:   0%|          | 0/1190 [00:00<?, ? examples/s]

In [23]:
# reinitializing the russian dataset, because it was changed by splitting
# and here we need it unchanged

xquad_dataset = load_dataset('xquad', 'xquad.ru')

In [24]:
data_en = pd.DataFrame(en_xquad_dataset['validation'])
data = pd.DataFrame(xquad_dataset['validation'])

data_en.head(3) # text is the same, translated professionally

Unnamed: 0,id,context,question,answers
0,56beb4343aeaaa14008c925b,"The Panthers defense gave up just 308 points, ...",How many points did the Panthers defense surre...,"{'text': ['308'], 'answer_start': [34]}"
1,56beb4343aeaaa14008c925c,"The Panthers defense gave up just 308 points, ...",How many career sacks did Jared Allen have?,"{'text': ['136'], 'answer_start': [470]}"
2,56beb4343aeaaa14008c925d,"The Panthers defense gave up just 308 points, ...",How many tackles did Luke Kuechly register?,"{'text': ['118'], 'answer_start': [789]}"


In [25]:
tqdm.pandas() # progress bar

In [26]:
# takes a while

data_en.loc[:, 'context'] = data_en.context.progress_apply(en_ru_translator)
data_en.loc[:, 'question'] = data_en.question.progress_apply(en_ru_translator)

100%|██████████| 1190/1190 [07:25<00:00,  2.67it/s]
100%|██████████| 1190/1190 [17:41<00:00,  1.12it/s]


In [27]:
data_en.context[0] # translation (quite similar, but there are some changes of the form)

'Защита «Пантерс» набрала всего 308 очков, заняв шестое место в лиге, а также лидируя в НФЛ по перехватам с 24 очками и имея четыре выхода в Пробоул. Защитник Pro Bowl Каванн Шорт возглавил команду по количеству мешков с 11, а также совершил три фамбла и отыграл два. Его коллега Марио Аддисон добавил 6½ мешков. В составе «Пантерс» также присутствовал ветеран защиты Джаред Аллен, пятикратный профессиональный боулер, который был активным лидером НФЛ по мешкам в карьере со 136 мешками, а также защитник Кони Или, у которого было 5 мешков всего за 9 стартов. После них для участия в Пробоуле также были выбраны двое из трех стартовых полузащитников «Пантеры»: Томас Дэвис и Люк Кючли. Дэвис собрал 5½ мешков, четыре форсированных фамбла и четыре перехвата, в то время как Кючли возглавил команду по отборам мяча (118), совершил два фамбла и перехватил четыре собственных передачи. Во второстепенном матче Каролины участвовал защитник Про Боула Курт Коулман, который возглавил команду с рекордом в ка

In [28]:
data.context[0] # russian text

'\ufeffЗащита Пэнтерс уступила всего 308 очков, заняв шестое место в лиге, а также лидировала в НФЛ по перехватам с 24 и похвасталась четырьмя попаданиями в Пробоул. Дифенсив тэкл Пробоула Кейван Шорт лидирует в команде с 11 мешками, а также обеспечил три потери мяча и получил два. Нападающий Марио Эдисон добавил 6½ мешков. Линия Пэнтерс также представила ди-энда-ветерана Джареда Аллена, пятикратного участника Пробоула, который был активным лидером по количеству мешков в карьере НФЛ в количестве 136, вместе с ди-эндом Кони Или, у которого было 5 мешков всего за 9 стартов. Позади них для участия в Пробоуле также были выбраны два из трех стартовых лайнбекеров Пэнтерс: Томас Дэвис и Люк Кикли. Дэвис собрал 5½ мешков, четыре вынужденных потери мяча и четыре перехвата, в то время как Кикли лидировал в команде по блокировкам (118), форсировал две потери мяча и перехватил четыре своих передачи. Второй по популярности в Каролине сэйфти Пробоула Курт Колеман, который был лидером команды с макси

In [29]:
# answers have a different structure, so they will be translated separately to keep it

answers_list = data_en.answers.tolist()
translator = GoogleTranslator(source='en', target='ru')

for answer in answers_list:
  txt = answer['text'][0]
  t_txt = translator.translate(txt)
  answer['text'] = [t_txt]

data_en['answers'] = answers_list

In [30]:
# joining two datasets

augmented_data = pd.concat([data, data_en],
                           ignore_index=True,
                           axis=0)

In [31]:
augmented_dataset = Dataset.from_pandas(augmented_data) # to Dataset

In [32]:
# splitting in the same way as before

X = augmented_dataset.train_test_split(test_size=0.2)
train = X['train']
validation = X['test']

validation_split = validation.train_test_split(test_size=0.5)
val = validation_split['train']
test = validation_split['test']

In [33]:
tokenized_train = train.map(prepare_train_features,
                            batched=True,
                            remove_columns=train.column_names)
tokenized_val = val.map(prepare_train_features,
                        batched=True,
                        remove_columns=val.column_names)

Map:   0%|          | 0/1904 [00:00<?, ? examples/s]

Map:   0%|          | 0/238 [00:00<?, ? examples/s]

In [34]:
# same trainer and training arguments

training_args = TrainingArguments(
    output_dir="./rubert-aug-results", # writitng to another folder
    evaluation_strategy="epoch",
    learning_rate=5e-06,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=6,
    weight_decay=0.3,
    warmup_steps=0,
    logging_dir='./logs',
    logging_steps=10,
    fp16=False,
    do_train=True,
    do_eval=True,
    overwrite_output_dir=True
)

In [35]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_val,
    tokenizer=tokenizer
)

In [36]:
model.to(device)

BertForQuestionAnswering(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(119547, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, ele

In [37]:
trainer.train()

Epoch,Training Loss,Validation Loss
1,3.3096,3.263378
2,2.8699,3.219459
3,2.6695,3.205886
4,2.3005,3.2133
5,2.4279,3.252344
6,2.3556,3.265086


TrainOutput(global_step=1452, training_loss=2.7533577092751327, metrics={'train_runtime': 1246.1093, 'train_samples_per_second': 9.303, 'train_steps_per_second': 1.165, 'total_flos': 3028952004083712.0, 'train_loss': 2.7533577092751327, 'epoch': 6.0})

In [38]:
correct_count = 0
total_count = test.shape[0]

for i in range(total_count):
    correct_count += evaluate_instance(test[i], device)

In [39]:
accuracy = correct_count / total_count
print(f'Accuracy: {accuracy * 100:.2f}%') # 28.57%, improved a bit

Accuracy: 30.25%


# Search among files

In [40]:
# first running a small and obvious example to see how the model works

model.eval()

BertForQuestionAnswering(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(119547, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, ele

In [41]:
question = 'Куда мне нужно попасть завтра?'
context = 'Завтра я еду в офис, чтобы отнести документы в отдел кадров.'

In [42]:
inputs = tokenizer(question, context, return_tensors='pt', max_length=512, truncation=True)

In [45]:
output = model(**inputs.to(device))
start_idx = torch.argmax(output.start_logits)
end_idx = torch.argmax(output.end_logits)
predicted_answer = tokenizer.convert_tokens_to_string(tokenizer.convert_ids_to_tokens(inputs['input_ids'][0][start_idx:end_idx + 1]))
predicted_answer # not perfect, but technically correct

'офис, чтобы отнести документы в отдел кадров'

Now working with pdf files. The sample files are located on github. <br>
Working with pdf is tricky, because there are multiple nuances in encoding and reading such files (especially in russian). But it is an interesting challenge

In [46]:
sample_doc_1_path = 'https://github.com/NataliyaPovarova/HSE_Thesis/blob/main/sample_doc_1.pdf?raw=true'
sample_doc_2_path = 'https://github.com/NataliyaPovarova/HSE_Thesis/blob/main/sample_doc_2.pdf?raw=true'
sample_doc_3_path = 'https://github.com/NataliyaPovarova/HSE_Thesis/blob/main/sample_doc_3.pdf?raw=true'

In [47]:
# download from github

sample_doc_1 = wget.download(sample_doc_1_path)
sample_doc_2 = wget.download(sample_doc_2_path)
sample_doc_3 = wget.download(sample_doc_3_path)

In [48]:
# an example

extracted_text = extract_text(sample_doc_2, codec='utf-8')
print(extracted_text)

Создание хорошего запроса к моделям обработки естественного языка, таким как 
большие языковые модели (LLM), может значительно повысить качество и точность 
получаемых ответов. Вот несколько советов, которые помогут вам сформулировать 
эффективный запрос на русском языке: 

1. **Четкость и конкретика**: Чем более точно и конкретно сформулирован запрос, тем 
легче модели понять, что от нее требуется. Ясно определите тему или задачу, которую вы 
хотите обсудить или решить. 

2. **Полные вопросы**: Формулируйте полноценные вопросы, а не краткие или 
неполные фразы. Это помогает модели лучше понять контекст вашего запроса. 

3. **Использование контекста**: Если ваш запрос связан с предыдущим обсуждением или 
конкретной областью знаний, укажите этот контекст в своем запросе. Это поможет модели 
адаптировать свои ответы под нужную область. 

4. **Языковые особенности**: Учитывайте языковые и культурные особенности. При 
обращении к модели на русском языке убедитесь, что ваш запрос составлен 

In [57]:
# the function to embed the text

def get_embedding(text):
    inputs = tokenizer(text,
                       return_tensors="pt",
                       padding=True,
                       truncation=True,
                       max_length=512)
    outputs = model(**inputs.to(device), output_hidden_states=True)
    # use the mean of last layer hidden states as the embedding
    embeddings = outputs.hidden_states[-1].mean(1)
    return embeddings

In [54]:
# the function, which scanns all the files and looks for the most relevant one

def scanner(query, files_to_scan): # changed from scanner(query, path) -> for local / google drive
  cos = torch.nn.CosineSimilarity(dim=1).to(device) # using cosine similarity to sort the answers
  answers = [] # list to store the name of the document and relevant information in it

  # this section is to reproduce from local / google drive
  # files_in_folder = os.listdir(path)
  # files_to_scan = [file for file in files_in_folder if '.pdf' in file] # make sure we only work with pdfs now
  for f in files_to_scan:
    # text = extract_text(path + f, codec='utf-8') # read a file from drive
    text = extract_text(f, codec='utf-8') # read a file from list
    inputs = tokenizer(query,
                       text,
                       return_tensors='pt',
                       max_length=512,
                       truncation=True) # tokenize the query and the file text
    output = model(**inputs.to(device))
    start_idx = torch.argmax(output.start_logits)
    end_idx = torch.argmax(output.end_logits)
    # find the answer to the query in the document (if any)
    predicted_answer = tokenizer.convert_tokens_to_string(tokenizer.convert_ids_to_tokens(inputs['input_ids'][0][start_idx:end_idx + 1]))
    answers.append([f, predicted_answer]) # save
  # now calculating cosine similarity
  query_embedding = get_embedding(query) # now embedding to calculate similarity
  answers_embeddings = [] # same for the answers
  for answer in answers:
    answers_embeddings.append([answer[0], get_embedding(answer[1])])
  cosine_similarities = [] # calculating cosine similarities
  for ans in answers_embeddings:
    cosine_similarities.append([ans[0], cos(query_embedding, ans[1])])
  result = sorted(cosine_similarities, key=lambda x: x[1], reverse=True) # find the answer with the highest cosine similarity to the query
  return result[0][0] # return the document name

In [58]:
# tryout

q = 'создание запросов с использованием контекста' # query
# if a function is used from a local drive (as it is supposed to), it gets path to files as an input
# the number of files may be larger, but for demonstration purposes everything is loaded from github
files_to_scan = [sample_doc_1, sample_doc_2, sample_doc_3]
scanner(q, files_to_scan) # returns sample_doc_2.pdf, which is the expected behaviour

'sample_doc_2.pdf'