In [1]:
import fitz
from tqdm.auto import tqdm
import re
import pandas as pd
import gc
from llama_cpp import Llama
import torch
import numpy as np
from spacy.lang.ru import Russian
from sentence_transformers import SentenceTransformer
from sentence_transformers import util
from transformers import AutoTokenizer
from datasets import Dataset
import requests
import json
import os
import uuid
import time

  from .autonotebook import tqdm as notebook_tqdm


# Text preprocessing

In [2]:
# Define helper function to print wrapped text
import textwrap

def print_wrapped(text, wrap_length=80):
    wrapped_text = textwrap.fill(text, wrap_length)
    print(wrapped_text)

In [3]:
def read_jsonl(file_name):
    with open(file_name, encoding="utf-8") as r:
        return [json.loads(line) for line in r]


def write_jsonl(records, path):
    with open(path, "w", encoding="utf-8") as w:
        for r in records:
            w.write(json.dumps(r, ensure_ascii=False) + "\n")

In [4]:
def text_formatter(text: str) -> str:
    """
    Opens a PDF file, reads its text content page by page, and collects statistics.

    Parameters:
        pdf_path (str): The file path to the PDF document to be opened and read.

    Returns:
        list[dict]: A list of dictionaries, each containing the page number, character count, word count, and the extracted text for each page.
    """    
    # Define the regular expression pattern to match a hyphen at the end of a line
    pattern = r'-\n|\xad\s'
    # Use re.sub to replace the hyphen with an empty string and concatenate the next line
    processed_text = re.sub(pattern, '', text)
    processed_text = processed_text.replace("\n", " ").strip()

    return processed_text


def longest_common_substring(str1: str, 
                             str2: str) -> str:
    """
    Finds the longest common substring between two input strings.

    Parameters:
        str1 (str): The first input string.
        str2 (str): The second input string.

    Returns:
        The longest common substring found in both input strings.
    """
    # Create a 2D array to store lengths of longest common suffixes of substrings
    m, n = len(str1), len(str2)
    lcsuff = [[0] * (n + 1) for i in range(m + 1)]
    length = 0  # To store length of the longest common substring
    end_pos = 0  # To store end position of the longest common substring in str1

    # Building the lcsuff table in bottom-up fashion
    for i in range(m + 1):
        for j in range(n + 1):
            if i == 0 or j == 0:
                lcsuff[i][j] = 0
            elif str1[i - 1] == str2[j - 1]:
                lcsuff[i][j] = lcsuff[i - 1][j - 1] + 1
                if lcsuff[i][j] > length:
                    length = lcsuff[i][j]
                    end_pos = i
            else:
                lcsuff[i][j] = 0

    # The longest common substring is from end_pos - length to end_pos in str1
    return str1[end_pos - length: end_pos]


def longest_substring_dict(doc:object,
                           introduction: int=0,
                           number_pages: int=15) -> dict:
    """
    Extracts the longest common substrings from a document and counts their occurrences.

    This function processes a given document by extracting text from a specified
    range of pages and finds the longest common substrings between consecutive pages.

    Parameters:

    doc (object): The document object from which text is extracted.

    introduction (int, optional): The starting page number from which text extraction
                                  begins. Default is 0.
    number_pages (int, optional): The number of pages to process from the starting
                                  page. Default is 15.        

    Returns:
        dict: A dictionary where keys are the longest common substrings found between
              consecutive pages, and values are their counts of occurrences.
    """
    examples = []
    for page_number, page in tqdm(enumerate(doc[introduction : number_pages])): 
        text = page.get_text()  # get plain text encoded as UTF-8
        text = text_formatter(text)
        examples.append(text[:100])

    longest_common = examples[0]
    longest_common_dict = {}
    for i in range(1, len(examples)):
        longest_common = longest_common_substring(examples[i-1], examples[i])
        if longest_common_dict.get(longest_common):
            longest_common_dict[longest_common] += 1
        else:
            longest_common_dict[longest_common] = 1

    return longest_common_dict


def intro_and_conlusion_page(doc: object,
                             search_first: str,
                             search_second: str):
    """
    Finds the page numbers for the "Introduction" and "Conclusion" sections in a document.

    This function iterates through the pages of a document

    Parameters:
        doc (object): The document object from which text is extracted.

        search_first (str): The term to search for in the document to identify the "Introduction"
                            section.

        search_second (str): The term to search for in the document to identify the "Conclusion"
                            section.
    Returns:
        tuple: A tuple containing two elements:
           - The page number where the "Introduction" section starts.
           - The page number where the "Conclusion" section starts.     
    """    
    # Iterate through the pages to find the "Introduction"
    for page_num in range(len(doc)):
        page = doc[page_num]
        text = page.get_text().lower()  
        if search_first in text:
            introduction = page_num 
            break

    # Iterate through the pages to find the "Conclusion"
    for page_num in range(0, -len(doc), -1):
        page = doc[page_num]
        text = page.get_text().lower()  
        if search_second in text:
            conlusion = page_num 
            break

    return introduction, conlusion



def open_and_read_pdf(pdf_path: str) -> list[dict]:
    """
    Opens a PDF document, identifies and extracts text between the "Introduction" and "Bibliography" sections,
    removes specific recurring text patterns, and returns a list of dictionaries containing page details.

    This function performs the following steps:
    1. Opens the specified PDF document.
    2. Identifies the pages where the "Introduction" and "Bibliography" sections are located.
    3. Finds the most common recurring text pattern at the beginning of each page (assumed to be a standard header or footer).
    4. Removes this recurring pattern from the text of each page.
    5. Extracts and formats the text from each page between the "Introduction" and "Bibliography" sections.
    6. Returns a list of dictionaries, each containing the page number, character count, word count, and the cleaned text for each page.

    Parameters:
    pdf_path (str): The file path to the PDF document to be processed.

    Returns:
    list[dict]: A list of dictionaries, each containing the following keys:
                - "page_number" (int): The page number in the extracted range.
                - "page_char_count" (int): The number of characters on the page after cleaning.
                - "page_word_count" (int): The number of words on the page after cleaning.
                - "text" (str): The cleaned text content of the page.
    """
    
    # open a document
    search_first = "введение".lower()
    search_second = "библиография".lower()
    introduction = 0
    conlusion = -1
    doc = fitz.open(pdf_path)
    
    # добавить try except на наличие введения и библиографии, иначе ничего не обрезать
    introduction, conlusion = intro_and_conlusion_page(doc, search_first, search_second)
    longest_common_dict = longest_substring_dict(doc, introduction)
    first_name_gost_on_every_page = max(longest_common_dict, key=longest_common_dict.get)

    pages_and_texts = []
    for page_number, page in tqdm(enumerate(doc[introduction : conlusion])):  # iterate the document pages
        text = page.get_text()  
        text = text_formatter(text)
        text = re.sub(first_name_gost_on_every_page, '', text, count=1) # remove first GOST on every page
        pages_and_texts.append({"page_number": page_number,  
                                "page_char_count": len(text),
                                "page_word_count": len(text.split(" ")),
                                "text": text})
    return pages_and_texts

In [5]:
def split_list(input_list: list,
               slice_size: int) -> list[list[str]]:
    """
    Splits the input_list into sublists of size slice_size (or as close as possible).

    For example, a list of 17 sentences would be split into two lists of [[10], [7]]
    """
    return [input_list[i:i + slice_size] for i in range(0, len(input_list), slice_size)]


def spacy_preprocessing_chunk(pages_and_texts: list[dict]) -> list[dict]:
    """
    Processes and chunks the text from pages using spaCy for sentence segmentation, and organizes the text into manageable chunks.

    This function performs the following steps:
    1. Uses spaCy to segment the text of each page into sentences.
    2. Counts the number of sentences on each page.
    3. Computes an average sentence chunk size based on the mean number of sentences per page.
    4. Splits the sentences into chunks of the computed size.
    5. Creates a list of dictionaries, each containing details of these chunks, while filtering out chunks with fewer than a specified number of words.

    Parameters:
    pages_and_texts (list[dict]): A list of dictionaries, where each dictionary contains:
                                  - "page_number" (int): The page number in the extracted range.
                                  - "page_char_count" (int): The number of characters on the page after cleaning.
                                  - "page_word_count" (int): The number of words on the page after cleaning.
                                  - "text" (str): The cleaned text content of the page.

    Returns:
    list[dict]: A list of dictionaries, each containing:
                - "page_number" (int): The page number from which the chunk was extracted.
                - "sentence_chunk" (str): The text content of the chunk.
                - "chunk_char_count" (int): The number of characters in the chunk.
                - "chunk_word_count" (int): The number of words in the chunk.
                
    The function also adds the following keys to each input dictionary in `pages_and_texts`:
                - "sentences" (list[str]): The sentences in the page's text.
                - "page_sentence_count_spacy" (int): The number of sentences in the page's text.
                - "sentence_chunks" (list[list[str]]): The text split into chunks of sentences.
    """

    nlp_simple = Russian()
    nlp_simple.add_pipe('sentencizer')

    # how many sentence in page
    for item in tqdm(pages_and_texts):
        item["sentences"] = list(nlp_simple(item["text"]).sents)
        # Make sure all sentences are strings
        item["sentences"] = [str(sentence) for sentence in item["sentences"]]
        # Count the sentences
        item["page_sentence_count_spacy"] = len(item["sentences"])

    num_sentence_chunk_size = np.round(sum(d['page_sentence_count_spacy'] for d in pages_and_texts) 
                                       / len(pages_and_texts)).astype(int)# size chunk based on mean sentences on the page

    # Chunking our sentences together
    for item in tqdm(pages_and_texts):
        item["sentence_chunks"] = split_list(input_list=item["sentences"],
                                            slice_size=num_sentence_chunk_size)

    #Split each chunk into its own item
    pages_and_chunks = []
    min_word_count = 20 # min word should be in chunk

    for item in tqdm(pages_and_texts):
        for sentence_chunk in item["sentence_chunks"]:
            chunk_dict = {}
            chunk_dict["page_number"] = item["page_number"]

            # Join the sentences together into a paragraph-like structure, aka a chunk (so they are a single string)
            joined_sentence_chunk = "".join(sentence_chunk).replace("  ", " ").strip()
            joined_sentence_chunk = re.sub(r'\.([А-Я])', r'. \1', joined_sentence_chunk) # ".A" -> ". A" for any full-stop/capital letter combo
            chunk_dict["sentence_chunk"] = joined_sentence_chunk.lower()

            # Get stats about the chunk
            chunk_dict["chunk_char_count"] = len(joined_sentence_chunk)
            chunk_dict["chunk_word_count"] = len([word for word in joined_sentence_chunk.split(" ")])
            
            # remove chunk, which lesser min_word_count
            if chunk_dict["chunk_word_count"] < min_word_count:
                continue
            else:
                pages_and_chunks.append(chunk_dict)

    return pages_and_chunks

# Creation dataset for finetune

In [46]:
def prompt_yagpt(context: str) -> dict:
    """
    Creates a prompt for YandexGPT to generate a dictionary of questions and answers based on provided context.

    Parameters:
            context (str): The context text based on which the questions and answers should be generated.

    Returns:
             dict: A dictionary formatted to be used as a prompt for YandexGPT, containing the model URI, completion options, and messages.
    """
    promtpt_ya = {
                "modelUri": "gpt://b1gl4dfg2eqii24mp1fp/yandexgpt-lite",
                "completionOptions": {
                    "stream": False,
                    "temperature": 0.05,
                    "maxTokens": "2000"
                },
                "messages": [
                    {
                    "role": "system",
                    "text": """
                    Задание: На основе предоставленного контекста, создай словарь  вопросов и ответов, которые содержат основную суть контекста.
                      Каждый элемент словаря должен представлять собой пару: вопрос (ключ) и соответствующий ответ (значение).
                    Формат ответа: { "Вопрос": "string", "Ответ": "string" }     
                    """
                    },
                    {
                    "role": "user",
                    "text": f"Контекст: {context}"
                    }
                ]
                }
    
    return promtpt_ya

In [80]:
def create_ya_dataset(text_chunks: list[str],
                      url: str, 
                      headers: dict) -> list[dict]:
    """
    Generates a dataset of questions and answers from text chunks using YandexGPT.

    This function takes a list of text chunks, sends each chunk to YandexGPT to generate
    questions and answers, and compiles these into a list of dictionaries. Each dictionary
    contains a question and its corresponding answer, along with a unique identifier.

    Parameters:
                text_chunks (list[str]): A list of text chunks to process.
                url (str): The URL for the API endpoint.
                headers (dict): A dictionary of headers to include in the API request.

    Returns:
    list[dict]: A list of dictionaries, each containing:
                - "Вопрос" (str): The generated question.
                - "Ответ" (str): The corresponding answer.
                - "id" (str): A unique identifier for the question-answer pair.
    
    The function filters out any question-answer pairs where the question is identical to the answer.
    """

    # Regular expression to extract all words
    pattern = r'\{.*?\}'
    list_of_dict = []

    for context in tqdm(text_chunks):
        prompt = prompt_yagpt(context)
        
        json_prompt = json.dumps(prompt, indent = 4) # Convert Python to JSON  
        result = requests.post(url, headers=headers, data=json_prompt)

        if result.status_code == 429:
            # Error Too Many Requests
            time.sleep(5)
            result = requests.post(url, headers=headers, data=json_prompt)
        
        question_answer = result.json()
        json_question_answer = question_answer['result']['alternatives'][0]['message']['text']
        dict_from_json = re.findall(pattern, json_question_answer, re.DOTALL)

        for match in dict_from_json:
            try:
                json_dict = json.loads(match)
            except json.JSONDecodeError as e:
                print(f"JSONDecodeError: {e}")
                continue

            json_dict['id'] = str(hash(json_dict['Вопрос']))
            list_of_dict.append(json_dict)

    list_of_dict_no_dupl = [qa_dict  for qa_dict in list_of_dict if qa_dict['Вопрос'] != qa_dict['Ответ']] # clear llm bugs: answer == question

    return list_of_dict_no_dupl

In [134]:
def remove_duplicates_ids(ya_dataset):
    unique_data = {}
    for item in ya_dataset:
        unique_data[item['id']] = item
        
    unique_data_list = list(unique_data.values())

    return unique_data_list

In [81]:
pattern = r'\{.*?\}'
list_of_dict = []

url = 'https://llm.api.cloud.yandex.net/foundationModels/v1/completion'

headers = {
    "Content-Type": "application/json",
    'Authorization': f'Api-Key AQVN1WIE8wCWyEUClNkkAvo5djWbHmxuD6h27kxu'}

PATH_DIR = "D:\\my_project"
data_documents = os.listdir(PATH_DIR)
filtered_data_documents = list(filter(lambda path: path.endswith('.pdf'), data_documents))
path_to_filtered_data_documents  = list(map(lambda x: os.path.join(PATH_DIR, x), filtered_data_documents))

ya_dataset = []
for pdf_path in path_to_filtered_data_documents:
    pages_and_texts = open_and_read_pdf(pdf_path=pdf_path)
    pages_and_chunks_over_min_word_len = spacy_preprocessing_chunk(pages_and_texts)
    text_chunks = [item["sentence_chunk"] for item in pages_and_chunks_over_min_word_len]
    list_of_dict_no_dupl = create_ya_dataset(text_chunks, url, headers)
    ya_dataset.extend(list_of_dict_no_dupl)


result_ya_dataset = remove_duplicates_ids(ya_dataset)

11it [00:00, 285.85it/s]
38it [00:00, 363.08it/s]
100%|██████████| 38/38 [00:00<00:00, 250.46it/s]
100%|██████████| 38/38 [00:00<?, ?it/s]
100%|██████████| 38/38 [00:00<00:00, 15137.58it/s]
 55%|█████▌    | 31/56 [02:00<01:43,  4.15s/it]

JSONDecodeError: Expecting ',' delimiter: line 4 column 3 (char 149)


100%|██████████| 56/56 [03:53<00:00,  4.17s/it]
12it [00:00, 283.72it/s]
51it [00:00, 349.92it/s]
100%|██████████| 51/51 [00:00<00:00, 247.52it/s]
100%|██████████| 51/51 [00:00<?, ?it/s]
100%|██████████| 51/51 [00:00<00:00, 25510.97it/s]
100%|██████████| 76/76 [05:00<00:00,  3.95s/it]


JSONDecodeError: Expecting ',' delimiter: line 2 column 28 (char 29)


11it [00:00, 422.05it/s]
26it [00:00, 472.69it/s]
100%|██████████| 26/26 [00:00<00:00, 318.54it/s]
100%|██████████| 26/26 [00:00<?, ?it/s]
100%|██████████| 26/26 [00:00<00:00, 25983.30it/s]
 62%|██████▏   | 21/34 [01:18<00:59,  4.54s/it]

JSONDecodeError: Expecting property name enclosed in double quotes: line 1 column 2 (char 1)
JSONDecodeError: Expecting property name enclosed in double quotes: line 1 column 2 (char 1)
JSONDecodeError: Expecting property name enclosed in double quotes: line 1 column 2 (char 1)


100%|██████████| 34/34 [02:33<00:00,  4.51s/it]
10it [00:00, 322.12it/s]
30it [00:00, 467.20it/s]
100%|██████████| 30/30 [00:00<00:00, 217.22it/s]
100%|██████████| 30/30 [00:00<?, ?it/s]
100%|██████████| 30/30 [00:00<00:00, 14992.15it/s]
  5%|▍         | 2/44 [00:07<02:34,  3.67s/it]

JSONDecodeError: Invalid control character at: line 3 column 42 (char 117)


100%|██████████| 44/44 [02:41<00:00,  3.68s/it]


In [95]:
def convert_to_gpt_style_dataset(list_of_dict_no_dupl: list[dict]) -> Dataset:
    """
    Converts a list of question-answer dictionaries into a GPT-style dataset format.

    Parameters:
    list_of_dict_no_dupl (list[dict]): A list of dictionaries, each containing:
                                       - "Вопрос" (str): The question.
                                       - "Ответ" (str): The answer.

    Returns:
        Dataset: A Hugging Face Dataset object containing the formatted question-answer pairs.
                Each entry in the dataset is a conversation with the following structure:
                - "messages" (list[dict]): A list of messages in the conversation, each represented as a dictionary with:
                                            - "role" (str): The role of the speaker ("user" or "assistant").
                                            - "content" (str): The content of the message (the question or the answer).
    """
    q_a_res = []
    for q_a_pair in list_of_dict_no_dupl:
        question = q_a_pair['Вопрос']
        answer = q_a_pair['Ответ']
        chat1 = [
        {"role": "user", "content": question},
        {"role": "assistant", "content": answer}]
        q_a_res.append(chat1)

    dataset_raw = Dataset.from_dict({"messages": q_a_res})
    
    return dataset_raw

In [138]:
dataset_raw = convert_to_gpt_style_dataset(result_ya_dataset)

In [139]:
write_jsonl(dataset_raw, "ya_dataset.json")

In [140]:
dataset_raw = read_jsonl("ya_dataset.json")
dataset_raw = Dataset.from_list(dataset_raw)

# qdrant

In [6]:
from qdrant_client import QdrantClient
from qdrant_client.models import Distance, VectorParams
from qdrant_client.http.models import PointStruct

QDRANT_URL = "http://localhost:6333"
client = QdrantClient(url=QDRANT_URL)

In [7]:
PATH_DIR = "D:\\my_project"
data_documents = os.listdir(PATH_DIR)
filtered_data_documents = list(filter(lambda path: path.endswith('.pdf'), data_documents))
path_to_filtered_data_documents  = list(map(lambda x: os.path.join(PATH_DIR, x), filtered_data_documents))
path_to_filtered_data_documents


['D:\\my_project\\4293773880.pdf',
 'D:\\my_project\\4293774215.pdf',
 'D:\\my_project\\51213.pdf',
 'D:\\my_project\\56236.pdf']

In [8]:
# tokenizer = AutoTokenizer.from_pretrained("IlyaGusev/saiga_llama3_8b")
# model_name = "model-q2_K"

tokenizer = AutoTokenizer.from_pretrained("D:\\my_project\\lora_adapter")
model_name = "llama_8b_gost.Q4_K_M"
model_path=f"D:\\my_project\\{model_name}.gguf"

model_llm = Llama(
    model_path=model_path,
    verbose=True,
    n_ctx=4048,
    n_gpu_layers=-1
)
gc.collect()

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
llama_model_loader: loaded meta data with 23 key-value pairs and 291 tensors from D:\my_project\llama_8b_gost.Q4_K_M.gguf (version GGUF V3 (latest))
llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
llama_model_loader: - kv   0:                       general.architecture str              = llama
llama_model_loader: - kv   1:                               general.name str              = model
llama_model_loader: - kv   2:                          llama.block_count u32              = 32
llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
llama_model_loader: - kv   6:                 llama.

925

In [9]:
# embedding_model for the sentence
device = "cuda:0" if torch.cuda.is_available() else "cpu"
embedding_model = SentenceTransformer('sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2')
embedding_model = embedding_model.to(device)




In [44]:
def create_embedding_collection(path_to_filtered_data_documents: list[str],
                                size_embedding: int,
                                collection_name: str = "gost_document_embeddings",
                                 ):
    """
    Creates and uploads embeddings for PDF documents to a specified collection.

    This function processes a list of PDF documents, extracts text, preprocesses the text using spaCy,
    generates embeddings for text chunks, and uploads these embeddings to a collection.

    Parameters:
            path_to_filtered_data_documents (list[str]): A list of file paths to the PDF documents to be processed.

    Returns:
            None: The function uploads embeddings to the collection and does not return any value.
    """

    if client.collection_exists(collection_name=collection_name):
        print("collection already exist")
        
    else:
        client.create_collection(
            collection_name=collection_name,
            vectors_config=VectorParams(size=size_embedding, distance=Distance.DOT),
        )


    for pdf_path in (path_to_filtered_data_documents):
        
        pages_and_texts = open_and_read_pdf(pdf_path = pdf_path)
        pages_and_chunks_over_min_word_len = spacy_preprocessing_chunk(pages_and_texts)

        text_chunks = [item["sentence_chunk"] for item in pages_and_chunks_over_min_word_len]

        text_chunk_embeddings = embedding_model.encode(text_chunks,
                                        batch_size=32, # you can use different batch sizes here for speed/performance
                                        convert_to_tensor=True) # optional to return embeddings as tensor instead of array
        
        # Create multiple point structures
        points = [
            PointStruct(
                    id = uuid.uuid4().hex,
                    vector = embedding_vector.tolist(),
                    payload = {
                        'document_name': os.path.basename(pdf_path),
                        'page_number': pages_and_chunks_over_min_word_len[idx]['page_number'],
                        'sentence_chunk': pages_and_chunks_over_min_word_len[idx]["sentence_chunk"]
                        }
                )
                for idx, embedding_vector in enumerate(text_chunk_embeddings)]
            
        client.upload_points(
                            collection_name,
                            points=points,
                            batch_size=64,
                            )
        


In [45]:
size_embedding = embedding_model._modules['1'].word_embedding_dimension

create_embedding_collection(path_to_filtered_data_documents, size_embedding)

collection already exist


11it [00:00, 289.96it/s]
38it [00:00, 376.78it/s]
100%|██████████| 38/38 [00:00<00:00, 247.26it/s]
100%|██████████| 38/38 [00:00<?, ?it/s]
100%|██████████| 38/38 [00:00<00:00, 22464.21it/s]
12it [00:00, 266.72it/s]
51it [00:00, 353.63it/s]
100%|██████████| 51/51 [00:00<00:00, 254.89it/s]
100%|██████████| 51/51 [00:00<?, ?it/s]
100%|██████████| 51/51 [00:00<00:00, 25501.85it/s]
11it [00:00, 464.39it/s]
26it [00:00, 485.59it/s]
100%|██████████| 26/26 [00:00<00:00, 315.11it/s]
100%|██████████| 26/26 [00:00<?, ?it/s]
100%|██████████| 26/26 [00:00<?, ?it/s]
10it [00:00, 391.83it/s]
30it [00:00, 490.84it/s]
100%|██████████| 30/30 [00:00<00:00, 226.03it/s]
100%|██████████| 30/30 [00:00<?, ?it/s]
100%|██████████| 30/30 [00:00<00:00, 11976.88it/s]


In [27]:
def prompt_formatter_qdrant(query: str,
                            context_items: list ) -> str:
    """
    Formats a query and context items into a prompt for a language model.

    This function takes a query and a list of context items, formats them into a structured prompt, 
    and applies a chat template for use with a language model.

    Parameters:
            query (str): The question or query to be answered.
            context_items (list): A list of context items, where each item has a payload containing a "sentence_chunk".

    Returns:
    str: A formatted prompt string for use with a language model.
    """

    # Join context items into one dotted paragraph
    context = "- " + "\n- ".join([item.payload["sentence_chunk"] for item in context_items])

    base_prompt = f"""
    Контекст: 
    {context}\n\n
    Используя контекст, ответь на вопрос: {query}"""
    # Create prompt template for instruction-tuned model
    dialogue_template = [
        {"role": "user",
        "content": base_prompt}
    ]

    # Apply the chat template
    prompt = tokenizer.apply_chat_template(conversation=dialogue_template,
                                          tokenize=False,
                                          add_generation_prompt=True)
    return prompt

In [42]:
def give_answer_llm_qdrant(query: str,
                           collection_name: str = "gost_document_embeddings"):
    """
    Generates an answer to a query using context from multiple collections.

    Parameters:
            query (str): The query or question for which an answer is sought.

    Returns:
            str: The generated answer from the language model based on the provided query and context.
    """
    search_result = client.search(
                                collection_name=collection_name,
                                query_vector=embedding_model.encode(query),
                                limit=5
                            )
    
    print('Relevant information')
    for i in search_result:
        print(i)
    # Format prompt with context items
    prompt = prompt_formatter_qdrant(query=query,
                                    context_items=search_result)

    output_text = model_llm(prompt, 
                        max_tokens=1000, 
                        stop=model_llm.token_eos(), 
                        )
    return output_text

In [43]:
# Пример запроса
query = "Что такое качество излучения?"

output_text = give_answer_llm_qdrant(query)

print(f"Вопрос: {query}")
print_wrapped(f"RAG ответ:\n{output_text['choices'][0]['text']}")

Relevant information
id='d02d3ea0-fa3c-42c9-a715-58eb03e557d5' version=4 score=11.713477 payload={'document_name': '56236.pdf', 'page_number': 4, 'sentence_chunk': '3.30 облучение (irradiation): воздействие излучения на материалы или живые существа. в радиологии — воздействие на живой организм или материал ионизирующим излучением, например. рентгеновское облучение.3.31 выключатель облучения (irradiation switch): в радиологическом изделии — устройство управления, обеспечивающее начало и/или прекращение облучения.3.32 время облучения (irradiation time): продолжительность облучения, определяемая по специальным методам; обычно время, в течение которого радиационная величина превышает определенный уровень.3.33 излучение утечки (leakage radiation): ионизирующее излучение, прошедшее через защитный экран источника излучения, а также ионизирующее излучение, которое в рентгеновских генераторах некоторых типов выходит через родиационное окно перед нагрузкой и после (например, в аппарате с рентген

Llama.generate: prefix-match hit

llama_print_timings:        load time =     249.88 ms
llama_print_timings:      sample time =      19.87 ms /    53 runs   (    0.37 ms per token,  2667.74 tokens per second)
llama_print_timings: prompt eval time =       0.00 ms /     0 tokens (-nan(ind) ms per token, -nan(ind) tokens per second)
llama_print_timings:        eval time =     835.40 ms /    53 runs   (   15.76 ms per token,    63.44 tokens per second)
llama_print_timings:       total time =    1160.08 ms /    53 tokens


Вопрос: Что такое качество излучения?
RAG ответ: В контексте медицинской радиологии под термином "качество излучения"
понимается соответствие рентгеновского излучения определенным параметрам
нагрузки, обеспечивающим необходимое для медицинских целей изображение.
