In [25]:
# !pip install evaluate==0.4.3
# !pip install llama-cpp-python==0.1.9
# !pip install pinecone-client==5.0.1
# !pip install langchain_community==0.2.16
# !pip install langchain-chroma==0.1.4
# !pip install chromadb==0.5.11
# !pip install sentence-transformers==3.1.1
# !pip install ctransformers

In [26]:
from langchain_community.document_loaders import PDFMinerLoader, TextLoader, CSVLoader, UnstructuredWordDocumentLoader, UnstructuredHTMLLoader
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from multiprocessing.pool import ThreadPool
from langchain_chroma.vectorstores import Chroma
from langchain.schema import Document
from chromadb.config import Settings
from chromadb import Client
from llama_cpp import Llama
from evaluate import load
from typing import Any
from tqdm import tqdm
from pathlib import Path

In [27]:
import pandas as pd
import numpy as np
import statistics
import pinecone
import glob
import os

In [28]:
# Location for the vector index.
# NOTE(review): path_to_index is not referenced by any other visible cell — confirm it is still needed.
path_to_index = '/content/VDB'
# Directory containing the source documents to load and index.
path_to_documents = 'nlp-24-autumn/projects/dataset/20news-bydate-train/comp.graphics' #49960.txt

In [29]:
# Maps a file extension to the loader class for that format and the keyword arguments used to construct it
LOADER_MAPPING = {
    ".csv": (CSVLoader, {}),
    ".doc": (UnstructuredWordDocumentLoader, {}),
    ".docx": (UnstructuredWordDocumentLoader, {}),
    ".html": (UnstructuredHTMLLoader, {}),
    ".pdf": (PDFMinerLoader, {}),
    ".txt": (TextLoader, {"encoding": "ISO-8859-1"}),
}

In [30]:
# Configuration for vector search and text splitting
INDEX_NAME = "VDB"  # Name of the index that stores the vector representations
COLLECTION_NAME = "document_collection"  # Name of the Chroma collection used by ChromaCollector
EMBEDDINGS = "sentence-transformers/paraphrase-multilingual-mpnet-base-v2"  # Name of the embedding model used to vectorize texts
SIZE = 250  # Fragment size, in characters, used when splitting documents
OVERLAP = 50  # Overlap between consecutive fragments, in characters, to preserve context

## Loader

In [31]:
# Loads documents from files of several formats, picking the loader by file extension
class Loader:
    def load_single_document(self, file_path: str):
        """Load one file with the loader registered for its extension.

        Args:
            file_path: Path of the file to load.

        Returns:
            Whatever the selected loader's ``load()`` returns (callers extend
            a list with it, so it is expected to be an iterable of documents).

        Raises:
            ValueError: If the extension has no entry in ``LOADER_MAPPING``.
        """
        ext = Path(file_path).suffix.lower()
        if ext not in LOADER_MAPPING:
            raise ValueError(f"Unsupported file extension: {ext}")

        loader_class, loader_args = LOADER_MAPPING[ext]
        # Build a fresh kwargs dict for this call: writing file_path into the
        # dict stored in LOADER_MAPPING would mutate shared module state.
        loader = loader_class(**{**loader_args, "file_path": file_path})
        return loader.load()

    def load_documents(self, source_dir: str):
        """Load a single file, or every supported file below a directory.

        Args:
            source_dir: Path of a file or of a directory that is walked
                recursively.

        Returns:
            A list with the documents of all files that could be loaded.

        Raises:
            ValueError: If ``source_dir`` is a single file with an unsupported
                extension. Unsupported files met while walking a directory are
                reported with ``print`` and skipped.
        """
        if os.path.isfile(source_dir):
            # The path itself is the file to load.
            return list(self.load_single_document(source_dir))

        documents = []
        for root, _, files in os.walk(source_dir):
            for file_name in files:
                file_path = os.path.join(root, file_name)
                try:
                    documents.extend(self.load_single_document(file_path))
                except ValueError as e:
                    print(e)
        return documents

In [None]:
loader = Loader()

example_document = loader.load_documents(path_to_documents)

# Show a count and one sample instead of dumping every loaded document into the cell output.
print(f"Loaded {len(example_document)} documents")
print(example_document[:1])

## Splitter

In [33]:
# Splits documents into fixed-size character fragments with a configurable overlap
class Splitter:
    def __init__(self, chunk_size, chunk_overlap):
        """Store the splitting parameters.

        Args:
            chunk_size: Maximum fragment length in characters; must be positive.
            chunk_overlap: Number of characters shared by consecutive
                fragments; must satisfy ``0 <= chunk_overlap < chunk_size``.

        Raises:
            ValueError: If the parameters would make the splitting window
                stand still or move backwards.
        """
        if chunk_size <= 0:
            raise ValueError("chunk_size must be positive")
        if not 0 <= chunk_overlap < chunk_size:
            raise ValueError("chunk_overlap must satisfy 0 <= chunk_overlap < chunk_size")
        self.chunk_size = chunk_size
        self.chunk_overlap = chunk_overlap

    def split_documents(self, documents):
        """Cut every document into overlapping fragments.

        Args:
            documents: Iterable of objects exposing ``page_content`` (text)
                and ``metadata`` (dict).

        Returns:
            A list of ``{"text": ..., "metadata": ...}`` dicts in document
            order. Each fragment gets its own copy of the document metadata.
        """
        # The window start advances by this many characters per fragment.
        step = self.chunk_size - self.chunk_overlap

        fragments = []
        for document in documents:
            text = document.page_content
            for start in range(0, len(text), step):
                fragments.append({
                    # Slicing clamps at the end of the text, so the last fragment may be shorter.
                    "text": text[start:start + self.chunk_size],
                    # Copy so that editing one fragment's metadata does not affect its siblings.
                    "metadata": dict(document.metadata or {}),
                })
        return fragments


In [None]:
splitter = Splitter(SIZE, OVERLAP)

example_fragments = splitter.split_documents(example_document)

# Preview only: the full list holds every fragment of every loaded document.
print(f"{len(example_fragments)} fragments")
example_fragments[:3]

## Vector database

In [35]:
from sentence_transformers import SentenceTransformer

# Base class for embedders: wraps a SentenceTransformer model and exposes an encode() helper
class Embedder:
    def __init__(self, model_name: str):
        """Create the SentenceTransformer model identified by ``model_name``."""
        self.model = SentenceTransformer(model_name)

    def get_embedder(self):
        """Return the underlying model object used for vectorization."""
        return self.model

    def encode(self, texts: "list[str] | list[dict]"):
        """Encode texts with the wrapped model.

        Args:
            texts: A list of strings, a list of fragment dicts carrying their
                text under the ``"text"`` key (as produced by ``Splitter``),
                or a single string.

        Returns:
            Whatever ``self.model.encode`` returns for the plain texts.
        """
        if isinstance(texts, str):
            # A single string is passed through untouched.
            return self.model.encode(texts)

        # Unwrap fragment dicts explicitly so the model only ever receives text.
        plain_texts = [
            item["text"] if isinstance(item, dict) and "text" in item else item
            for item in texts
        ]
        return self.model.encode(plain_texts)

In [36]:
class HuggingFaceEmbedder(Embedder):
    """Embedder that defaults to the model named by the ``EMBEDDINGS`` constant."""

    def __init__(self, model_name=None):
        """Initialise the embedder.

        Args:
            model_name: Optional model identifier. When omitted, the
                module-level ``EMBEDDINGS`` constant is used; it is read at
                call time, so re-running the configuration cell takes effect.
        """
        super().__init__(EMBEDDINGS if model_name is None else model_name)

In [None]:
embedder = HuggingFaceEmbedder()

# Encode the fragment texts; the fragment dicts also carry metadata, which must not be embedded.
example_embedded_fragments = embedder.encode([fragment["text"] for fragment in example_fragments])

example_embedded_fragments

### Collector class

In [40]:
# Base class for a document collection: embeds texts and defines the add/search interface
class Collector:
    def __init__(self, splitter: "Splitter", embedder: "Embedder"):
        """Store the splitter and embedder and create a Loader for directory ingestion."""
        self.splitter = splitter
        self.embedder = embedder
        self.loader = Loader()

    def add(self, texts: list[str], metadatas: list[dict]):
        """Embed ``texts`` and pair every embedding with its metadata.

        Args:
            texts: Texts to embed.
            metadatas: One metadata dict per text, in the same order.

        Returns:
            A list of ``{"embedding": ..., "metadata": ...}`` dicts, one per text.

        Raises:
            ValueError: If ``texts`` and ``metadatas`` differ in length
                (pairing them would otherwise silently drop items).
        """
        if len(texts) != len(metadatas):
            raise ValueError(
                f"texts and metadatas must have the same length, got {len(texts)} and {len(metadatas)}"
            )
        if not texts:
            # Nothing to embed; skip the model call.
            return []
        embeddings = self.embedder.encode(texts)
        return [{"embedding": embedding, "metadata": metadata} for embedding, metadata in zip(embeddings, metadatas)]

    def add_from_directory(self, dir_path: str):
        """Load the documents under ``dir_path``, split them and add the fragments via ``add``."""
        documents = self.loader.load_documents(dir_path)
        fragments = self.splitter.split_documents(documents)

        texts = [fragment["text"] for fragment in fragments]
        metadatas = [fragment["metadata"] for fragment in fragments]
        self.add(texts, metadatas)

    def _cosine_similarity(self, vec1, vec2):
        """Return the cosine similarity of two vectors, or 0.0 if either has zero norm."""
        dot_product = sum(a * b for a, b in zip(vec1, vec2))
        norm1 = sum(a ** 2 for a in vec1) ** 0.5
        norm2 = sum(b ** 2 for b in vec2) ** 0.5
        if norm1 == 0 or norm2 == 0:
            # The angle to a zero vector is undefined; report "no similarity" instead of dividing by zero.
            return 0.0
        return dot_product / (norm1 * norm2)

    def query_documents(self, embedding, top_k):
        """Return the ``top_k`` stored items closest to ``embedding``.

        Raises:
            NotImplementedError: Always; concrete collectors must override this.
        """
        raise NotImplementedError("query_documents must be implemented by a concrete collector")

    @staticmethod
    def _result_score(result):
        """Extract a relevance score (higher is better) from one search result.

        Uses the result's ``score`` (dict key or attribute) when present.
        Dict results that only carry a ``distance`` are mapped with
        ``1 / (1 + distance)``, which decreases as the distance grows and
        equals 1 at distance 0.
        """
        if isinstance(result, dict):
            if "score" in result:
                return result["score"]
            if "distance" in result:
                return 1.0 / (1.0 + result["distance"])
            raise KeyError("search result has neither 'score' nor 'distance'")
        return result.score

    def get(self, search_strings: list[str], n_results: int) -> "list[Document]":
        """Search with every query string and concatenate the results.

        Args:
            search_strings: Query texts.
            n_results: Maximum number of results per query.

        Returns:
            The items returned by ``query_documents`` for each query, in query order.
        """
        search_embeddings = self.embedder.encode(search_strings)
        results = []
        for search_embedding in search_embeddings:
            results.extend(self.query_documents(search_embedding, top_k=n_results))
        return results

    def get_documents(self, search_string: str, n_results: int, score_threshold: float) -> "list[Document]":
        """Search with one query and keep the results whose score reaches the threshold.

        Args:
            search_string: Query text.
            n_results: Maximum number of results requested from ``query_documents``.
            score_threshold: Minimum score (see ``_result_score``) a result needs to be kept.

        Returns:
            The results of ``query_documents`` that pass the threshold, in their original order.
        """
        search_embedding = self.embedder.encode([search_string])[0]
        result_docs = self.query_documents(search_embedding, top_k=n_results)
        return [doc for doc in result_docs if self._result_score(doc) >= score_threshold]


class ChromaCollector(Collector):
    """Collector that stores fragments, embeddings and metadata in a Chroma collection."""

    def __init__(self, splitter: "Splitter", embedder: "HuggingFaceEmbedder", collection_name=None):
        """Attach to (or create) the backing Chroma collection.

        Args:
            splitter: Splitter used by ``add_from_directory``.
            embedder: Embedder used to vectorize texts and queries.
            collection_name: Optional collection name; defaults to the
                module-level ``COLLECTION_NAME``.
        """
        super().__init__(splitter, embedder)
        self.collection_name = COLLECTION_NAME if collection_name is None else collection_name
        self.client = Client()
        self.collection = self.client.get_or_create_collection(self.collection_name)
        # get_or_create_collection may hand back an existing, non-empty collection;
        # continue numbering after its records so newly generated ids do not repeat
        # "id_0", "id_1", ... that are already taken.
        self.doc_id_counter = self.collection.count()

    def add(self, texts: list[str], metadatas: list[dict]):
        """Embed ``texts`` and store them with their metadata under generated ids.

        Args:
            texts: Fragment texts to store.
            metadatas: One metadata dict per text, in the same order.
        """
        added_documents = super().add(texts, metadatas)
        if not added_documents:
            # Nothing to store.
            return

        embeddings = np.array([doc["embedding"] for doc in added_documents]).astype("float32")

        ids = [f"id_{self.doc_id_counter + i}" for i in range(len(added_documents))]
        self.doc_id_counter += len(added_documents)

        self.collection.add(
            documents=texts,
            embeddings=embeddings.tolist(),
            metadatas=[doc["metadata"] for doc in added_documents],
            ids=ids
        )

    def query_documents(self, query_embedding, top_k: int):
        """Return up to ``top_k`` stored fragments closest to ``query_embedding``.

        Args:
            query_embedding: Query vector; an object with ``tolist()`` (such as
                a numpy array) or a plain sequence of numbers.
            top_k: Maximum number of results.

        Returns:
            A list of dicts with the keys ``"id"``, ``"distance"``,
            ``"metadata"`` and ``"document"``, in the order returned by Chroma.
        """
        if hasattr(query_embedding, "tolist"):
            query_vector = query_embedding.tolist()
        else:
            query_vector = list(query_embedding)

        results = self.collection.query(
            query_embeddings=[query_vector],
            n_results=top_k
        )

        # Only one query vector is sent, so the first entry of each result field is the one of interest.
        return [
            {"id": doc_id, "distance": distance, "metadata": metadata, "document": document}
            for doc_id, distance, metadata, document in zip(
                results['ids'][0],
                results['distances'][0],
                results['metadatas'][0],
                results['documents'][0],
            )
        ]

    def clear(self):
        """Delete the backing collection and replace it with an empty one.

        Re-creating the collection keeps this instance usable after the call;
        the id counter restarts because no records remain.
        """
        self.client.delete_collection(self.collection_name)
        self.collection = self.client.get_or_create_collection(self.collection_name)
        self.doc_id_counter = 0

### Vector database implementation

In [41]:
# path_to_index = '/content/VDB' #@param {type:"string"}
# path_to_documents = '/content/tmp' #@param {type:"string"}

In [42]:
#Нужно написать реализацию векторной базы данных

## Search

In [43]:
# Search parameters
# NOTE(review): score_threshold is not passed to any call in the visible cells — confirm whether get_documents should be used.
query = 'How can I clean a suede jacket?' #@param {type:"string"}
n_results = 5 #@param {type:"integer"}
score_threshold = 0.5 # @param {type:"slider", min:0, max:1, step:0.1}


In [None]:
# Search experiment on a small example collection
example_splitter = Splitter(SIZE, OVERLAP)
example_embedder = HuggingFaceEmbedder()

# Delete whatever is stored in the collection, then attach a collector to a freshly created one.
# Only the example_* objects built in this cell are used, so the cell does not depend on
# `splitter` / `embedder` left behind by other cells.
ChromaCollector(example_splitter, example_embedder).clear()
example_collector = ChromaCollector(example_splitter, example_embedder)

example_documents = [
        "The sun is an average star.",
        "The moon orbits the Earth.",
        "I enjoy cooking and baking.",
        "The galaxy is vast and full of stars.",
        "Artificial intelligence is transforming industries."
    ]

example_metadatas = [
    {'file_path': 'dir_1'},
    {'file_path': 'dir_2'},
    {'file_path': 'dir_3'},
    {'file_path': 'dir_4'},
    {'file_path': 'dir_5'},
    ]


example_collector.add(example_documents, example_metadatas)
example_collector.collection.peek()

example_query = "What is sun?"
example_query_embedding = example_embedder.encode([example_query])[0]
example_collector.query_documents(example_query_embedding, 2)

In [45]:
# Build the splitter, embedder and collector used for the document corpus
splitter = Splitter(SIZE, OVERLAP)
embedder = HuggingFaceEmbedder()
collector = ChromaCollector(splitter, embedder)

# Delete the collection (dropping anything stored in it so far), then attach a new collector
collector.clear()
collector = ChromaCollector(splitter, embedder)

In [46]:
# NOTE(review): this rebinds path_to_documents to a different relative path than the one
# assigned in the configuration cell near the top — confirm which location is correct.
path_to_documents = '../../dataset/20news-bydate-train/comp.graphics'

# Load, split, embed and store every supported file found under the directory
collector.add_from_directory(path_to_documents)

In [None]:
# Inspect stored records; the argument 500 is the number of records requested.
# NOTE(review): a smaller preview would keep the cell output short.
collector.collection.peek(500)

In [None]:
search_queries = ["What tools does include ImageMagick?"]

# A dedicated loop variable keeps the `query` parameter set in the search-parameters cell intact.
for search_query in search_queries:
    print(f"\nSearch query: {search_query}")
    results = collector.get([search_query], n_results=n_results)

    for rank, doc in enumerate(results, start=1):
        print(f"Result {rank}: {doc}")

## Evaluation

In [49]:
# Evaluates a collector: runs searches, finds the rank of the expected answer and aggregates statistics
class CollectorEvaluator:
    def __init__(self, collector: "Collector", n_top=5):
        """Store the collector under test and the number of results requested per query."""
        self.collector = collector
        self.n_top = n_top

    def explore_collector(self, text: str):
        """Return the collector's top ``n_top`` results for the query ``text``."""
        return self.collector.get([text], n_results=self.n_top)

    def eval(self, query: str, answer: str):
        """Find the rank at which ``answer`` shows up among the results for ``query``.

        The query, the expected answer and every result are printed.

        Args:
            query: Search text.
            answer: Text expected to occur inside a returned document.

        Returns:
            The 1-based position of the first result whose ``"document"``
            contains ``answer``, or ``None`` if no result does.
        """
        collector_results = self.explore_collector(query)

        print(f"\nSearch query: {query},\nanswer: {answer}")

        for i, doc in enumerate(collector_results, start=1):
            print(f"Result {i}: {doc}")

        return next(
            (
                rank
                for rank, doc in enumerate(collector_results, start=1)
                # A result without document text cannot contain the answer.
                if answer in (doc["document"] or "")
            ),
            None,
        )

    def calculate_statistics(self, data: list[int]):
        """Summarise answer ranks.

        Args:
            data: Ranks as returned by ``eval``; ``None`` marks a query whose
                answer was not found.

        Returns:
            A dict with the keys ``"min"``, ``"max"``, ``"mean"`` (over the
            found ranks; ``None`` when nothing was found) and ``"not_found"``
            (share of queries without a hit; ``None`` when ``data`` is empty).
        """
        found_ranks = [serial_number for serial_number in data if serial_number is not None]
        # Same key in every branch, and a meaningful ratio even when nothing was found.
        not_found_ratio = (len(data) - len(found_ranks)) / len(data) if data else None

        if not found_ranks:
            return {"min": None, "max": None, "mean": None, "not_found": not_found_ratio}

        return {
            "min": min(found_ranks),
            "max": max(found_ranks),
            "mean": sum(found_ranks) / len(found_ranks),
            "not_found": not_found_ratio,
        }

    def explore_and_calculate(self, data: list[tuple[str, str]]):
        """Run ``eval`` for every ``(query, answer)`` pair and return the statistics of the ranks."""
        ranks = [self.eval(query, answer) for query, answer in data]
        return self.calculate_statistics(ranks)

In [50]:
# Evaluation parameters
# NOTE(review): path_to_dataset and n_lines are not referenced by any visible cell — confirm they are still needed.
path_to_dataset = '/content/QA.csv' #@param {type:"string"}
n_lines = 100 #@param {type:"integer"}
n_top = 10 #@param {type:"integer"}



In [None]:
# Task: write an experiment that evaluates the resulting collection (example data)
# NOTE(review): ChromaCollector attaches to the collection named COLLECTION_NAME. If the cells above
# have already cleared that collection and indexed the document directory, the example sentences
# searched for here are no longer stored — confirm the intended cell order.
example_collector = ChromaCollector(example_splitter, example_embedder)
example_evaluator = CollectorEvaluator(example_collector, n_top=n_top)

# (query, text expected inside a returned document) pairs
example_data = [
    ("What is the sun?", "The sun is an average star."),
    ("What is the moon?", "The moon orbits the Earth."),
    ("Cooking tips", "I enjoy cooking and baking.")
]

example_stats = example_evaluator.explore_and_calculate(example_data)
print("Статистика по позициям релевантных ответов:", example_stats)


In [52]:
# (query, expected text) pairs for the retrieval evaluation: a query counts as answered
# when the expected text occurs inside one of the returned documents.
data = [
    ('What are some examples of toolkits that can be used for image format conversion and basic image manipulations?', 'umber of toolkits for converting from one image format to\nanother, doing simple image manipulations such as size scaling, plus\nthe above-mentioned 24 -> 8, color -> gray, gray -> b&w conversions.\nHere are pointers to some of them:\n\n    xv by John Bra'),
    ('What techniques are discussed in the context of quantizing 24-bit images down to 8 bits, and where can one find a relevant reference on this topic?', 'for\nshading, chapter 19 for clipping, and branch out from there.\n\n\n3) Quantizing 24 bit images down to 8 bits.\n\nFind a copy of "Color Image Quantization for Frame Buffer Display" by\nPaul Heckbert, SIGGRAPH \'82 Proceedings, page 297.  There are other\n'),
    ('How to FTP by email', ' 9) Converting between vector formats.\n    10) How to get Pixar films.\n    11) How do I draw a circle as a Bezier (or B-spline) curve?\n    12) How to order standards documents.\n    13) How to FTP by email.\n    14) How to tell whether a point is withi'),
    ('What steps should you take to obtain information about using the mail handler and software distribution?', ' exercises.  To receive information describing\nhow you can use the mail handler, simply mail graphtext@cs.brown.edu\nand put the word "Help" in the Subject line.  Use the Subject line\n"Software-Distribution" to receive information specifically concern'),
    ('How to join ACM/SIGGRAPH\n', 'trace height fields\n    24) How to find the area of a 3D polygon\n    25) How to join ACM/SIGGRAPH\n    26) Where can I find MRI and CT scan volume data?\n    27) Specific references on spatial data structures including quadtrees\n\tand octrees\n    28) Wh'),
    ('How to get general information about the\nmail server?', '/news.answers/pictures-faq/part1\nsend usenet/news.answers/pictures-faq/part2\n\nSend a message containing "help" to get general information about the\nmail server.\n\nAlso, you could check out the resources described in sections 7, 8, and\n20 above for mor'),
    ('How many tool the kit contains on image manipulation, digital halftoning?', 'rting pixels of arbitrary channels,\n    components, and bit precisions while allowing compression and machine\n    byte-order independence.  The kit contains more than 50 tools with\n    extensive support of image manipulation, digital halftoning and f'),
    ('A Fast Algorithm for Raster\nRotation', 'implementation is\nalso present in PBMPLUS.  Reference: "A Fast Algorithm for Raster\nRotation", by Alan Paeth (awpaeth@watcgl.waterloo.edu) Graphics\nInterface \'86 (Vancouver).  An article on the IM toolkit appears in\nthe same journal.  An updated vers'),
    ('What are some examples of formats that can be converted or rendered by commercial PostScript clones for PCs?', " to Sun raster format, or HPGL to\nX11 bitmap.  For example, some of the commercial PostScript clones for\nPC's allow you to render to a disk file as well as a printer.  Also,\nthe PostScript interpreters in the NeXT box and in Sun's X11/NeWs can\nbe use"),
    ('Why is assembly language used for over 100 functions in the graphical interface?', "short or floating point arithmetic to maintain the precision\n  and accuracy of the pixel format. Over 100 functions are hand-coded in\n  assembly language for maximum speed on the Intel hardware.  The entire\n  graphical interface is also written in as")
]

In [None]:
# Task: write an experiment that evaluates the resulting collection
# A new ChromaCollector attaches to the collection filled by the indexing cell above;
# the query/answer pairs in `data` are then searched and the answer ranks summarised.
collector = ChromaCollector(splitter, embedder)
evaluator = CollectorEvaluator(collector, n_top=n_top)

stats = evaluator.explore_and_calculate(data)
print("Статистика по позициям релевантных ответов:", stats)

In [54]:
def generate(model_path, prompts, n_ctx=2000, top_k=30, top_p=0.9, temperature=0.2, repeat_penalty=1.1, max_tokens=None):
    """Generate one completion per prompt with a llama.cpp model.

    Args:
        model_path: Path of the model file passed to ``Llama``.
        prompts: A single prompt string or a list of prompt strings.
        n_ctx: Context size passed to ``Llama``.
        top_k: Sampling parameter forwarded to the model call.
        top_p: Sampling parameter forwarded to the model call.
        temperature: Sampling parameter forwarded to the model call.
        repeat_penalty: Repetition penalty forwarded to the model call.
        max_tokens: Optional completion-length limit; when ``None`` the
            argument is not passed and the library default applies.

    Returns:
        A list with the raw response of the model for every prompt, in order.
    """
    if isinstance(prompts, str):
        prompts = [prompts]
    if not prompts:
        # Nothing to generate: skip loading the model.
        return []

    # The model is created once per call and shared by all prompts of that call.
    llm = Llama(model_path=model_path, n_ctx=n_ctx)

    sampling = {
        "top_k": top_k,
        "top_p": top_p,
        "temperature": temperature,
        "repeat_penalty": repeat_penalty,
    }
    if max_tokens is not None:
        sampling["max_tokens"] = max_tokens

    return [llm(prompt, **sampling) for prompt in prompts]

In [55]:
class QuestionAndAnswers:
    """One question with its reference answer, plus the generated answer and prompt once available."""

    def __init__(self, question, correct_answer, generated_answer=None, prompt=None):
        # The question and its reference answer are given up front.
        self.question, self.correct_answer = question, correct_answer
        # The model output and the prompt may be filled in later.
        self.generated_answer, self.prompt = generated_answer, prompt

    def __repr__(self):
        """Return the question, the reference answer and the generated answer, one per line."""
        rendered = f"Q: {self.question}\nCorrect: {self.correct_answer}\nGenerated: {self.generated_answer}\n"
        return rendered

In [56]:
class Dataset:
    """Container for a list of question/answer items, exposed as ``qa_list``."""

    def __init__(self, qa_list):
        self.qa_list = qa_list

def get_prompt(question, context):
    """Build the prompt asking the model to answer ``question`` from ``context``.

    Args:
        question: The question text.
        context: Either a ready-made context string, or a list/tuple of
            search results. For result dicts with a ``"document"`` key only
            that text is used; other items are converted with ``str``.

    Returns:
        The prompt string.
    """
    if isinstance(context, (list, tuple)):
        # Feed the model the retrieved text, not the Python repr of ids, distances and metadata.
        context = "\n".join(
            str(item["document"]) if isinstance(item, dict) and "document" in item else str(item)
            for item in context
        )
    return f'Answer the question: "{question}", using context: {context}'

### Generation

In [None]:
# Task: write an experiment that generates text (an answer to a question) with the generate function
# Path of the model file handed to generate()
model_file="../assets/models/mistral-7b-openorca.Q4_K_M.gguf" 

# Single-question example dataset
example_dataset = Dataset([
    QuestionAndAnswers(
        question='What is sun?',
        correct_answer='The sun is an average star.'),
])

# For every question: retrieve one result as context, build the prompt, generate and print the answer.
# generated_answer holds the list returned by generate() (one response per prompt).
for qa in example_dataset.qa_list:
  context = example_collector.get([qa.question], n_results=1)
  qa.prompt = get_prompt(qa.question, context)

  qa.generated_answer = generate(model_file, [qa.prompt])
  print(qa)

In [None]:
# Text of the first choice of the first response; `qa` is whatever item the preceding loop left bound.
qa.generated_answer[0]['choices'][0]['text']

In [None]:
# Task: write an experiment that generates text (an answer to a question) with the generate function
# Path of the model file handed to generate()
model_file="../assets/models/mistral-7b-openorca.Q4_K_M.gguf" 

# Questions with reference answers used for generation and scoring.
# NOTE(review): these questions concern space and physics topics, while the collection indexed
# above is built from the comp.graphics directory — confirm the intended corpus.
dataset = Dataset([
    QuestionAndAnswers(
        question='How does Tom Van Flandern view the concept of "dark matter" and other unobservable phenomena in physics?',
        correct_answer='Tom Van Flandern is skeptical of "dark matter" and other unobservable, purely theoretical constructs in physics, such as quarks and black holes. He questions whether their existence can be inferred solely from theory, suggesting that existence should be tied to observability.'
        ),
        QuestionAndAnswers(
        question='What is the main point of disagreement between Tom Van Flandern and Bruce Scott on the concept of existence in physics?',
        correct_answer='The main disagreement is that Bruce Scott argues "existence" should be synonymous with "observable" in physics, while Van Flandern challenges this view, particularly when considering phenomena like curvature, which he argues cannot exist without something "non-curved" to compare it to.'
        ),
        QuestionAndAnswers(
        question='According to Nikola Tesla, why does he believe that space cannot be curved?',
        correct_answer='Nikola Tesla argues that space cannot be curved because it has no properties on its own. He believes properties only apply to matter within space, and saying that large bodies curve space implies "something can act upon nothing," a view he does not support.'
        ),
        QuestionAndAnswers(
        question='What is the escape velocity equation in a circular orbit, and how does it relate to circular orbital velocity?',
        correct_answer=' The escape velocity Vesc in a circular orbit is given by the equation Vesc = sqrt(2 * M * G / r) = sqrt(2) * Vс  is the circular orbital velocity. This means the escape velocity is approximately 1.41 times the circular orbital velocity.'
        ),
        QuestionAndAnswers(
        question='What is the formula for calculating the Schwarzschild radius of a black hole, and what constants does it involve?',
        correct_answer="The Schwarzschild radius of a black hole is calculated using the formula 2GM/c^2, where G is Newton's gravitational constant, M is the mass of the black hole, and c is the speed of light."
        ),
        QuestionAndAnswers(
        question='Where are the Saturn V blueprints kept, and what is the main challenge in recreating the rocket?',
        correct_answer='The Saturn V blueprints are kept at the Marshall Space Flight Center on microfilm. The main challenge in recreating the rocket is not finding the drawings, but sourcing 1960s-era hardware, such as guidance components, and the fact that launch facilities have been modified for the Space Shuttle.'
        ),
        QuestionAndAnswers(
        question="Why isn't data from space missions immediately available to the public after collection?",
        correct_answer="NASA allows mission investigators exclusive access to data for one year after it's collected, giving them a chance to analyze and publish their results without competition. However, NASA often releases sample photos to the public early in a mission."
        ),
        QuestionAndAnswers(
        question="What is the estimated environmental impact of the Space Shuttle's Solid Rocket Boosters on the ozone layer?",
        correct_answer="The impact of the Space Shuttle's Solid Rocket Boosters on the ozone layer is minimal, contributing less than 0.25% of total stratospheric chlorine sources. The effect on global ozone levels is estimated to be a decrease of only 0.0065%."
        ),
        QuestionAndAnswers(
        question='What risks are associated with nuclear (RTG) power sources on space probes, and what evidence exists about their safety?',
        correct_answer='Studies suggest that risks from nuclear RTG power sources on space probes are very low, even in worst-case scenarios, such as launch failures or reentry. For example, in 1968, two RTGs were recovered intact after a satellite failure, and in 1970, the Apollo 13 RTG fell into the ocean and remains safely contained.'
        ),
        QuestionAndAnswers(
        question="Why can't the Space Shuttle be used for missions beyond low Earth orbit?",
        correct_answer='The Space Shuttle cannot be used for missions beyond low Earth orbit because it lacks sufficient fuel and is not designed for such missions. Its wings and other structural features are only useful near Earth, making it inefficient and costly for higher orbits.'
        ),
])

# Retrieve one result per question as context and build the prompts first.
for qa in dataset.qa_list:
  context = collector.get([qa.question], n_results=1)
  qa.prompt = get_prompt(qa.question, context)

# A single generate() call answers all prompts, so the model is loaded once
# instead of once per question.
responses = generate(model_file, [qa.prompt for qa in dataset.qa_list])

for qa, response in zip(dataset.qa_list, responses):
  # Keep the list-of-responses shape that later cells index as generated_answer[0].
  qa.generated_answer = [response]
  print(qa)

In [None]:
# Print every item of the dataset (each item renders its question, reference answer and generated answer)
print(dataset.qa_list)

### Evaluation

In [None]:
import bert_score
from typing import List

class BERTScoreEvaluator:
    """Scores generated answers against reference answers with BERTScore."""

    def __init__(self, model_type='distilbert-base-uncased'):
        """Store the model type passed to ``bert_score.score``."""
        self.model_type = model_type

    def evaluate(self, reference: str, generated: str):
        """Score one generated text against one reference.

        Returns:
            A dict with the keys ``'precision'``, ``'recall'`` and ``'f1'``.
        """
        P, R, F1 = bert_score.score([generated], [reference], model_type=self.model_type)
        return {
            'precision': P.item(),
            'recall': R.item(),
            'f1': F1.item()
        }

    @staticmethod
    def _answer_text(qa):
        """Return the generated answer of ``qa`` as plain text.

        Accepts a plain string, or the list of raw responses stored by the
        generation cells (the text of the first choice of the first response
        is used).

        Raises:
            ValueError: If the item has no generated answer yet.
        """
        generated = qa.generated_answer
        if generated is None:
            raise ValueError(f"No generated answer for question: {getattr(qa, 'question', None)!r}")
        if isinstance(generated, str):
            return generated
        return generated[0]['choices'][0]['text']

    def evaluate_dataset(self, dataset: "Dataset"):
        """Score all generated answers of ``dataset`` against their references.

        The references and the generated texts are printed before scoring.

        Returns:
            A dict with the keys ``'precision_mean'``, ``'recall_mean'`` and
            ``'f1_mean'``.

        Raises:
            ValueError: If the dataset is empty or an item has no generated answer.
        """
        if not dataset.qa_list:
            raise ValueError("dataset contains no question/answer items")

        references = [qa.correct_answer for qa in dataset.qa_list]
        generated_answers = [self._answer_text(qa) for qa in dataset.qa_list]

        print(references)
        print(generated_answers)

        P, R, F1 = bert_score.score(generated_answers, references, model_type=self.model_type)
        return {
            'precision_mean': P.mean().item(),
            'recall_mean': R.mean().item(),
            'f1_mean': F1.mean().item()
        }


In [None]:
# Task: write an experiment that evaluates the generated answers
# Score every generated answer in `dataset` against its reference answer and print the mean precision/recall/F1
bertScoreEvaluator = BERTScoreEvaluator()

bert_result = bertScoreEvaluator.evaluate_dataset(dataset)
print(bert_result)