In [None]:
import pandas as pd
import re
import pickle
import torch

from tqdm import tqdm

from scipy.spatial.distance import cosine

import numpy as np

from IPython.display import clear_output

from langchain.document_loaders import PyPDFLoader
import langchain_core
from langchain_core.documents.base import Document
from langchain.vectorstores import Chroma
import langchain
from langchain.chat_models import gigachat
from langchain.schema import HumanMessage, SystemMessage
from langchain.chat_models.gigachat import GigaChat

from langchain.prompts import PromptTemplate
from langchain.chains import RetrievalQA
from langchain.retrievers.self_query.base import SelfQueryRetriever
from langchain.chains.query_constructor.base import AttributeInfo
from sentence_transformers import SentenceTransformer

from rank_bm25 import BM25Okapi

import nltk
from nltk import WordPunctTokenizer
from nltk.corpus import stopwords

# Fetch the NLTK resources this notebook needs (tokenizer tables and stop-word corpora).
nltk.download('punkt_tab')
nltk.download('stopwords')
# Russian stop-word list read from the NLTK corpus downloaded just above.
russian_stopwords = stopwords.words("russian")

import chromadb
# Default chromadb client. It is not referenced again in the visible cells: the vector
# stores below are created through LangChain's Chroma.from_documents instead.
chroma_client = chromadb.Client()

from google.colab import userdata
# GigaChat credentials are read from the Colab secret named 'GIGACHAT',
# so the token itself never appears in the notebook.
API_TOKEN = userdata.get('GIGACHAT')


from retrieval_modules import *
from validation.retriever.metrics import *
from validation_generation import *

from langchain.text_splitter import (RecursiveCharacterTextSplitter,
                                    SentenceTransformersTokenTextSplitter,
                                    TokenTextSplitter,
                                    NLTKTextSplitter,
                                    SpacyTextSplitter
                                    )


# Prefer the GPU whenever PyTorch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'


from validation.retriever.splitters import all_splitters

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/128 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/140k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/690 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.12G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.18k [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.1M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/964 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/271 [00:00<?, ?B/s]

### Читаем документы и вопросы

##### Загружаем документы

In [4]:
# One CSV per source document; the next cell reads the 'content' and
# 'Header_1'..'Header_3' columns from each of these frames.
document_1, document_2, document_3 = (
    pd.read_csv(csv_path)
    for csv_path in ('risk1.csv', 'risk2.csv', 'requirements.csv')
)

In [5]:
def _frame_to_documents(frame):
    """Convert every row of a parsed-document frame into a LangChain ``Document``.

    Args:
        frame: DataFrame with a 'content' column (becomes ``page_content``) and
            'Header_1', 'Header_2', 'Header_3' columns (become the metadata
            keys 'header_1', 'header_2', 'header_3').

    Returns:
        list of ``Document`` objects, one per row, in row order.
    """
    return [
        Document(
            page_content=row['content'],
            metadata={
                'header_1': row['Header_1'],
                'header_2': row['Header_2'],
                'header_3': row['Header_3'],
            },
        )
        for _, row in frame.iterrows()
    ]


# Same conversion for all three sources; a single helper replaces three copy-pasted loops.
docs_1 = _frame_to_documents(document_1)
docs_2 = _frame_to_documents(document_2)
docs_3 = _frame_to_documents(document_3)


In [6]:
# Single corpus used for indexing: all three sources, kept in source order.
whole_doc = [*docs_1, *docs_2, *docs_3]

##### Загружаем датасет с вопросами

In [7]:
# Three query sets (concatenated later for the retriever metrics)
# plus the separate end-to-end set.
queries_1, queries_2, queries_3, end_to_end = map(
    pd.read_csv,
    ('queries1.csv', 'queries2.csv', 'queries3.csv', 'end_to_end.csv'),
)

## Валидация ретривера

In [8]:
def count_all_results(splitters, embedder, persist_root='docs'):
    """Run the retriever metric sweep over every splitter for one embedder.

    For each splitter the combined corpus ``whole_doc`` is re-chunked and
    indexed into its own persisted Chroma store. That store is then evaluated
    with ``count_metrics`` under each (strategy, fusion_alpha) combination on
    the concatenation of ``queries_1``, ``queries_2`` and ``queries_3``.

    Args:
        splitters: iterable of text splitters; each must provide
            ``split_documents`` and a ``name`` attribute (used in result keys).
        embedder: embedding object passed to Chroma and to ``count_metrics``.
        persist_root: directory under which the Chroma stores are written, laid
            out as ``<persist_root>/<embedder class name>/<splitter index>/``.

    Returns:
        dict mapping ``splitter.name + strategy + str(fusion_alpha)`` to the
        value returned by ``count_metrics`` for that configuration.
    """
    # Chroma.from_documents adds to whatever collection already exists in
    # persist_directory, so a path shared between embedders would mix vectors
    # produced by different models. Namespace the stores by embedder type.
    # NOTE(review): re-running with the same embedder type still appends to the
    # existing store; clear the directory between runs if duplicates matter.
    embedder_tag = type(embedder).__name__

    # (strategy, fusion_alpha) pairs evaluated for every splitter.
    search_setups = [('mmr', 1.), ('ss', 1.), ('ss', 0.8), ('ss', 0.6), ('ss', 0.4)]

    results = {}
    # enumerate(tqdm(...)) instead of tqdm(enumerate(...)): tqdm can then read
    # the length of a sized `splitters` and display a total / ETA.
    for i, splitter in enumerate(tqdm(splitters)):
        splitted_docs = splitter.split_documents(whole_doc)

        vectordb = Chroma.from_documents(
            documents=splitted_docs,
            embedding=embedder,
            persist_directory=f'{persist_root}/{embedder_tag}/{i}/'
        )
        vectordb.persist()

        for strategy, fusion_alpha in search_setups:
            name = splitter.name + strategy + str(fusion_alpha)
            retriever = Retriever(db=vectordb, strategy=strategy, fusion_alpha=fusion_alpha)

            # The query frame is rebuilt for every call, as before, so each
            # evaluation gets its own fresh DataFrame.
            # NOTE(review): 5 is forwarded as count_metrics' second argument;
            # its meaning is defined in the imported helper.
            results[name] = count_metrics(
                retriever,
                5,
                pd.concat([queries_1, queries_2, queries_3]),
                embedder,
            )
    return results

### Валидируем эмбеддер, сплиттер, стратегию, fusion

#### E5 large embedder

In [None]:
# multilingual-e5-large: load the model on `device` and wrap it in Embedder_wrapper
# (the object Chroma receives as its embedding).
embedder_e5 = Embedder_wrapper(
    SentenceTransformer("intfloat/multilingual-e5-large").to(device)
)

# Full splitter / strategy / fusion sweep for this embedder.
e5_large_results = count_all_results(splitters=all_splitters, embedder=embedder_e5)

# Cache the sweep so the comparison cells can reload it without recomputing.
with open('./results_e5.pkl', 'wb') as f:
    f.write(pickle.dumps(e5_large_results))

#### E5 large instruct embedder

In [None]:
# multilingual-e5-large-instruct: load the model on `device` and wrap it in its
# dedicated Embedder_wrapper_e5_instruct.
embedder_e5_instruct = Embedder_wrapper_e5_instruct(
    SentenceTransformer('intfloat/multilingual-e5-large-instruct').to(device)
)

# Full splitter / strategy / fusion sweep for this embedder.
e5_large_instruct_results = count_all_results(
    splitters=all_splitters,
    embedder=embedder_e5_instruct,
)

# Cache the sweep so the comparison cells can reload it without recomputing.
with open('./results_e5_instruct.pkl', 'wb') as f:
    f.write(pickle.dumps(e5_large_instruct_results))

#### Nomic embedder

In [None]:
# nomic-embed-text-v1: the model ships custom code, hence trust_remote_code=True.
embedder_nomic = Embedder_wrapper_nomic(
    SentenceTransformer("nomic-ai/nomic-embed-text-v1", trust_remote_code=True).to(device)
)

# Full splitter / strategy / fusion sweep for this embedder.
nomic_results = count_all_results(splitters=all_splitters, embedder=embedder_nomic)

# Cache the sweep so the comparison cells can reload it without recomputing.
with open('./results_nomic.pkl', 'wb') as f:
    f.write(pickle.dumps(nomic_results))

#### Сравнение результатов

In [None]:
# Reload the three cached sweeps written by the embedder cells above.
# These pickles are produced by this notebook; do not point this at untrusted files.
with open('./results_e5.pkl', 'rb') as f:
    e5_large_results = pickle.loads(f.read())

with open('./results_e5_instruct.pkl', 'rb') as f:
    e5_large_instruct_results = pickle.loads(f.read())

with open('./results_nomic.pkl', 'rb') as f:
    nomic_results = pickle.loads(f.read())

In [39]:
# Display the configuration get_best_result selects from the multilingual-e5-large sweep.
get_best_result(e5_large_results)

{'rec_0_1024ss0.8': {'doc_doc_metric_prec =': 0.9704668527436086,
  'doc_doc_metric_rec =': 0.8682144148199921,
  'doc_answer_metric_mean =': 0.8900558444233843,
  'doc_answer_metric_min =': 0.919416591225415,
  'doc_answer_metric_max =': 0.865429140360462,
  'confident_metric =': 0.8543483736489268}}

In [41]:
# Display the configuration get_best_result selects from the multilingual-e5-large-instruct sweep.
get_best_result(e5_large_instruct_results)

{'rec_0_2048ss0.8': {'doc_doc_metric_prec =': 0.9842851872572997,
  'doc_doc_metric_rec =': 0.9070975875136202,
  'doc_answer_metric_mean =': 0.9176546164532624,
  'doc_answer_metric_min =': 0.9358704782023652,
  'doc_answer_metric_max =': 0.9023345307871572,
  'confident_metric =': 0.9003318538259646}}

In [40]:
# Display the configuration get_best_result selects from the nomic-embed-text-v1 sweep.
get_best_result(nomic_results)

{'tok_0_1024ss0.8': {'doc_doc_metric_prec =': 0.95607751688986,
  'doc_doc_metric_rec =': 0.8420569504729795,
  'doc_answer_metric_mean =': 0.747104670458384,
  'doc_answer_metric_min =': 0.771534143616316,
  'doc_answer_metric_max =': 0.7224761812854643,
  'confident_metric =': 0.7316245495816368}}

#### Вывод


Лучший результат достигается при таких параметрах: \
embedder = E5_large_instruct \
splitter = recursive_text_splitter \
chunk_size = 2048 \
chunk_overlap = 0 \
strategy = similarity search \
fusion = 0.8

### Валидируем блоки улучшения запроса

In [10]:
# Re-create the winning embedder so this section runs without the sweep cells.
embedder_e5_instruct = Embedder_wrapper_e5_instruct(
    SentenceTransformer('intfloat/multilingual-e5-large-instruct').to(device)
)



In [11]:
# Index the corpus with the winning setup: recursive splitter, 2048-character chunks, no overlap.
splitter = RecursiveCharacterTextSplitter(
    chunk_size=2048,
    chunk_overlap=0,
    # NOTE(review): '\n' is listed before '\n\n' and '\n\n\n'. The recursive splitter tries
    # separators in order, so the longer newline runs may never be used - confirm the order.
    separators=['\n', '\n\n', '.', '\n\n\n'],
)

splitted_docs = splitter.split_documents(whole_doc)

# Store persisted under 'docs/'.
vectordb = Chroma.from_documents(
    persist_directory='docs/',
    embedding=embedder_e5_instruct,
    documents=splitted_docs,
)

In [12]:
# Chat model used by the query-enrichment retrievers and the LLM reranker below.
model = GigaChat(
    # Index 0 selects the base "GigaChat" model; switch to 1 for "GigaChat-Pro".
    model=("GigaChat", "GigaChat-Pro")[0],
    scope="GIGACHAT_API_PERS",
    credentials=API_TOKEN,
    # Disables the check for the Russian Ministry of Digital Development (NUC) certificates
    verify_ssl_certs=False,
)


In [None]:
# Retrieval settings of the best configuration found in the sweep.
best_params = dict(db=vectordb, strategy='ss', fusion_alpha=0.8)

# Baseline without query enrichment, plus the answer-style enrichment variant.
retriever_common = Retriever(**best_params)
retriever_enrich_answer = EnrichAsAnswerRetriever(chat_model=model, **best_params)

# Query-style and correction-style enrichment, each with query_count 1, 3 and 5.
retriever_enrich_query_1, retriever_enrich_query_3, retriever_enrich_query_5 = (
    EnrichAsQueryRetriever(chat_model=model, query_count=n_queries, **best_params)
    for n_queries in (1, 3, 5)
)
retriever_enrich_correction_1, retriever_enrich_correction_3, retriever_enrich_correction_5 = (
    EnrichAsCorrectionRetriever(chat_model=model, query_count=n_queries, **best_params)
    for n_queries in (1, 3, 5)
)


In [None]:
# Order matters: results are later labelled by position, so index 0 is the plain baseline.
retrievers = [
    retriever_common, retriever_enrich_answer,                    # 0, 1
    retriever_enrich_query_1, retriever_enrich_correction_1,      # 2, 3
    retriever_enrich_query_3, retriever_enrich_correction_3,      # 4, 5
    retriever_enrich_query_5, retriever_enrich_correction_5,      # 6, 7
]

In [None]:
# One count_metrics result per retriever, in the same order as `retrievers`.
metrics = [
    count_metrics(
        candidate,
        5,
        pd.concat([queries_1, queries_2, queries_3]),
        embedder_e5_instruct,
    )
    for candidate in retrievers
]

In [None]:
# Key each result by its position in `retrievers` ('retriever_0' is the baseline).
# Guarded so that re-running this cell is harmless: without the check a second run
# would enumerate the dict's keys and overwrite the results with their own names.
if not isinstance(metrics, dict):
    metrics = {f'retriever_{i}' : metric for i, metric in enumerate(metrics)}

In [34]:
# Display the retriever get_best_result selects among the query-enrichment variants.
get_best_result(metrics)

{'retriever_0': {'doc_doc_metric_prec =': 0.9842851841485388,
  'doc_doc_metric_rec =': 0.9070975826755923,
  'doc_answer_metric_mean =': 0.9176546128695683,
  'doc_answer_metric_min =': 0.9358704764563508,
  'doc_answer_metric_max =': 0.9023345229752581,
  'confident_metric =': 0.9003318579552186}}

#### Вывод

На данный момент блоки улучшения запроса не дают прироста качества: лучший результат по-прежнему показывает базовый ретривер (retriever_0).

### Валидируем реранкеры

In [37]:
from sentence_transformers import CrossEncoder

# Cross-encoder used for reranking. It runs on the `device` selected at the top of the
# notebook rather than a hard-coded 'cuda', so the cell also works on a CPU-only runtime.
reranker_model = CrossEncoder('DiTy/cross-encoder-russian-msmarco', max_length=512, device=device)


In [38]:
# The two rerankers under comparison.
llm_ranker, ce_ranker = (
    LLM_Ranker(model),           # built around the GigaChat chat model
    CE_ranker(reranker_model),   # built around the cross-encoder
)

In [39]:
# Same retrieval settings as the best configuration from the sweep.
best_params = dict(db=vectordb, strategy='ss', fusion_alpha=0.8)

# Baseline without reranking, then one retriever per reranker (rerank_k=10 for both).
retriever_common = Retriever(**best_params)
retriever_llm_ranker, retriever_ce_ranker = (
    Retriever(reranker=ranker, rerank_k=10, **best_params)
    for ranker in (llm_ranker, ce_ranker)
)


In [21]:
# Baseline: metrics of the retriever without a reranker on all three query sets.
count_metrics(retriever_common, 5, pd.concat([queries_1, queries_2, queries_3]), embedder_e5_instruct)

{'doc_doc_metric_prec =': 0.9842851872572997,
 'doc_doc_metric_rec =': 0.9070975875136202,
 'doc_answer_metric_mean =': 0.9176546164532624,
 'doc_answer_metric_min =': 0.9358704782023652,
 'doc_answer_metric_max =': 0.9023345307871572,
 'confident_metric =': 0.9003318538259646}

In [29]:
# Same evaluation with the LLM reranker (rerank_k=10) on top of the baseline settings.
count_metrics(retriever_llm_ranker, 5, pd.concat([queries_1, queries_2, queries_3]), embedder_e5_instruct)



{'doc_doc_metric_prec =': 0.9820704375658382,
 'doc_doc_metric_rec =': 0.9063953397235599,
 'doc_answer_metric_mean =': 0.9143324008435186,
 'doc_answer_metric_min =': 0.9318976811277602,
 'doc_answer_metric_max =': 0.8994338113869058,
 'confident_metric =': 0.9106183070499576}

In [40]:
# Same evaluation with the cross-encoder reranker (rerank_k=10) on top of the baseline settings.
count_metrics(retriever_ce_ranker, 5, pd.concat([queries_1, queries_2, queries_3]), embedder_e5_instruct)

{'doc_doc_metric_prec =': 0.9842851841485388,
 'doc_doc_metric_rec =': 0.9070975826755923,
 'doc_answer_metric_mean =': 0.9176546128695683,
 'doc_answer_metric_min =': 0.9358704764563508,
 'doc_answer_metric_max =': 0.9023345229752581,
 'confident_metric =': 0.9157869370432571}

#### Вывод

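На данный момент реранкеры не дают улучшения по метрикам doc_doc и doc_answer (вырос только confident_metric: 0.900 → 0.911 с LLM-ранкером и 0.916 с cross-encoder), что неудивительно. Возможно, их нужно валидировать end-to-end.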