In [4]:
import os

if "src" not in os.listdir():
    os.chdir("../")

In [5]:
import pandas as pd
from tqdm import tqdm
from qdrant_client import QdrantClient, models
from FlagEmbedding import FlagLLMReranker

from src.conf import url_qdrant


tqdm.pandas()

https://github.com/qdrant/workshop-ultimate-hybrid-search/blob/main/notebooks/02-hybrid-search.ipynb

### Анализ работы разных поисков и реранков

In [6]:
qdrant_client = QdrantClient(url_qdrant)
collection_name = "911_hybrid_rating_points"

In [7]:
def convert_texts(text: list):
    st = ""
    for index, text in enumerate(text):
        st += f"Документ {index+1}"
        st += "\n\n\n"
        st += text
        st += "\n\n\n"
    return st


def search_data(collection_name, point, reranker_bge=None, reranker_gemma=None, n=10):
    vector = point.vector
    question = point.payload["question"]

    # запрос для разряженного вектора
    sparse = models.Prefetch(
        query=models.SparseVector(
            indices=vector["text-sparse"].indices,
            values=vector["text-sparse"].values,
        ),
        using="text-sparse",
        limit=100,
    )

    # запрос для разряженного вектора
    sparse_1000 = models.Prefetch(
        query=models.SparseVector(
            indices=vector["text-sparse"].indices,
            values=vector["text-sparse"].values,
        ),
        using="text-sparse",
        limit=1000,
    )

    # Запрос для плотного вектора
    dense = models.Prefetch(query=vector["dense"], using="dense", limit=100)

    # Запрос для плотного вектора 1000
    dense_1000 = models.Prefetch(query=vector["dense"], using="dense", limit=1000)

    # Запрашиваем 1000 по плотным векторам из них 100 по разряженным
    dence_sparse = models.Prefetch(
        prefetch=[dense_1000],
        query=models.SparseVector(
            indices=vector["text-sparse"].indices,
            values=vector["text-sparse"].values,
        ),
        using="text-sparse",
        limit=100,
    )

    # Запрашиваем 1000 по разряженным векторам из них 100 по плотным
    sparce_dense = models.Prefetch(
        prefetch=[sparse_1000], query=vector["dense"], using="dense", limit=100
    )

    ## Запрашиваем данные

    record = {}
    record["question"] = question

    if reranker_bge or reranker_gemma:
        record["model_rerank_type"] = [reranker_bge.model_name_or_path, reranker_gemma.model_name_or_path]

    for name_search_type, search_type in [
        ("dense", [dense]),
        ("sparse", [sparse]),
        ("sparse+dense", [sparse, dense]),
        ("sparce_dense", [sparce_dense]),
        ("dence_sparse", [dence_sparse]),
    ]:
        point = qdrant_client.query_points(
            collection_name=collection_name,
            prefetch=search_type,
            limit=100,
            query=models.FusionQuery(
                fusion=models.Fusion.RRF,
            ),
            timeout=1000,
        ).points

        texts = [i.payload["question"] for i in point[1:]]
        record[name_search_type] = convert_texts(texts[:n])
        record[f"{name_search_type}_len"] = sum([len(i) for i in texts[:n]])

        if reranker_bge:
            score = reranker_bge.compute_score([[question, i] for i in texts])
            texts_score = sorted(
                [(text, score) for text, score in zip(texts, score)],
                key=lambda x: x[1],
                reverse=True,
            )[:n]
            texts = [i[0] for i in texts_score]
            record[f"{name_search_type}_reranker_bge"] = convert_texts(texts)
            record[f"{name_search_type}_reranker_bge_len"] = sum([len(i) for i in texts])
        
        if reranker_gemma:
            score = reranker_gemma.compute_score([[question, i] for i in texts])
            texts_score = sorted(
                [(text, score) for text, score in zip(texts, score)],
                key=lambda x: x[1],
                reverse=True,
            )[:n]
            texts = [i[0] for i in texts_score]
            record[f"{name_search_type}_reranker_gemma"] = convert_texts(texts)
            record[f"{name_search_type}_reranker_gemma_len"] = sum([len(i) for i in texts])
            
    return record

In [8]:
def get_point(collection_name, n):
    return qdrant_client.query_points(
        collection_name=collection_name,
        query=models.SampleQuery(sample=models.Sample.RANDOM),
        limit=n,
        with_vectors=True,
    ).points

In [9]:
points = get_point(collection_name, n=20)
reranker_gemma = FlagLLMReranker("BAAI/bge-reranker-v2-gemma", use_fp16=True)
reranker_bge = FlagLLMReranker("BAAI/bge-reranker-v2-m3", use_fp16=True)

records = []
for point in tqdm(points):
    records.append(search_data(collection_name="911_hybrid", point=point, n=5, reranker_gemma=reranker_gemma, reranker_bge=reranker_bge))

pd.DataFrame(records).to_csv("./data/interim/rag_results/rag_results.csv")

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

If you want to use `XLMRobertaLMHeadModel` as a standalone, add `is_decoder=True.`
Some weights of XLMRobertaForCausalLM were not initialized from the model checkpoint at BAAI/bge-reranker-v2-m3 and are newly initialized: ['lm_head.bias', 'lm_head.decoder.bias', 'lm_head.dense.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.bias', 'lm_head.layer_norm.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  0%|          | 0/20 [00:00<?, ?it/s]You're using a XLMRobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
100%|██████████| 7/7 [00:02<00:00,  2.80it/s]
You're using a GemmaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get 

In [10]:
records[0]

{'question': 'Можно ли снять 14 б и через какое время',
 'model_rerank_type': ['BAAI/bge-reranker-v2-m3',
  'BAAI/bge-reranker-v2-gemma'],
 'dense': 'Документ 1\n\n\nКак снять ст 14 б куда идти  и что писать.\n\n\nДокумент 2\n\n\nКак снять статью 14 б что надо для этого сделать и куда оброщаться.\n\n\nДокумент 3\n\n\nЗдравствуйте. Списали со службы по ст.18 б. через сколько её можно снять?\n\n\nДокумент 4\n\n\nМожно потом снять ст.15 и если можно то по истечении какого срока?\n\n\nДокумент 5\n\n\nХочу снять диагноз 14 б.\n\n\n',
 'dense_len': 272,
 'dense_reranker_bge': 'Документ 1\n\n\nЯ собираюсь в отпуск с последующим увольнением.  Могут ли меня заставить отрабатывать 14 дней?\n\n\nДокумент 2\n\n\nЯ собираюсь в очередной отпуск с последующим увольнением.  Могут ли заставить меня отрабатывать 14 дней?\n\n\nДокумент 3\n\n\nВопрос: могу ли я пойти в отпуск на 14 дней и и следующим днем после отпуска  уволиться по собственному желанию?\n\n\nДокумент 4\n\n\nКакой срок на это отпускается?

In [11]:
pd.DataFrame(records).to_excel("./data/interim/rag_results/rag_results.xlsx")

In [12]:
df_records = pd.DataFrame(records)

df_records.describe()

Unnamed: 0,dense_len,dense_reranker_bge_len,dense_reranker_gemma_len,sparse_len,sparse_reranker_bge_len,sparse_reranker_gemma_len,sparse+dense_len,sparse+dense_reranker_bge_len,sparse+dense_reranker_gemma_len,sparce_dense_len,sparce_dense_reranker_bge_len,sparce_dense_reranker_gemma_len,dence_sparse_len,dence_sparse_reranker_bge_len,dence_sparse_reranker_gemma_len
count,20.0,20.0,20.0,20.0,20.0,20.0,20.0,20.0,20.0,20.0,20.0,20.0,20.0,20.0,20.0
mean,1319.65,1615.8,1615.8,57444.95,6960.95,6960.95,22906.95,8618.65,8618.65,1650.0,2326.8,2326.8,2834.8,2206.65,2206.65
std,865.405976,1064.339607,1064.339607,98807.369587,11117.203243,11117.203243,38329.211705,22736.928356,22736.928356,1166.488568,1523.055917,1523.055917,2422.867779,1524.937558,1524.937558
min,272.0,427.0,427.0,320.0,473.0,473.0,212.0,702.0,702.0,268.0,294.0,294.0,304.0,546.0,546.0
25%,570.25,528.25,528.25,1859.5,2520.0,2520.0,1519.5,1531.0,1531.0,658.75,888.5,888.5,989.75,816.75,816.75
50%,1196.0,1619.0,1619.0,5517.5,3740.0,3740.0,3767.5,3167.0,3167.0,1495.0,2353.5,2353.5,2102.0,1620.5,1620.5
75%,1738.75,2433.75,2433.75,78195.5,5497.25,5497.25,24859.5,4154.5,4154.5,1988.25,3492.75,3492.75,4598.25,3496.5,3496.5
max,3530.0,3352.0,3352.0,405693.0,50061.0,50061.0,133145.0,103860.0,103860.0,4629.0,5235.0,5235.0,9692.0,5200.0,5200.0
