In [1]:
import os

if "src" not in os.listdir():
    os.chdir("../")

In [2]:
import pandas as pd
from qdrant_client import QdrantClient, models
from FlagEmbedding import BGEM3FlagModel, FlagLLMReranker

from tqdm import tqdm

tqdm.pandas()

In [3]:
model = BGEM3FlagModel("BAAI/bge-m3", use_fp16=False, device="cpu")

Fetching 30 files:   0%|          | 0/30 [00:00<?, ?it/s]

  colbert_state_dict = torch.load(os.path.join(model_dir, 'colbert_linear.pt'), map_location='cpu')
  sparse_state_dict = torch.load(os.path.join(model_dir, 'sparse_linear.pt'), map_location='cpu')


https://github.com/qdrant/workshop-ultimate-hybrid-search/blob/main/notebooks/02-hybrid-search.ipynb

### Создание БД с несколькими векторами

In [4]:
vectors = model.encode(
    "Как мне получить выплаты?",
    return_sparse=True,
    return_dense=True,
    return_colbert_vecs=True,
)
vectors

{'dense_vecs': array([-0.05599624,  0.02769903, -0.04615401, ..., -0.03377588,
        -0.02925543, -0.04329142], dtype=float32),
 'lexical_weights': defaultdict(int,
             {'5187': 0.13252598,
              '4042': 0.15637419,
              '28068': 0.2146028,
              '138417': 0.31710362,
              '32': 0.04425759}),
 'colbert_vecs': array([[-0.04191731,  0.02044222,  0.01853201, ...,  0.0173075 ,
          0.06125861, -0.01522963],
        [-0.03104483,  0.00796479,  0.0223996 , ...,  0.01070459,
          0.04036965,  0.00780059],
        [-0.03103829,  0.02579158,  0.03169292, ...,  0.0075931 ,
          0.04057872, -0.00449112],
        [-0.04662576,  0.0143568 ,  0.05201059, ...,  0.01773259,
          0.04036565,  0.00740162],
        [-0.0451145 ,  0.000516  ,  0.02601848, ...,  0.02121259,
          0.07327194,  0.0132298 ],
        [ 0.0091881 ,  0.00524948, -0.009853  , ...,  0.01237569,
          0.03225253,  0.02549963]], dtype=float32)}

In [5]:
vectors["dense_vecs"].shape, vectors["colbert_vecs"].shape, vectors["lexical_weights"]

((1024,),
 (6, 1024),
 defaultdict(int,
             {'5187': 0.13252598,
              '4042': 0.15637419,
              '28068': 0.2146028,
              '138417': 0.31710362,
              '32': 0.04425759}))

In [6]:
model.convert_id_to_token(
    [
        {
            "5187": 0.13655508,
            "4042": 0.15718421,
            "28068": 0.1956503,
            "207964": 0.22945362,
            "138417": 0.25035983,
            "32": 0.034747005,
        }
    ]
)

{'Как': 0.13655508,
 'мне': 0.15718421,
 'получить': 0.1956503,
 'социальные': 0.22945362,
 'выплаты': 0.25035983,
 '?': 0.034747005}

In [7]:
client = QdrantClient(":memory:")

col_name = "hybrid_search_test"

In [8]:
client.create_collection(
    collection_name=col_name,
    vectors_config={
        "dence": models.VectorParams(
            size=vectors["dense_vecs"].shape[0], distance=models.Distance.COSINE
        ),
        "colbert": models.VectorParams(
            size=vectors["colbert_vecs"].shape[1],
            distance=models.Distance.COSINE,
            multivector_config=models.MultiVectorConfig(
                comparator=models.MultiVectorComparator.MAX_SIM
            ),
        ),
    },
    sparse_vectors_config={"text-sparse": models.SparseVectorParams()},
)

True

In [9]:
points = [
    models.PointStruct(
        id=3,
        vector={
            "dence": vectors["dense_vecs"],  # Плотный вектор
            "colbert": vectors["colbert_vecs"],  # Плотный вектор
            "text-sparse": models.SparseVector(
                indices=list(vectors["lexical_weights"].keys()),  # Индексы ненулевых элементов
                values=list(vectors["lexical_weights"].values()),  # Значения ненулевых элементов
            ),
        },
        payload={
            "text": "Пример текста 3",
            "metadata": {"author": "Автор 3"},
        },
    ),
]

client.upsert(
    collection_name=col_name,
    points=points,
)

UpdateResult(operation_id=0, status=<UpdateStatus.COMPLETED: 'completed'>)

In [10]:
sent = ["Как мне получить выплаты?", "социальные службы"]
embeddings_question = model.encode(
    sent,
    return_sparse=True,
    return_dense=True,
    return_colbert_vecs=True,
)
batch = [{"question": i, "answer": i, "rating": i, "id": index} for index, i in enumerate(sent)]

In [11]:
embeddings_question["lexical_weights"][0].keys()

dict_keys(['5187', '4042', '28068', '138417', '32'])

In [12]:
points = [
    models.PointStruct(
        id=record["id"],
        payload={
            "question": record["question"],
            "answer": record["answer"],
            "rating": record["rating"],
        },
        vector={
            "dense": dence,
            "colbert": colbert,
            "text-sparse": models.SparseVector(
                indices=list(sparce.keys()),
                values=list(sparce.values()),
            ),
        },
    )
    for dence, sparce, colbert, record in zip(
        embeddings_question["dense_vecs"],
        embeddings_question["lexical_weights"],
        embeddings_question["colbert_vecs"],
        batch,
    )
]

In [13]:
qdrant_client = QdrantClient(url="localhost:6333")

In [14]:
qdrant_client.get_collections()

CollectionsResponse(collections=[CollectionDescription(name='911_hybrid'), CollectionDescription(name='questions'), CollectionDescription(name='questions3'), CollectionDescription(name='questions2')])

In [15]:
point = qdrant_client.query_points(
    collection_name="911_hybrid",
    query=models.SampleQuery(sample=models.Sample.RANDOM),
    limit=1,
    with_vectors=True,
).points[0]

vector = point.vector
question = point.payload["question"]

In [16]:
vector["dense"], vector["text-sparse"], question

([-0.021554949,
  0.032492597,
  -0.011105452,
  -0.012302949,
  -0.012577534,
  0.004362856,
  0.007284139,
  0.02918232,
  0.013973343,
  -0.023995707,
  0.028770441,
  0.039021626,
  -0.0062582577,
  0.008809613,
  -0.034475714,
  0.030860342,
  -0.006529029,
  -0.01076222,
  -0.042194612,
  -0.03295024,
  -0.012813983,
  -0.02451437,
  -0.00216808,
  -0.031012889,
  -0.0017762239,
  0.013088568,
  -0.010579163,
  -0.006422246,
  0.051652554,
  -0.045093015,
  0.01076222,
  -0.010098639,
  0.009229118,
  -0.0205939,
  0.011982599,
  -0.022348195,
  -0.019922692,
  0.015353897,
  -0.025627965,
  -0.0041035255,
  0.033285845,
  0.042591237,
  -0.017237857,
  0.011822424,
  0.010998668,
  -0.0028259407,
  -0.048632115,
  -0.011692759,
  0.002292025,
  -0.029075537,
  -0.031180691,
  0.0057853605,
  0.07718899,
  -0.0032873966,
  -0.037038513,
  0.01569713,
  0.05946298,
  -0.04176748,
  -0.010136776,
  -0.011456311,
  -0.03536049,
  -0.030433208,
  -0.041553915,
  0.037374116,
  0.0011

In [21]:
def convert_texts(text: list):
    st = ""
    for index, text in enumerate(text):
        st += f"Документ {index+1}"
        st += "\n\n\n"
        st += text
        st += "\n\n\n"
    return st


def search_data(collection_name, point, reranker=None, n=10):
    vector = point.vector
    question = point.payload["question"]

    # запрос для разряженного вектора
    sparse = models.Prefetch(
        query=models.SparseVector(
            indices=vector["text-sparse"].indices,
            values=vector["text-sparse"].values,
        ),
        using="text-sparse",
        limit=100,
    )

    # запрос для разряженного вектора
    sparse_1000 = models.Prefetch(
        query=models.SparseVector(
            indices=vector["text-sparse"].indices,
            values=vector["text-sparse"].values,
        ),
        using="text-sparse",
        limit=1000,
    )

    # Запрос для плотного вектора
    dense = models.Prefetch(query=vector["dense"], using="dense", limit=100)

    # Запрос для плотного вектора 1000
    dense_1000 = models.Prefetch(query=vector["dense"], using="dense", limit=1000)

    # Запрашиваем 1000 по плотным векторам из них 100 по разряженным
    dence_sparse = models.Prefetch(
        prefetch=[dense_1000],
        query=models.SparseVector(
            indices=vector["text-sparse"].indices,
            values=vector["text-sparse"].values,
        ),
        using="text-sparse",
        limit=100,
    )

    # Запрашиваем 1000 по разряженным векторам из них 100 по плотным
    sparce_dense = models.Prefetch(
        prefetch=[sparse_1000], query=vector["dense"], using="dense", limit=100
    )

    ## Запрашиваем данные

    record = {}
    record["question"] = question

    if reranker:
        record["model_rerank_type"] = reranker.model_name_or_path

    for name_search_type, search_type in [
        ("dense", [dense]),
        ("sparse", [sparse]),
        ("sparse+dense", [sparse, dense]),
        ("sparce_dense", [sparce_dense]),
        ("dence_sparse", [dence_sparse]),
    ]:
        point = qdrant_client.query_points(
            collection_name=collection_name,
            prefetch=search_type,
            limit=100,
            query=models.FusionQuery(
                fusion=models.Fusion.RRF,
            ),
            timeout=1000,
        ).points

        texts = [i.payload["question"] for i in point[1:]]
        record[name_search_type] = convert_texts(texts[:n])
        record[f"{name_search_type}_len"] = sum([len(i) for i in texts[:n]])

        if reranker:
            print("rerank_start")
            score = reranker.compute_score([[question, i] for i in texts])
            print("reranl_stop")
            texts_score = sorted(
                [(text, score) for text, score in zip(texts, score)],
                key=lambda x: x[1],
                reverse=True,
            )[:n]
            texts = [i[0] for i in texts_score]
            record[f"{name_search_type}_rerank"] = texts
            record[f"{name_search_type}_rerank_len"] = sum([len(i) for i in texts])
            
    return record

In [22]:
# reranker = FlagLLMReranker("BAAI/bge-reranker-v2-gemma", use_fp16=True, device="cpu")
# reranker.compute_score([["owl1", "333 dfff"], ["owl22", "oop ppop op"]])

In [23]:
def get_point(collection_name, n):
    return qdrant_client.query_points(
        collection_name=collection_name,
        query=models.SampleQuery(sample=models.Sample.RANDOM),
        limit=n,
        with_vectors=True,
    ).points

In [None]:
collection_name = "911_hybrid"

points = get_point(collection_name, n=200)
reranker = FlagLLMReranker("BAAI/bge-reranker-v2-m3", use_fp16=True, device="cpu")

records = []
for point in tqdm(points):
    records.append(search_data(collection_name="911_hybrid", point=point, n=5, reranker=reranker))

pd.DataFrame(records).to_csv("./data/interim/rag_results/rag_results.csv")

If you want to use `XLMRobertaLMHeadModel` as a standalone, add `is_decoder=True.`
Some weights of XLMRobertaForCausalLM were not initialized from the model checkpoint at BAAI/bge-reranker-v2-m3 and are newly initialized: ['lm_head.bias', 'lm_head.decoder.bias', 'lm_head.dense.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.bias', 'lm_head.layer_norm.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  0%|          | 0/200 [00:00<?, ?it/s]

rerank_start


You're using a XLMRobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


In [27]:
pd.DataFrame(records).to_excel("./data/interim/rag_results/rag_results.xlsx")

In [None]:
df_records = pd.DataFrame(records)

df_records.describe()

Unnamed: 0,dense_len,sparse_len,sparse+dense_len,sparce_dense_len,dence_sparse_len
count,200.0,200.0,200.0,200.0,200.0
mean,1584.78,44105.685,28930.47,1799.6,3884.205
std,1336.216425,55871.782774,38865.645246,1445.435397,5791.876304
min,189.0,229.0,207.0,191.0,229.0
25%,586.5,3434.5,1779.75,643.5,1556.0
50%,1277.0,16603.0,5443.5,1444.0,2866.0
75%,2144.0,68814.5,43500.5,2452.0,4322.75
max,9111.0,234500.0,177439.0,8457.0,73822.0
