# Search System

---

### Requirements

In [None]:
# %pip install python-dotenv opensearch-py qdrant_client datasets FlagEmbedding tqdm ipywidgets matplotlib numpy sentence-transformers torch

In [None]:
from dotenv import load_dotenv
import os

# Load variables from a local .env file (if present) into the process environment.
load_dotenv()
# os.getenv returns None when the variable is not set.
# NOTE(review): the `docker run` cell and the OpenSearch client cell below both
# need this password, so an unset variable only surfaces there — confirm it is set.
OPENSEARCH_INITIAL_ADMIN_PASSWORD = os.getenv("OPENSEARCH_INITIAL_ADMIN_PASSWORD")

In [None]:
from opensearchpy import AsyncOpenSearch

from qdrant_client import QdrantClient
from qdrant_client.models import PointStruct
from qdrant_client.models import Distance, VectorParams
from qdrant_client.models import Filter, FieldCondition, MatchValue

import torch
import random
import asyncio
import itertools
import numpy as np
from tqdm import tqdm
from datasets import load_dataset
from dataclasses import dataclass
from collections import defaultdict

import matplotlib.pyplot as plt
from FlagEmbedding import BGEM3FlagModel

### Docker

In [None]:
# Fetch the OpenSearch and Qdrant server images used by this notebook.
!docker pull opensearchproject/opensearch:latest
!docker pull qdrant/qdrant 

In [None]:
# List the containers that are currently running.
!docker ps

In [None]:
# Start Qdrant detached, publishing ports 6333 and 6334 and mounting
# ./qdrant_storage as its storage directory; --rm removes the container once it stops.
!docker run -d --rm \
    -p 6333:6333 -p 6334:6334 \
        -v "$(pwd)/qdrant_storage:/qdrant/storage:z" \
            qdrant/qdrant
# Start a single-node OpenSearch detached, publishing ports 9200 and 9600.
# NOTE(review): $OPENSEARCH_INITIAL_ADMIN_PASSWORD is presumably substituted by
# IPython from the notebook variable of the same name — confirm it is not empty.
!docker run -d --rm \
    -p 9200:9200 -p 9600:9600 \
        -e "discovery.type=single-node" \
            -e OPENSEARCH_INITIAL_ADMIN_PASSWORD=$OPENSEARCH_INITIAL_ADMIN_PASSWORD \
                opensearchproject/opensearch:latest

### Data 

In [None]:
# Load the "tasksource/esci" dataset; later cells iterate over its splits by name.
raw_data = load_dataset("tasksource/esci")

In [None]:
@dataclass(frozen=True)
class TextFields:
    """Immutable bundle of the textual attributes kept for one product.

    The field names double as dataset column names: ``process`` reads
    ``point[name]`` for each of them, in declaration order, and passes the
    values positionally. ``get_mappings`` declares every field as an
    OpenSearch ``text`` field. Renaming or reordering fields therefore affects
    both of those functions.
    """

    product_title: str
    product_description: str
    product_bullet_point: str
    product_brand: str
    product_color: str
    product_text: str

@dataclass
class DatasetPart:
    """Queries of one dataset part together with their relevance labels."""

    # query_id -> query text, as filled in by ``process``.
    # NOTE(review): the stored values are query strings (they are later sent to
    # OpenSearch as ``query_text: str``), so the ``int`` value type below looks
    # inaccurate; the key type depends on the dataset's ``query_id`` column — confirm.
    queries: dict[str, int]
    # query_id -> product_id -> esci_label.
    query2doc2target: dict[str, dict[str, str]]

In [None]:
# Display the field-name -> type mapping; its keys drive column extraction in
# ``process`` and the text part of the index mappings.
TextFields.__annotations__

In [None]:
def process(part):
    """Collapse the rows of one dataset split into lookup tables.

    Rows are de-duplicated by ``example_id``. For every surviving row the
    product's text fields and the query text are stored the first time their
    id is seen, and the row's ``esci_label`` is recorded for the
    (query, product) pair.

    Args:
        part: Iterable of row mappings with the keys ``example_id``,
            ``product_id``, ``query_id``, ``query``, ``esci_label`` and one key
            per ``TextFields`` field.

    Returns:
        ``(docs, DatasetPart(queries, query2doc2target))`` where ``docs`` maps
        ``product_id`` to ``TextFields``.
    """
    field_names = list(TextFields.__annotations__)
    seen_examples = set()
    products = {}
    query_texts = {}
    labels = defaultdict(dict)

    for row in tqdm(part):
        example_id = row["example_id"]
        if example_id in seen_examples:
            continue
        seen_examples.add(example_id)

        product_id = row["product_id"]
        if product_id not in products:
            # Positional construction: values follow the declaration order of TextFields.
            products[product_id] = TextFields(*(row[name] for name in field_names))

        query_id = row["query_id"]
        if query_id not in query_texts:
            query_texts[query_id] = row["query"]

        labels[query_id][product_id] = row["esci_label"]

    return products, DatasetPart(query_texts, labels)

In [None]:
# Run ``process`` over every split of the raw dataset, keyed by split name;
# each value is the ``(docs, DatasetPart)`` pair returned by ``process``.
data = {split: process(rows) for split, rows in raw_data.items()}

In [None]:
# Pool the product texts of both splits into one lookup. The "train" dict is
# extended in place, and "test" entries win when a product id occurs in both.
text_rich_docs = data["train"][0]
text_rich_docs |= data["test"][0]

# From here on ``data`` keeps only the DatasetPart of every split.
data = {name: part for name, (_, part) in data.items()}

In [None]:
def check_dataset(query2doc2target):
    """Print size statistics of a label mapping and plot its label histogram.

    Args:
        query2doc2target: Mapping ``query -> {doc -> target}``.

    Prints the number of queries and the total number of (query, doc) points,
    then shows a histogram of all target values.
    """
    # Flatten every target of every query into a single list.
    targets = [
        target
        for doc2target in query2doc2target.values()
        for target in doc2target.values()
    ]

    print(f"Число запросов: {len(query2doc2target)}")
    print(f"Число точек: {len(targets)}")

    plt.figure(figsize=(10, 5))
    plt.hist(targets)
    plt.title("Распределение таргетов")
    plt.show()

In [None]:
# Statistics and label histogram for the train split.
check_dataset(data["train"].query2doc2target)

In [None]:
# Statistics and label histogram for the test split.
check_dataset(data["test"].query2doc2target)

In [None]:
def splitDict(d, seed=42):
    """Randomly split a dict into two disjoint halves.

    The items are shuffled with a private, seeded random generator, so the
    split is reproducible and the state of the global ``random`` module is
    left untouched (the previous implementation reseeded it on every call).

    Args:
        d: Mapping to split; it is not modified.
        seed: Seed of the shuffle. The default reproduces the historical split.

    Returns:
        A pair ``(d1, d2)``: ``d1`` holds ``len(d) // 2`` items and ``d2``
        holds the remaining ones.
    """
    items = list(d.items())
    # A dedicated generator seeded with the same value produces the same
    # permutation as ``random.seed(seed); random.shuffle(items)``.
    random.Random(seed).shuffle(items)
    half = len(items) // 2
    return dict(items[:half]), dict(items[half:])

In [None]:
# Split the train queries into two halves: the "indexed" half is the source of
# the behaviour features written to the search index, the other half is kept
# as the "train" part.
indexed_queries, train_queries = splitDict(data["train"].queries)
# Both parts share the same (full) train label mapping object; only their
# query sets differ.
indexed = DatasetPart(indexed_queries, data["train"].query2doc2target)
train = DatasetPart(train_queries, data["train"].query2doc2target)

test = data["test"]

In [None]:
# Documents that carry a label for at least one query of each part.
indexed_docs = {
    doc
    for query in indexed.queries
    for doc in indexed.query2doc2target[query]
}
train_docs = {
    doc
    for query in train.queries
    for doc in train.query2doc2target[query]
}
test_docs = {
    doc
    for query in test.queries
    for doc in test.query2doc2target[query]
}

print(f"Число документов в indexed: {len(indexed_docs)}")
print(f"Число документов в train: {len(train_docs)}")
print(f"Число документов в test: {len(test_docs)}")

# Coverage = number of a part's documents that were already seen earlier:
# train documents present in indexed, and test documents present in indexed or train.
print(f"Покрытие train-части: {len(indexed_docs & train_docs)}")
print(f"Покрытие test-части: {len((indexed_docs | train_docs) & test_docs)}")

#### Behaviour Features

In [None]:
@dataclass
class BehaviourFeatures:
    """Per-document counters named after the relevance labels.

    Iterating an instance yields its field values in declaration order, and
    ``a + b`` returns a new instance holding the field-wise sums.
    """

    exact: float = 0.0
    substitute: float = 0.0
    complement: float = 0.0
    irrelevant: float = 0.0

    def __iter__(self):
        # The instance dict keeps the fields in declaration order.
        yield from self.__dict__.values()

    def __add__(self, other):
        # Pair up the values of both operands and add them field by field.
        summed = [mine + theirs for mine, theirs in zip(self, other)]
        return BehaviourFeatures(*summed)

In [None]:
def obtain_behavior_features(part) -> defaultdict[str, BehaviourFeatures]:
    doc2features = defaultdict(BehaviourFeatures)
    for query in tqdm(part.queries):
        doc2target = part.query2doc2target[query]
        for doc in doc2target:
            match doc2target[doc]:
                case "Exact":
                    doc2features[doc].exact += 1
                    doc2features[doc].exact_complement += 1
                case "Complement":
                    doc2features[doc].complement += 1
                    doc2features[doc].exact_complement += 1
                case "Substitute":
                    doc2features[doc].substitute += 1
    return doc2features

In [None]:
# Behaviour counters computed separately from the "indexed" and the "train" queries.
indexed_behaviour_docs = obtain_behavior_features(indexed)
train_behaviour_docs = obtain_behavior_features(train)

In [None]:
# Peek at the first five (document, counters) pairs.
list(train_behaviour_docs.items())[:5]

In [None]:
def merge_features(p1: dict[str, BehaviourFeatures], p2: dict[str, BehaviourFeatures]) -> dict[str, BehaviourFeatures]:
    """Combine two document -> counters mappings by adding counters field-wise.

    Documents present in only one mapping keep their counters; documents
    present in both get the sum. The inputs are not modified.

    Returns:
        A plain ``dict`` with one entry per document of either input.
    """
    merged = defaultdict(BehaviourFeatures)
    for part in (p1, p2):
        for doc, features in part.items():
            # ``+`` builds a new BehaviourFeatures, so the inputs stay untouched.
            merged[doc] = merged[doc] + features
    return dict(merged)

In [None]:
# Counters accumulated over both halves of the train queries, with a preview
# of the first five entries.
indexed_train_behaviour_docs = merge_features(indexed_behaviour_docs, train_behaviour_docs)
list(indexed_train_behaviour_docs.items())[:5]

### FullText Search - OpenSearch Indexing

[Quickstart](https://last9.io/blog/how-to-use-opensearch-with-python/)

[OpenSearch Docs](https://docs.opensearch.org/docs/latest/field-types/supported-field-types/index/)

In [None]:
# Async client for the local OpenSearch node. The connection uses SSL, but
# certificate verification (and the related warning) is switched off.
# os.environ[...] raises KeyError when the password variable is not set.
client = AsyncOpenSearch(
    hosts = [{'host': 'localhost', 'port': 9200}],
    http_auth = ('admin', os.environ["OPENSEARCH_INITIAL_ADMIN_PASSWORD"]),
    use_ssl = True,
    verify_certs = False,
    ssl_show_warn = False
)
# Top-level ``await`` relies on the notebook's running event loop.
# Fetching the server info doubles as a connectivity check.
info = await client.info()
info

In [None]:
def get_mappings():
    """Build the ``mappings`` section of the product index definition.

    Every ``TextFields`` field is declared as ``text`` and every
    ``BehaviourFeatures`` field as ``float``.

    Returns:
        ``{"properties": {field_name: {"type": ...}, ...}}``.
    """
    properties = {}
    for field in TextFields.__annotations__:
        properties[field] = {"type": "text"}
    for field in BehaviourFeatures.__annotations__:
        properties[field] = {"type": "float"}
    return {"properties": properties}

In [None]:
# Name of the OpenSearch index that holds the product documents.
index_name = "products"


# Index definition: shard/replica settings plus the field mappings.
index_body = {
    "settings": {
        "index": {
            "number_of_shards": 4,
            "number_of_replicas": 1
        }
    },
    "mappings": get_mappings()
}

# Create the index only when it is missing, so the cell can be re-run.
if not await client.indices.exists(index=index_name):
    await client.indices.create(
        index=index_name, 
        body=index_body
    )

In [None]:
def obtain_doc2index_id(docs):
    """Assign consecutive integer ids, starting at 0, to documents.

    Ids follow the order in which each document is first seen; repeated
    documents keep the id of their first occurrence.

    Args:
        docs: Iterable of hashable document identifiers.

    Returns:
        Dict mapping each distinct document to its integer id.
    """
    # dict.fromkeys drops repeats while preserving first-seen order.
    unique_docs = dict.fromkeys(docs)
    return {doc: index_id for index_id, doc in enumerate(unique_docs)}

def obtain_index_id2doc(doc2index_id):
    """Invert a document -> index id mapping into index id -> document."""
    index_id2doc = {}
    for doc, index_id in doc2index_id.items():
        index_id2doc[index_id] = doc
    return index_id2doc

In [None]:
# Integer ids for every known product (iterating the dict yields its product
# ids), plus the reverse lookup used to translate search hits back to products.
doc2index_id = obtain_doc2index_id(text_rich_docs)
index_id2doc = obtain_index_id2doc(doc2index_id)

In [None]:
def describe_document(
    doc: str,
    text_rich_docs: dict,
    behaviour_docs: dict
) -> dict:
    """Build the flat field dict of one document.

    The attributes of ``text_rich_docs[doc]`` come first, followed by the
    attributes of ``behaviour_docs[doc]``; on a name clash the behaviour value
    wins. Neither source object is modified.
    """
    text_part = text_rich_docs[doc].__dict__
    behaviour_part = behaviour_docs[doc].__dict__
    return {**text_part, **behaviour_part}

In [None]:
# Index every document: one ``client.index`` coroutine per document, all
# awaited together. Behaviour counters come from ``indexed_behaviour_docs``
# only; being a defaultdict, it supplies all-zero counters for other documents.
# NOTE(review): every request is created up front with no concurrency limit —
# consider batching or the bulk API if this overwhelms the node.
await asyncio.gather(*[
    client.index(
        index=index_name,
        body=describe_document(doc, text_rich_docs, indexed_behaviour_docs),
        id=doc2index_id[doc],
        timeout=3600
    ) for doc in tqdm(doc2index_id)
])
# Refresh the index after the writes (presumably so the new documents become
# searchable right away — confirm).
await client.indices.refresh(index=index_name)

In [None]:
# Sanity check: document count reported by the index statistics for the primaries.
info = await client.indices.stats(index=index_name)
info["_all"]["primaries"]["docs"]["count"]

#### Queries and Metrics  
[OpenSearch Queries](https://docs.opensearch.org/docs/latest/query-dsl/full-text/match/)

In [None]:
def obtain_opensearch_query(
    query_text: str,
    k: int = 15
):
    """Build the OpenSearch request body for a behaviour-boosted text search.

    The query matches ``query_text`` against ``product_title`` and
    ``product_text`` and passes the text score through a script that scales it
    by the logarithm of the document's ``exact`` and ``complement`` counters.

    Args:
        query_text: Query string to search for.
        k: Maximum number of hits to return.

    Returns:
        A dict to be used as the ``body`` of a search request.
    """
    return {
        "size": k,
        "query": {
            "function_score": {
                "query": {
                    "multi_match": {
                        "query": query_text,
                        "fields": ["product_title", "product_text"]
                    }
                },
                # ``script_score`` is a scoring function of ``function_score``
                # and therefore a sibling of ``query``: a query object may hold
                # a single clause only, so nesting it next to ``multi_match``
                # is rejected as a malformed query.
                # NOTE(review): with the default boost mode the script result
                # is combined with the query score once more; add
                # "boost_mode": "replace" if the script is meant to be the
                # final score — confirm the intended formula.
                "script_score": {
                    "script": {
                        "params": {
                            "add": 1,
                            "mult": 2
                        },
                        "source": "_score * Math.log(params.add + params.mult * doc['exact'].value + doc['complement'].value)"
                    }
                }
            }
        }
    }

In [None]:
async def obtain_opensearch_serps(
    client: AsyncOpenSearch,
    query_text: str,
    index_id2doc: dict[int, str],
    index: str = "products",
    k: int = 15,
) -> list[dict]:
    """Run the text search for one query and return its hits.

    Args:
        client: OpenSearch client used to execute the search.
        query_text: Query string to search for.
        index_id2doc: Lookup from the integer index id to the product id.
        index: Name of the index to search; defaults to the index name that
            used to be hard-coded here.
        k: Maximum number of hits to request.

    Returns:
        The hit dicts of the response, each extended in place with a ``"doc"``
        key holding the product id resolved from the hit's ``"_id"``.

    Raises:
        KeyError: If a hit's id is missing from ``index_id2doc``.
    """
    query_body = obtain_opensearch_query(query_text, k)
    response = await client.search(index=index, body=query_body)
    serp = []
    for hit in response["hits"]["hits"]:
        # "_id" comes back as a string; the lookup is keyed by int.
        hit["doc"] = index_id2doc[int(hit["_id"])]
        serp.append(hit)
    return serp

In [None]:
def get_dcg(scores):
    """Return the discounted cumulative gain of a ranked list of scores.

    Each score ``s`` at 1-based rank ``i`` contributes
    ``(2 ** s - 1) / log2(i + 1)``; an empty list gives 0.
    """
    relevance = np.asarray(scores)
    gains = np.power(2, relevance) - 1
    # Ranks 1..n are discounted by log2(2)..log2(n + 1).
    discounts = np.log2(np.arange(len(relevance)) + 2)
    return np.sum(gains / discounts)

def get_metrics(serp, gt):
    """Score a result list against ground-truth labels.

    Args:
        serp: Ranked hits; each hit is a mapping with a ``"doc"`` key.
        gt: Mapping ``doc -> label`` for the query.

    Returns:
        ``{"dcg": value}`` where hits labelled "Exact", "Substitute" and
        "Complement" score 3, 2 and 1 respectively and every other hit
        (unlabelled or differently labelled) scores 0.
    """
    gain_by_label = {"Exact": 3, "Substitute": 2, "Complement": 1}
    scores = [
        gain_by_label.get(gt[hit["doc"]], 0) if hit["doc"] in gt else 0
        for hit in serp
    ]
    return {"dcg": get_dcg(scores)}

In [None]:
async def evaluate_opensearch(client, test, index_id2doc):
    """Average the search metrics over all queries of a dataset part.

    Queries are searched one after another; the progress bar shows the
    metrics of the most recent query.

    Args:
        client: OpenSearch client passed on to ``obtain_opensearch_serps``.
        test: Object exposing ``queries`` (``query -> text``) and
            ``query2doc2target`` (``query -> {doc -> label}``).
        index_id2doc: Lookup from the integer index id to the product id.

    Returns:
        ``defaultdict`` mapping each metric name to its mean over the queries.
    """
    totals = defaultdict(float)
    progress = tqdm(test.queries.items())
    for query, query_text in progress:
        serp = await obtain_opensearch_serps(client, query_text, index_id2doc)
        metrics = get_metrics(serp, test.query2doc2target[query])
        for name, value in metrics.items():
            totals[name] += value
        progress.set_postfix(metrics)

    n_queries = len(test.queries)
    for name in totals:
        totals[name] /= n_queries
    return totals

In [None]:
# Mean metrics of the OpenSearch-only ranking on the test queries.
await evaluate_opensearch(client, test, index_id2doc)

### Vector Search - Qdrant 

In [None]:
# Embedding model shared by the embedding helpers below.
# NOTE(review): devices=[0] presumably selects the first GPU — confirm for the
# target machine.
embedder = BGEM3FlagModel(
    'BAAI/bge-m3-unsupervised', 
    use_fp16=True, 
    devices=[0]
)

def obtain_embeddings(texts, batch_size=64, max_length=512):
    """Encode ``texts`` with the module-level ``embedder``.

    Args:
        texts: Texts to encode.
        batch_size: Forwarded to ``embedder.encode``.
        max_length: Forwarded to ``embedder.encode``.

    Returns:
        The ``'dense_vecs'`` entry of the encoder output.
    """
    encoded = embedder.encode(
        texts,
        batch_size=batch_size,
        max_length=max_length,
    )
    return encoded['dense_vecs']

In [None]:
def obtain_doc2embedding(text_rich_docs, field):
    """Embed one text field of every document.

    Args:
        text_rich_docs: Mapping ``doc -> object`` whose instance attributes
            include ``field``.
        field: Name of the attribute to embed.

    Returns:
        Dict mapping each document to the embedding of its ``field`` text,
        paired up in the iteration order of ``text_rich_docs``.
    """
    doc_ids = list(text_rich_docs)
    texts = [text_rich_docs[doc_id].__dict__[field] for doc_id in doc_ids]
    vectors = obtain_embeddings(texts)
    return dict(zip(doc_ids, vectors))

In [None]:
# Embeddings of the product title and of the product text, one per document.
doc2_title_embedding = obtain_doc2embedding(text_rich_docs, "product_title")
doc2_text_embedding = obtain_doc2embedding(text_rich_docs, "product_text")

### Indexing 

[Quickstart](https://qdrant.tech/documentation/quickstart/)

In [None]:
# NOTE(review): this rebinds ``client`` from the OpenSearch client to the
# Qdrant client; re-running earlier OpenSearch cells afterwards requires
# re-creating the OpenSearch client first.
client = QdrantClient(url="http://localhost:6333")

# One collection per embedded text field, compared by dot product.
# Creating a collection that already exists fails, so guard the call — like
# the ``indices.exists`` check used for the OpenSearch index — to keep the
# cell re-runnable.
# NOTE(review): size=1024 is assumed to match the embedder's dense vector
# dimension — confirm.
for collection_name in ("title", "text"):
    if not client.collection_exists(collection_name=collection_name):
        client.create_collection(
            collection_name=collection_name,
            vectors_config=VectorParams(size=1024, distance=Distance.DOT),
        )

In [None]:
# Quickstart-style smoke test with six hand-written 4-dimensional points.
# "test_collection" is not created anywhere else in the visible notebook, so
# the upsert below failed on a missing collection; create it here with a
# vector size that matches the demo points.
if not client.collection_exists(collection_name="test_collection"):
    client.create_collection(
        collection_name="test_collection",
        vectors_config=VectorParams(size=4, distance=Distance.DOT),
    )

# wait=True blocks until the write has been applied.
operation_info = client.upsert(
    collection_name="test_collection",
    wait=True,
    points=[
        PointStruct(id=1, vector=[0.05, 0.61, 0.76, 0.74], payload={"city": "Berlin"}),
        PointStruct(id=2, vector=[0.19, 0.81, 0.75, 0.11], payload={"city": "London"}),
        PointStruct(id=3, vector=[0.36, 0.55, 0.47, 0.94], payload={"city": "Moscow"}),
        PointStruct(id=4, vector=[0.18, 0.01, 0.85, 0.80], payload={"city": "New York"}),
        PointStruct(id=5, vector=[0.24, 0.18, 0.22, 0.44], payload={"city": "Beijing"}),
        PointStruct(id=6, vector=[0.35, 0.08, 0.11, 0.44], payload={"city": "Mumbai"}),
    ],
)

print(operation_info)

In [None]:
# Demo query: up to 3 points closest to the query vector, restricted to points
# whose payload field "city" equals "London"; payloads are returned with the hits.
search_result = client.query_points(
    collection_name="test_collection",
    query=[0.2, 0.1, 0.9, 0.7],
    query_filter=Filter(
        must=[FieldCondition(key="city", match=MatchValue(value="London"))]
    ),
    with_payload=True,
    limit=3,
).points

print(search_result)

### Задание 7. Отбор кандидатов - 8 баллов

В этом задании вам необходимо объединить выдачи разных источников, в том числе разных индексов в Qdrant, в одну выдачу. Считайте источники независимыми и равнозначными: берите фиксированное число документов из каждого из них. Дедуплицируйте выдачу.

In [None]:
# ---- Ваш код здесь ----
# ---- Конец кода ----

### Задание 8. Оценка качества - 5 баллов

Отранжируйте полученные результаты по формуле, основанной на оценках из OpenSearch и Qdrant. Например, можно взять линейную комбинацию этих чисел для каждого документа и отранжировать по ней.

In [None]:
# ---- Ваш код здесь ----
# ---- Конец кода ----

## Часть 3. Гибридный поиск

### Задание 9. Модель переранжирования - 8 баллов

Обучите CatBoost на train-срезе на признаках из OpenSearch и нейропризнаке эмбеддера на основе тех текстов, которые вы индексировали в Qdrant. Признак считайте как скалярное произведение между вектором запроса и вектором текста, привязанного к документу.

In [None]:
# ---- Ваш код здесь ----
# ---- Конец кода ----

### Задание 10. Оценка качества - 2 балла

Замерьте конечное качество системы.

In [None]:
# ---- Ваш код здесь ----
# ---- Конец кода ----

---

In [None]:
# Stop the containers started by this notebook.
# NOTE(review): `docker ps -a -q` lists every container on the host, so
# unrelated containers are stopped as well — restrict the list if that matters.
!docker stop $(docker ps -a -q)

---