In [None]:
import faiss
import numpy as np
import pandas as pd
from sentence_transformers import SentenceTransformer
from pathlib import Path
from collections import defaultdict

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
model = SentenceTransformer('cointegrated/rubert-tiny2')

In [3]:
index = faiss.read_index("embeddings/embeddings.index")  # read index from file

In [2]:
embeddings = np.load("embeddings/embeddings.npy", allow_pickle=True)

In [3]:
embeddings.shape

(49727, 312)

In [None]:
chunk_files = sorted(Path('.').glob('dataset/preproc_data*.parquet'))
dfs = [pd.read_parquet(f) for f in chunk_files]
df = pd.concat(dfs, ignore_index=True)

# Ranking metrics functions

In [11]:
def precision_at_k(relevant, k):
    return np.sum(relevant[:k]) / k

def recall_at_k(relevant, total_relevant, k):
    if total_relevant == 0:
        return 0.0
    return np.sum(relevant[:k]) / total_relevant


def hits_at_k(relevant, k):
    return 1.0 if np.sum(relevant[:k]) > 0 else 0.0

def mrr(relevant):
    for idx, rel in enumerate(relevant, 1):
        if rel:
            return 1.0 / idx
    return 0.0

def dcg(relevant, k):
    return np.sum(relevant[:k] / np.log2(np.arange(2, k + 2)))

def ndcg_at_k(relevant, k):
    ideal_relevant = np.sort(relevant[::-1])
    idcg = dcg(ideal_relevant, k)
    if idcg == 0:
        return 0.0
    return dcg(relevant, k) / idcg

def average_precision_at_k(relevant, k):
    hits = 0
    sum_precisions = 0.0
    for i in range(k):
        if relevant[i]:
            hits += 1
            sum_precisions += hits / (i + 1)
    if hits == 0:
        return 0.0
    return sum_precisions / hits

In [15]:
def evaluate_index(index, embeddings, df, k=10, n_eval=100):
    precisions = []
    recalls = []
    hits = []
    mrrs = []
    ndcgs = []
    aps = []

    class_counts = df['classifierByIPS'].value_counts().to_dict()

    for _ in range(n_eval):
        # choose random query from embedding
        i = np.random.randint(0, len(embeddings))
        query = embeddings[i].reshape(1, -1)
        query_class = df.iloc[i]['classifierByIPS']

        if not isinstance(query_class, str) or query_class == "UNKNOWN":
            continue

        _, topk = index.search(query, k+1)
        topk = topk[0][1:]  # except the same one

        topk_classes = df.iloc[topk]['classifierByIPS'].values
        relevant = (topk_classes == query_class).astype(int)

        precisions.append(precision_at_k(relevant, k))
        total_relevant = class_counts.get(query_class, 0) - 1
        total_relevant = max(total_relevant, 0)
        recalls.append(recall_at_k(relevant, total_relevant, k))
        hits.append(hits_at_k(relevant, k))
        mrrs.append(mrr(relevant))
        ndcgs.append(ndcg_at_k(relevant, k))
        aps.append(average_precision_at_k(relevant, k))

    print("Evaluation results:")
    print(f"Precision@{k}: {np.mean(precisions):.3f}")
    print(f"Recall@{k}: {np.mean(recalls):.3f}")
    print(f"Hits@{k}:     {np.mean(hits):.3f}")
    print(f"MRR:          {np.mean(mrrs):.3f}")
    print(f"NDCG@{k}:     {np.mean(ndcgs):.3f}")
    print(f"MAP@{k}:      {np.mean(aps):.3f}")

In [None]:
evaluate_index(index, embeddings, df, 10, 100)

Evaluation results:
Precision@10: 0.107
Recall@10: 0.034
Hits@10:     0.259
MRR:          0.214
NDCG@10:     0.509
MAP@10:      0.196


: 

In [None]:
import time

def measure_faiss_speed(index, embeddings, n_queries=100):
    total_time = 0.0
    for _ in range(n_queries):
        i = np.random.randint(0, len(embeddings))
        query = embeddings[i].reshape(1, -1)
        start = time.time()
        _ = index.search(query, 10)
        total_time += time.time() - start
    avg_time_ms = (total_time / n_queries) * 1000
    return avg_time_ms
measure_faiss_speed(index, embeddings)

7.3235719203948975

In [None]:
# Real query test

In [None]:
query = "имущество аэропорта"
query_vec = model.encode([query])
query_vec = query_vec / np.linalg.norm(query_vec)

faiss.normalize_L2(query_vec)

# Поиск ближайших
k = 50  # Больше, чтобы потом отфильтровать

distances, indices = index.search(query_vec, k)

grouped = defaultdict(list)
for i, dist in zip(indices[0], distances[0]):
    if i == -1: continue
    row = df.iloc[i]
    classifier = str(row['classifierByIPS']).strip() if pd.notna(row['classifierByIPS']) else "UNKNOWN"
    grouped[classifier].append((i, dist))

# 5. Вывод — по 3 документа на класс
print(f"\n🔍 Результаты по запросу: '{query}' (группировка по classifierByIPS):\n")
for cls, items in grouped.items():
    print(f"\n📂 Класс: {cls} (всего {len(items)} документов):")
    for i, dist in items[:3]:
        print(f"  ↪ Сходство: {dist:.3f} | ID: {i}")
        print(df.iloc[i]['textIPS'][:200], "\n")

# ids, distances = index.knnQuery(query_vec, k=k)

# # Постобработка: фильтрация по классу
# results = []
# for idx, dist in zip(ids, distances):
#     row = df.iloc[idx]
#     classifier = row['classifierByIPS']
#     if classifier and isinstance(classifier, str) and classifier.strip().lower() != "unknown":
#         similarity = 1 - dist  # преобразуем обратно в cosine similarity
#         results.append({
#             "similarity": similarity,
#             "classifier": classifier.strip(),
#             "text": row['textIPS'][:300],  # обрезаем текст
#             "full_index": idx
#         })

# # 8. Сортировка и вывод
# top_filtered = sorted(results, key=lambda x: -x['similarity'])[:10]

# print(f"Топ-10 документов, отфильтрованных по classifierByIPS (не пустой, не unknown):\n")
# for i, res in enumerate(top_filtered):
#     print(f"— #{i+1} | Сходство: {res['similarity']:.3f} | Класс: {res['classifier']}")
#     print(res['text'], "\n")
