In [None]:
import numpy as np
import json
import faiss

def load_index_and_ids(prefix: str, folder: str = "news-embeddings"):
    """Load a FAISS index together with the IDs stored next to it.

    Args:
        prefix: Base file name shared by the index and the ID file.
        folder: Directory that contains both files.

    Returns:
        A ``(index, ids)`` tuple: the object returned by ``faiss.read_index``
        for ``{folder}/{prefix}.index`` and the decoded JSON content of
        ``{folder}/{prefix}_ids.json``.
    """
    index_path = f"{folder}/{prefix}.index"
    ids_path = f"{folder}/{prefix}_ids.json"

    # The FAISS index is read first, then the matching IDs.
    faiss_index = faiss.read_index(index_path)
    with open(ids_path, "r", encoding="utf-8") as ids_file:
        stored_ids = json.load(ids_file)

    return faiss_index, stored_ids
# One (FAISS index, ids) pair per embedding model, keyed by model name.
db = {prefix: load_index_and_ids(prefix) for prefix in ("glove", "word2vec")}

In [None]:
from gensim.models import KeyedVectors
# Word-vector models, keyed by the same names as the entries of `db`.
models = dict(
    glove=KeyedVectors.load("../glove_python/glove.kv"),
    word2vec=KeyedVectors.load("../word2vec/word2vec.kv"),
)

In [None]:
# Read the news dataset and build a News_Id -> News_Title lookup
import dask.dataframe as dd
docs = dd.read_parquet("../output.pq/", columns=['News_Id', 'News_Title', 'News_Tokens'])
# Materialise the lazy dask frame in memory.
df = docs.compute()
id_to_title = {news_id: title for news_id, title in zip(df['News_Id'], df['News_Title'])}

In [None]:
import pymorphy3
import re
# Shared pymorphy3 analyzer, created lazily on the first lematization() call.
_MORPH_ANALYZER = None


def _get_morph_analyzer():
    """Return the shared MorphAnalyzer, building it on first use."""
    global _MORPH_ANALYZER
    if _MORPH_ANALYZER is None:
        _MORPH_ANALYZER = pymorphy3.MorphAnalyzer()
    return _MORPH_ANALYZER


def lematization(f_input_list):
    """Lemmatize every word of a list.

    Each word is replaced by the ``normal_form`` of the first parse returned
    by the morphological analyzer.

    The analyzer is created once and reused: this function is called for every
    query, and building a new ``MorphAnalyzer`` per call is unnecessary work.

    Args:
        f_input_list: Iterable of words.

    Returns:
        A list with the normal form of each input word, in input order.
    """
    morph = _get_morph_analyzer()
    return [morph.parse(word)[0].normal_form for word in f_input_list]
def preprocess_query(query):
    """Normalise a raw query string into a list of lemmas.

    The query is lower-cased, every character that is neither a word character
    nor whitespace is replaced by a space, runs of whitespace are collapsed,
    and the resulting tokens are passed to ``lematization``.
    """
    without_punctuation = re.sub(r"[^\w\s]", " ", query.lower())
    single_spaced = re.sub(r'\s{2,}', ' ', without_punctuation)
    return lematization(single_spaced.split())
def get_query_embedding(query: str, model_name: str):
    """Embed a query as the mean of its known word vectors.

    Args:
        query: Raw query text; it is tokenised with ``preprocess_query``.
        model_name: Key into the global ``models`` mapping.

    Returns:
        A float32 array of shape ``(1, vector_size)``. When none of the query
        tokens is present in the model, a message is printed and an all-zero
        vector is returned.
    """
    embedding_model: KeyedVectors = models[model_name]

    known_vectors = []
    for token in preprocess_query(query=query):
        # Tokens missing from the model's vocabulary are skipped.
        if token in embedding_model:
            known_vectors.append(embedding_model[token])

    if len(known_vectors) == 0:
        print("Query vector is empty")
        return np.zeros((1, embedding_model.vector_size), dtype='float32')

    centroid = np.mean(known_vectors, axis=0)
    return centroid.astype('float32').reshape(1, -1)

In [None]:
def semantic_search(query: str, model_name: str, k = 5):
    """Return the IDs of up to ``k`` news items nearest to the query.

    Args:
        query: Raw query text.
        model_name: Key into the global ``models`` and ``db`` mappings.
        k: Maximum number of neighbours to request from the index.

    Returns:
        A list of news IDs in the order returned by the index. The list is
        empty when the query embedding is all zeros, and may be shorter than
        ``k`` when the index holds fewer than ``k`` vectors.
    """
    query_vec = get_query_embedding(query, model_name)
    # A zero-norm vector means no query token was known to the model.
    if np.linalg.norm(query_vec) == 0:
        return []
    index, ids = db[model_name]
    _distances, labels = index.search(query_vec, k)
    # FAISS fills missing neighbours with label -1; indexing `ids` with -1
    # would silently return the last ID, so such slots are dropped.
    return [ids[label] for label in labels[0] if label >= 0]

In [None]:
# Demo: run one query through the word2vec index and print the matching titles.
text = "Чемпионат мира по программированию"
print(f"Запрос: {text[:50]}\n")
answers = semantic_search(query=text, model_name="word2vec")
# Replace each returned news ID with its title.
answers = list(map(id_to_title.__getitem__, answers))
for answer in answers:
    print(answer)

# Сравнение моделей через тестовую выборку

In [None]:
def is_news_at_k_most(query: str, expected_id: int, model_name: str, k: int) -> bool:
    """Tell whether the model returns the expected news item in its top ``k``.

    Runs ``semantic_search`` for ``query`` and reports whether any of the
    returned IDs equals ``expected_id``.
    """
    for candidate_id in semantic_search(query=query, model_name=model_name, k=k):
        if candidate_id == expected_id:
            return True
    return False

In [None]:
import ast
import dask.dataframe as dd
import dask.bag as bag

# Lazily read every CSV part of the test set, then take a label-based slice
# that stops at index label 1.
# NOTE(review): `.loc[:1]` looks like a leftover debugging limit — with it the
# evaluation below only sees a handful of rows; confirm before trusting the
# reported percentages.
test_data = dd.read_csv("query-combined.csv/*.part").loc[:1]

def safe_parse(queries_str):
    """Parse a stringified collection of queries without ever raising.

    Args:
        queries_str: Text holding a Python literal, e.g. ``"['q1', 'q2']"``.

    Returns:
        The parsed list, tuple or set. An empty list is returned when the
        value cannot be parsed or when it parses to anything else (a bare
        string, a number, a dict, ``None``, ...).
    """
    try:
        parsed = ast.literal_eval(queries_str)
    except Exception:
        # Deliberate best-effort: an unparsable cell counts as "no queries".
        return []
    # A valid literal that is not a collection of queries would otherwise be
    # iterated character by character (str) or crash the caller (int, None).
    if not isinstance(parsed, (list, tuple, set)):
        return []
    return parsed

# Add a News_Query_Parsed column holding safe_parse applied to every News_Query
# value; `meta` tells dask the name and (object) dtype of the mapped series.
test_data = test_data.assign(News_Query_Parsed=test_data['News_Query'].map(safe_parse, meta=('News_Query_Parsed', 'object')))
  
def count_true_partition(df, model_name, k):
    """Count, per row, how many queries retrieve the row's own news item.

    Args:
        df: One partition with ``News_Id`` and ``News_Query_Parsed`` columns.
        model_name: Model passed through to ``is_news_at_k_most``.
        k: Top-k cutoff passed through to ``is_news_at_k_most``.

    Returns:
        A new frame equal to ``df`` plus an int64 ``count_true`` column. The
        input partition is not modified, and an empty partition yields an
        empty frame with the same extra column.
    """
    # Iterating the two columns directly also handles empty partitions, where
    # DataFrame.apply(axis=1) returns a frame that cannot fill a single column.
    hit_counts = [
        sum(is_news_at_k_most(q, news_id, model_name, k) for q in queries)
        for news_id, queries in zip(df['News_Id'], df['News_Query_Parsed'])
    ]
    # assign() returns a copy, so the partition handed in by dask stays untouched.
    return df.assign(count_true=np.asarray(hit_counts, dtype='int64'))

# Top-k cutoff used for the evaluation.
k = 5
# Number of test rows.
all_count = len(test_data)
# `count_true` counts hits per query and a row may hold several queries, so
# the hit rate is taken over the total number of queries, not over rows.
total_queries = int(
    test_data['News_Query_Parsed'].map(len, meta=('News_Query_Parsed', 'int64')).sum().compute()
)

# Output schema of count_true_partition: the input columns plus an integer count_true.
meta = test_data._meta.assign(count_true = 0)
for model_name in models:
    result = test_data.map_partitions(count_true_partition, model_name, k, meta=meta).persist()

    # Keep the per-row counts for later inspection.
    result.to_csv(f"{model_name}_test_data.csv", single_file=True)

    # Sum only the hit counter; summing the whole frame would also try to
    # add up the ID and text columns.
    count = int(result['count_true'].sum().compute())
    percent = count / total_queries * 100 if total_queries else 0.0
    print(f"model = {model_name} k = {k} count = {count} percent = {percent}%")
