In [30]:
import pandas as pd
from gensim.models import KeyedVectors

# On-disk location of each saved KeyedVectors file, keyed by model name.
file_paths = dict(
    glove='../glove_python/glove.kv',
    word2vec='../word2vec/word2vec.kv',
)

# Load both models with mmap='r' so gensim memory-maps the stored arrays
# read-only (see the gensim `load` documentation for the exact semantics).
word2vec = KeyedVectors.load(file_paths['word2vec'], mmap='r')
glove = KeyedVectors.load(file_paths['glove'], mmap='r')

In [31]:
import dask.dataframe as dd
import dask.bag as db
# Open the parquet dataset lazily, then materialise the two columns we need
# in a single compute() so the dataset is read once rather than once per column.
docs = dd.read_parquet("../output.pq/")
news_frame = docs[['News_Tokens', 'News_Id']].compute()
texts = news_frame['News_Tokens']
ids = news_frame['News_Id']

# Join every token sequence back into one space-separated string per article.
# `texts` keeps the token sequences; `list_news` holds the joined strings.
bag = db.from_sequence(texts)
list_news = bag.map(lambda sent: ' '.join(sent)).compute()

In [32]:
import numpy as np

def get_doc_embedding(tokens, model):
    """Return the mean word vector of a document.

    Args:
        tokens: Iterable of word tokens. A plain string is split on
            whitespace first; otherwise iterating it would yield single
            characters and the result would be an average of letter vectors.
        model: Word-vector lookup that supports ``word in model``,
            ``model[word]`` and exposes ``vector_size``.

    Returns:
        The element-wise mean of the vectors of all tokens found in
        ``model``, or a zero vector of length ``model.vector_size`` when no
        token is in the model (including empty input).
    """
    if isinstance(tokens, str):
        tokens = tokens.split()

    # Out-of-vocabulary tokens are skipped rather than raising KeyError.
    vectors = [model[word] for word in tokens if word in model]
    if not vectors:
        return np.zeros(model.vector_size)
    return np.mean(vectors, axis=0)

In [34]:
# Document embeddings keyed by news id, one dict per embedding model.
glove_embeddings = {}
word2vec_embeddings = {}

# Iterate over the token sequences in `texts`, not the joined strings in
# `list_news`: iterating a string yields characters, so the embeddings would
# be averaged over single letters instead of words.
# The loop variable is `news_id` so the builtin `id` is not overwritten.
for tokens, news_id in zip(texts, ids):
    word2vec_embeddings[news_id] = get_doc_embedding(tokens, word2vec)
    glove_embeddings[news_id] = get_doc_embedding(tokens, glove)

In [38]:
import numpy as np
import json
import faiss

# Directory that receives every file written by save_embeddings_dict.
folder = 'news-embeddings'


# Dictionaries -> lists -> arrays plus the matching ids.
def save_embeddings_dict(emb_dict, prefix):
    """Write an {id: vector} mapping to disk as a matrix, a FAISS index and an id list.

    Three files are written into the module-level ``folder``:
    ``{prefix}_vectors.npy`` (float32 matrix), ``{prefix}.index``
    (``faiss.IndexFlatL2`` built from that matrix) and ``{prefix}_ids.json``.
    Row ``i`` of the matrix, and the ``i``-th vector added to the index,
    belongs to element ``i`` of the saved id list.

    Args:
        emb_dict: Mapping from document id to its embedding vector; all
            vectors are expected to have the same length.
        prefix: File-name prefix identifying the embedding model.
    """
    import os  # local import keeps this block self-contained

    # np.save / faiss.write_index / open() do not create missing directories.
    os.makedirs(folder, exist_ok=True)

    ids = list(emb_dict.keys())
    # FAISS works on float32 matrices, hence the explicit dtype.
    vectors = np.array([emb_dict[i] for i in ids], dtype='float32')

    index = faiss.IndexFlatL2(vectors.shape[1])
    index.add(vectors)

    np.save(f"{folder}/{prefix}_vectors.npy", vectors)
    faiss.write_index(index, f"{folder}/{prefix}.index")

    with open(f"{folder}/{prefix}_ids.json", "w", encoding="utf-8") as f:
        json.dump(ids, f)

# Persist each embedding dict under the prefix of the model that produced it.
for prefix, embeddings in (
    ("glove", glove_embeddings),
    ("word2vec", word2vec_embeddings),
):
    save_embeddings_dict(embeddings, prefix)