In [None]:
%pylab inline
import pandas as pd
import sklearn as skl
import scipy
import math  
import numpy as np
from collections import defaultdict
import torch
from transformers import BertModel, BertTokenizerFast
from sklearn.metrics.pairwise import cosine_similarity

model_name = 'ai-forever/ruBert-base'
tokenizer = BertTokenizerFast.from_pretrained(model_name)
model = BertModel.from_pretrained(model_name)
document = pd.read_excel("../News_SGU_31077_Processed_1.xlsx")
texts = document["News_Tokens"]

In [None]:
print(texts)

In [5]:
from tqdm import tqdm
word_embedding = defaultdict(list)
texts = texts[:1]


max_length = 512  # Максимальная длина для модели
stride = 128      # Перекрытие между чанками для контекста

for text in tqdm(texts, desc="get embeddings"):
    # Токенизация с перекрытием (stride)
    inputs = tokenizer(
        text, 
        return_tensors="pt", 
        truncation=True, 
        max_length=max_length, 
        stride=stride,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding=True,
    )
    
    # Удаляем лишние поля, которые модель не принимает
    chunk_inputs = {
        "input_ids": inputs["input_ids"],
        "attention_mask": inputs["attention_mask"],
        "token_type_ids": inputs.get("token_type_ids")  # Опционально, если есть
    }
    # print(text)
    # Обрабатываем каждый чанк отдельно
    for i in range(inputs["input_ids"].shape[0]):
        # Подготавливаем входы для текущего чанка
        current_input = {
            k: v[i].unsqueeze(0) for k, v in chunk_inputs.items() if v is not None
        }
        
        with torch.no_grad():
            outputs = model(**current_input)
        
        embeddings = outputs.last_hidden_state[0].numpy()
        tokens = tokenizer.convert_ids_to_tokens(inputs["input_ids"][i])
        offsets = inputs["offset_mapping"][i]
        
        # Собираем эмбеддинги только для первых вхождений токенов
        for j, (token, offset) in enumerate(zip(tokens, offsets)):
            if offset[0] == 0 and not token.startswith("##"):
                # print(token)
                word_embedding[token].append(embeddings[j])

print(len(word_embedding))
# print(word_embedding)
# Усреднение эмбеддингов для каждого слова
for word in tqdm(word_embedding, desc="mean embeddings"):
    word_embedding[word] = np.mean(word_embedding[word], axis=0)

get embeddings: 100%|██████████| 1/1 [00:00<00:00,  2.66it/s]


3


mean embeddings: 100%|██████████| 3/3 [00:00<00:00, 305.23it/s]


In [None]:
print(embeddings)

In [None]:
words = list(word_embedding.keys())
print(words[:10])
print(word_embedding[words[0]])

In [11]:
embeddings_matrix = np.array([word_embedding[word] for word in words])

In [12]:
from sklearn.metrics.pairwise import cosine_similarity

similarity_matrix = cosine_similarity(embeddings_matrix)

In [None]:
def get_top_similar_words(word, top_n=5):
    if word not in word_embedding:
        return []
    idx = words.index(word)
    sim_scores = list(enumerate(similarity_matrix[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    top_words = [(words[i], score) for i, score in sim_scores[1:top_n+1]]  # [1:] чтобы исключить само слово
    return top_words

print(get_top_similar_words("сгу"))
# Пример вывода: [("large", 0.92), ("huge", 0.89), ("enormous", 0.85), ...]

In [None]:
result = []
for word in tqdm(words, desc="get_sim_word"):
  result.append([word, get_top_similar_words(word)])
df_result = pd.DataFrame(result, columns=["Word", "Most_Similar_Word"])
df_result.to_csv("sim-words5-bert.csv", index=False)