# Dùng mô hình phoBert-v2 để tạo embedding cho từ

In [4]:
import torch
from transformers import AutoTokenizer, AutoModel
import numpy as np
from underthesea import word_tokenize
from tqdm import tqdm

input_file = r"D:\Semantic-Concept-Similarity\data\ BabelNet_combine_WordNet\BCW_Word.txt"
output_file = r"D:\Semantic-Concept-Similarity\data\Embedding_BCW\Embedding_Word.txt"

model_name = "vinai/phobert-base-v2"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)
model.eval()

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)


def get_embedding(word):
    # tach tu
    text_seg = word_tokenize(word, format="text")

    inputs = tokenizer(text_seg, return_tensors="pt", truncation=True, padding=True, max_length=32)

    # Lấy vector
    with torch.no_grad():
        outputs = model(**inputs)
    embedding = outputs.last_hidden_state.mean(dim=1)
    # outputs.last_hidden_state.mean(dim=1) # Mean pooling theo chiều token
    # outputs.last_hidden_state[:, 0, :] # [CLS] embedding
    # outputs.last_hidden_state.squeeze(0) # Token-level embedding
    return embedding.squeeze(0).cpu().numpy()

with open(input_file, "r", encoding="utf-8") as f:
    words = [line.strip() for line in f if line.strip()]

print(f" {len(words)} ")


with open(output_file, "w", encoding="utf-8") as out:
    for word in tqdm(words, desc="Đang xử lý"):
        try:
            vec = get_embedding(word)
            out.write(" ".join([f"{v:.6f}" for v in vec]) + "\n")
        except Exception as e:
            print(f"Lỗi ở từ: {word} -> {e}")



Some weights of RobertaModel were not initialized from the model checkpoint at vinai/phobert-base-v2 and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


 602 


Đang xử lý: 100%|██████████| 602/602 [00:12<00:00, 49.04it/s]


# Similarity Word


In [6]:
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
input_file = "D:\Semantic-Concept-Similarity\data\Embedding_BCW\Embedding_Word.txt"
embeddings = []

with open(input_file, "r", encoding="utf-8") as f:
    for line in f:
        values = list(map(float, line.strip().split()))
        embeddings.append(values)
embeddings = np.array(embeddings) # Lưu vào mảng np

similarities = []
for i in range(0, len(embeddings), 2):
    if i + 1 < len(embeddings):
        sim = cosine_similarity([embeddings[i]], [embeddings[i + 1]])[0][0]
        similarities.append((i, i + 1, sim))

output_file = "D:\Semantic-Concept-Similarity\data\Similarity\Sim_Word.txt"
with open(output_file, "w", encoding="utf-8") as f:
    for (i, j, sim) in similarities:
        f.write(f"{sim}\n")


  input_file = "D:\Semantic-Concept-Similarity\data\Embedding_BCW\Embedding_Word.txt"
  output_file = "D:\Semantic-Concept-Similarity\data\Similarity\Sim_Word.txt"
