# Embedding Sentence

### 1. So sánh từng cặp gloss 1 với nhau

In [4]:
import torch
from transformers import AutoTokenizer, AutoModel
from underthesea import word_tokenize
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# --- Load PhoBERT (tokenizer + encoder) and switch the encoder to eval mode ---
model_name = "vinai/phobert-base-v2"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)
model.eval()

# --- Gloss data: one list of definition sentences per word being compared ---
bien_glosses = [
    "Trong lịch sử toán học, biến số là một số có giá trị bất kỳ.",
    "Trong lập trình máy tính, một biến là vị trí lưu trữ gắn với tên tượng trưng.",
    "Kí hiệu như x hay y được sử dụng trong biểu thức toán học."
]
ngap_glosses = [
    "Trong hình học vi phân, phép ngập là ánh xạ khả vi giữa các đa tạp.",
    "Chảy vượt qua giới hạn hoặc bao phủ với ánh sáng hoặc nước."
]

def get_embedding(sentence):
    """Return a mean-pooled PhoBERT embedding for a single gloss sentence.

    The sentence is word-segmented with underthesea (``format="text"``) before
    being encoded, and the model's first output is averaged over the sequence
    axis.

    Args:
        sentence: Raw, unsegmented text of one gloss.

    Returns:
        A 1-D numpy array (the hidden dimension of the model).
    """
    sentence = word_tokenize(sentence, format="text")
    # Cap the sequence length so a long gloss cannot exceed the encoder's
    # position limit and crash the forward pass (same 256-token cap used by
    # the file-based cells of this notebook).
    token_ids = tokenizer.encode(sentence, truncation=True, max_length=256)
    input_ids = torch.tensor([token_ids])
    with torch.no_grad():
        outputs = model(input_ids)[0]  # (batch, seq_len, hidden_dim)
        embedding = outputs.mean(dim=1).squeeze(0).numpy()  # mean pooling
    return embedding

# --- Encode every gloss of both words ---
bien_vecs = list(map(get_embedding, bien_glosses))
ngap_vecs = list(map(get_embedding, ngap_glosses))

# --- Cosine similarity for every (bien, ngap) gloss pair, outer loop over bien ---
sims = [
    cosine_similarity([bien_vec], [ngap_vec])[0][0]
    for bien_vec in bien_vecs
    for ngap_vec in ngap_vecs
]

# --- Aggregate the pairwise scores ---
mean_sim = np.mean(sims)
max_sim = np.max(sims)

print("Các giá trị similarity giữa từng cặp gloss:")
for pair_no, pair_sim in enumerate(sims, start=1):
    print(f"  Cặp {pair_no}: {pair_sim:.4f}")

print(f"Mean similarity: {mean_sim:.4f}")
print(f"Max similarity:  {max_sim:.4f}")


Some weights of RobertaModel were not initialized from the model checkpoint at vinai/phobert-base-v2 and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Các giá trị similarity giữa từng cặp gloss:
  Cặp 1: 0.6713
  Cặp 2: 0.5992
  Cặp 3: 0.6801
  Cặp 4: 0.6750
  Cặp 5: 0.7035
  Cặp 6: 0.5612
Mean similarity: 0.6484
Max similarity:  0.7035


### 2. Gloss-level similarity (mean pooling)

In [7]:
import torch
from transformers import AutoTokenizer, AutoModel
from underthesea import word_tokenize
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# --- Load PhoBERT (tokenizer + encoder) and switch the encoder to eval mode ---
model_name = "vinai/phobert-base-v2"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)
model.eval()

# --- Gloss data: one list of definition sentences per word being compared ---
bien_glosses = [
    "Trong lịch sử toán học, biến số là một số có giá trị bất kỳ.",
    "Trong lập trình máy tính, một biến là vị trí lưu trữ gắn với tên tượng trưng.",
    "Kí hiệu như x hay y được sử dụng trong biểu thức toán học."
]
ngap_glosses = [
    "Trong hình học vi phân, phép ngập là ánh xạ khả vi giữa các đa tạp.",
    "Chảy vượt qua giới hạn hoặc bao phủ với ánh sáng hoặc nước."
]

def get_embedding(sentence):
    """Return a mean-pooled PhoBERT embedding for a single gloss sentence.

    The sentence is word-segmented with underthesea (``format="text"``) before
    being encoded, and the model's first output is averaged over the sequence
    axis.

    Args:
        sentence: Raw, unsegmented text of one gloss.

    Returns:
        A 1-D numpy array (the hidden dimension of the model).
    """
    sentence = word_tokenize(sentence, format="text")
    # Cap the sequence length so a long gloss cannot exceed the encoder's
    # position limit and crash the forward pass (same 256-token cap used by
    # the file-based cells of this notebook).
    token_ids = tokenizer.encode(sentence, truncation=True, max_length=256)
    input_ids = torch.tensor([token_ids])
    with torch.no_grad():
        outputs = model(input_ids)[0]           # (batch, seq_len, hidden_dim)
        embedding = outputs.mean(dim=1).squeeze(0).numpy()  # mean pooling
    return embedding

# --- Embed every gloss and stack the vectors into one matrix per word ---
bien_vecs = np.stack([get_embedding(gloss) for gloss in bien_glosses])
ngap_vecs = np.stack([get_embedding(gloss) for gloss in ngap_glosses])

# --- Gloss-level aggregation: average the gloss vectors of each word ---
v_bien = bien_vecs.mean(axis=0, keepdims=True)
v_ngap = ngap_vecs.mean(axis=0, keepdims=True)

# --- Cosine similarity between the two word-level vectors ---
sim = cosine_similarity(v_bien, v_ngap)[0][0]
print(f"Gloss-level similarity (mean pooling): {sim:.4f}")


Some weights of RobertaModel were not initialized from the model checkpoint at vinai/phobert-base-v2 and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Gloss-level similarity (mean pooling): 0.7798


### 3. Chưa tách từ

In [2]:
import torch
from transformers import AutoTokenizer, AutoModel
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import re

# Load the PhoBERT v2 tokenizer and encoder, switch to eval mode, and move the
# encoder to the GPU when one is available (CPU otherwise).
model_name = "vinai/phobert-base-v2"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)
model.eval()
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# NOTE(review): `input` rebinds the builtin of the same name for the rest of the
# kernel session. The path also has a space at the start of the
# " BabelNet_combine_WordNet" segment; confirm that is the real directory name.
input = r"D:\Semantic-Concept-Similarity\data\ BabelNet_combine_WordNet\BCW_Definitions.txt"
output = r"D:\Semantic-Concept-Similarity\data\Embedding_BCW\Embedding_gloss.txt"

def get_embedding(text):
    """Encode `text` with PhoBERT and return its mean-pooled last hidden state.

    No word segmentation is applied; the text is passed to the tokenizer as-is
    and truncated to at most 256 tokens.

    Returns:
        A numpy array on the CPU, with a leading size-1 batch axis squeezed out.
    """
    encoded = tokenizer(
        text,
        max_length=256,
        truncation=True,
        padding=True,
        return_tensors="pt",
    )
    # Inputs must live on the same device as the model.
    model_inputs = {name: tensor.to(device) for name, tensor in encoded.items()}
    with torch.no_grad():
        last_hidden = model(**model_inputs).last_hidden_state
    pooled = last_hidden.mean(dim=1)  # average over the sequence axis
    return pooled.squeeze(0).cpu().numpy()

with open(input, "r", encoding="utf-8") as f:
    gloss_lines = [line.strip() for line in f if line.strip()]

# One entry per non-empty input line, so output rows stay aligned with them.
embeddings = []

# Placeholder for lines that contain no quoted gloss. It has to be a vector,
# not the scalar 0: the writer below iterates over the components of every
# entry, and a bare int would raise TypeError there.
empty_vec = np.zeros(model.config.hidden_size, dtype=np.float32)

for line in gloss_lines:
    # Glosses are the substrings enclosed in single quotes on the line.
    gloss_list = re.findall(r"'(.*?)'", line)
    if not gloss_list:
        embeddings.append(empty_vec)
        continue

    gloss_vec = [get_embedding(g) for g in gloss_list]

    # Line-level vector = element-wise mean of its gloss vectors.
    mean_vec = np.mean(gloss_vec, axis=0)
    embeddings.append(mean_vec)

# One line per vector, components separated by single spaces.
with open(output, "w", encoding="utf-8") as out:
    for vec in embeddings:
        out.write(" ".join(f"{v}" for v in vec) + "\n")


  from .autonotebook import tqdm as notebook_tqdm
Some weights of RobertaModel were not initialized from the model checkpoint at vinai/phobert-base-v2 and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [3]:
# Cosine similarity between the first two stored embeddings.
v1, v2 = embeddings[0], embeddings[1]
sim_matrix = cosine_similarity([v1], [v2])  # single row vs. single row
sim = sim_matrix[0][0]
print(f"Similarity  {sim:.4f}")

Similarity  0.9059


### 4. Tách từ: sinh viên => sinh_viên

In [2]:
import torch
from transformers import AutoTokenizer, AutoModel
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import re
from underthesea import word_tokenize
# Load the PhoBERT v2 tokenizer and encoder and switch the encoder to eval mode.
model_name = "vinai/phobert-base-v2"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)
model.eval()

# Move the encoder to the GPU when one is available (CPU otherwise).
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# NOTE(review): `input` rebinds the builtin of the same name for the rest of the
# kernel session. The path also has a space at the start of the
# " BabelNet_combine_WordNet" segment; confirm that is the real directory name.
input = r"D:\Semantic-Concept-Similarity\data\ BabelNet_combine_WordNet\BCW_Definitions.txt"
output = r"D:\Semantic-Concept-Similarity\data\Embedding_BCW\Embedding_gloss2.txt"

def get_embedding(text):
    """Word-segment `text`, encode it with PhoBERT, and mean-pool the result.

    The text is segmented with underthesea (``format="text"``), tokenized with
    truncation at 256 tokens, and the model's last hidden state is averaged
    over the sequence axis.

    Args:
        text: Raw, unsegmented text of one gloss.

    Returns:
        A numpy array on the CPU, with a leading size-1 batch axis squeezed out.
    """
    text_seg = word_tokenize(text, format="text")

    inputs = tokenizer(
        text_seg,
        return_tensors="pt",
        truncation=True,
        padding=True,
        max_length=256)
    # The model was moved to `device`; its inputs must be on the same device,
    # otherwise the forward pass fails with a device mismatch on CUDA machines.
    inputs = {k: v.to(device) for k, v in inputs.items()}
    with torch.no_grad():
        outputs = model(**inputs)

    embedding = outputs.last_hidden_state.mean(dim=1)  # mean over tokens
    return embedding.squeeze(0).cpu().numpy()

with open(input, "r", encoding="utf-8") as f:
    gloss_lines = [line.strip() for line in f if line.strip()]

# One entry per non-empty input line, so output rows stay aligned with them.
embeddings = []

# Placeholder for lines that contain no quoted gloss. It has to be a vector,
# not the scalar 0: the writer below iterates over the components of every
# entry, and a bare int would raise TypeError there.
empty_vec = np.zeros(model.config.hidden_size, dtype=np.float32)

for line in gloss_lines:
    # Glosses are the substrings enclosed in single quotes on the line.
    gloss_list = re.findall(r"'(.*?)'", line)
    if not gloss_list:
        embeddings.append(empty_vec)
        continue

    gloss_vec = [get_embedding(g) for g in gloss_list]

    # Line-level vector = element-wise mean of its gloss vectors.
    mean_vec = np.mean(gloss_vec, axis=0)
    embeddings.append(mean_vec)

# One line per vector, components separated by single spaces.
with open(output, "w", encoding="utf-8") as out:
    for vec in embeddings:
        out.write(" ".join(f"{v}" for v in vec) + "\n")


  from .autonotebook import tqdm as notebook_tqdm
Some weights of RobertaModel were not initialized from the model checkpoint at vinai/phobert-base-v2 and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [11]:
# Cosine similarity between the first two stored embeddings.
v1, v2 = embeddings[0], embeddings[1]
sim_matrix = cosine_similarity([v1], [v2])  # single row vs. single row
sim = sim_matrix[0][0]
print(f"Similarity  {sim:.4f}")

Similarity  0.9072


# Tính Similarity Gloss

In [12]:
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
# Raw string keeps the Windows backslashes literal. Without the r-prefix,
# sequences such as \S, \d and \E are invalid escapes and emit a SyntaxWarning.
input_file = r"D:\Semantic-Concept-Similarity\data\Embedding_BCW\Embedding_gloss2.txt"

# Parse one whitespace-separated float vector per line.
with open(input_file, "r", encoding="utf-8") as f:
    embeddings = [
        [float(value) for value in line.split()]
        for line in f
        if line.strip()  # a blank line would produce an empty, ragged row
    ]
embeddings = np.array(embeddings)  # store as a NumPy array

# Rows are consumed as consecutive pairs (0, 1), (2, 3), ...; a trailing
# unpaired row is ignored.
similarities = []
for i in range(0, len(embeddings) - 1, 2):
    sim = cosine_similarity([embeddings[i]], [embeddings[i + 1]])[0][0]
    similarities.append((i, i + 1, sim))

# NOTE(review): this is an absolute POSIX-style path while the input lives under
# D:\Semantic-Concept-Similarity; confirm this is the intended destination.
output_file = "/data/Similarity/Sim_Gloss.txt"
with open(output_file, "w", encoding="utf-8") as f:
    # Only the score is written, one per line, in pair order.
    for _, _, sim in similarities:
        f.write(f"{sim}\n")


  input_file = "D:\Semantic-Concept-Similarity\data\Embedding_BCW\Embedding_gloss2.txt"
  output_file = "D:\Semantic-Concept-Similarity\data\Similarity\similarities.txt"
