# 1. Import thư viện

In [None]:
import numpy as np
import sys
from scipy.spatial.distance import cdist

In [None]:
# Text-format word2vec embedding file to load (absolute path on the author's machine).
W2V_PATH = r"D:\Khóa_Luận_Tốt_Nghiệp\MGEEMS\Test\Code\word2vec_vi_words_100dims.txt"
# Input sentences, one per line; main() scores them in consecutive pairs.
GLOSS_PATH = r"D:\Khóa_Luận_Tốt_Nghiệp\MGEEMS\Test\Data\gloss.txt"
# Results file; a relative path, so it is created in the current working directory.
OUTPUT_PATH = r"ketqua_gloss_100dim.txt"
# Number of components per embedding vector; lines of the embedding file that
# do not fit this dimension are skipped by the loader.
EMBEDDING_DIM = 100

# 2. Word Embedding

In [None]:
def _is_number(token):
    """Return True when *token* can be parsed as a float."""
    try:
        float(token)
    except ValueError:
        return False
    return True


def load_w2v_model(path, dim):
    """Load a text-format word2vec file into a ``{word: vector}`` dict.

    Every data line is expected to look like ``<word> <v1> ... <v_dim>``,
    separated by whitespace. A first line with at most two tokens is treated
    as the optional ``<vocab_size> <dim>`` header and skipped.

    A word that itself contains whitespace (a multi-syllable entry) is kept:
    the last *dim* fields are taken as the vector and the leading word tokens
    are joined with ``_``. Lines are skipped silently when they have too few
    fields, when a vector component is not numeric, or when an extra word
    token looks like a number (which indicates a file whose dimensionality
    does not match *dim* rather than a multi-token word).

    Args:
        path: Path of the UTF-8 encoded embedding file.
        dim: Number of vector components expected on every line.

    Returns:
        dict mapping each word to a 1-D float numpy array of length *dim*.

    Side effects:
        Prints progress messages. If *path* does not exist, prints an error
        and terminates the process via ``sys.exit(1)``.
    """
    print(f"--> Đang tải model từ: {path}")
    mapping = {}
    try:
        with open(path, 'r', encoding="utf-8") as f:
            # Peek at the first line: more than two tokens means it is already
            # a data line, so rewind; otherwise it is the header and is dropped.
            first_line = f.readline().strip().split()
            if len(first_line) > 2:
                f.seek(0)
            count = 0
            for line in f:
                parts = line.split()
                split_at = len(parts) - dim
                if split_at < 1:
                    # Blank line or not enough fields for a word plus vector.
                    continue
                word_tokens = parts[:split_at]
                # A numeric token in the word part means the line carries more
                # than `dim` components, not a multi-token word: reject it.
                if any(_is_number(tok) for tok in word_tokens[1:]):
                    continue
                try:
                    vec = np.array(parts[split_at:], dtype=float)
                except ValueError:
                    continue
                mapping["_".join(word_tokens)] = vec
                count += 1
        print(f"--> Đã tải thành công {count} từ (Dimension: {dim}).")
        return mapping
    except FileNotFoundError:
        print(f"LỖI: Không tìm thấy file tại {path}")
        sys.exit(1)

def get_word_vector(word, model, dim):
    """Return the embedding for *word*, falling back to its ``_``-separated parts.

    Hyphens in *word* are first replaced by underscores. If the resulting
    token is in *model*, its stored vector is returned as-is. Otherwise the
    token is split on ``_`` and the vectors of all known pieces are summed.
    When no piece is known either, a constant vector filled with ``0.0001``
    is returned instead of zeros.

    Args:
        word: Token to look up.
        model: Mapping from word to vector.
        dim: Length of the vectors to build for the fallback cases.

    Returns:
        A vector for the token (the stored one, the sum of the known pieces,
        or the constant fallback).
    """
    key = word.replace("-", "_")
    if key in model:
        return model[key]

    # Unknown as a whole: accumulate whichever pieces the model does know.
    total = np.zeros(dim)
    matched = 0
    for piece in key.split("_"):
        if piece not in model:
            continue
        total += model[piece]
        matched += 1

    if matched == 0:
        # Nothing recognised at all: small non-zero placeholder vector.
        return np.full(dim, 0.0001)
    return total

def sentence_to_matrix(sentence, model, dim):
    """Convert a whitespace-tokenised sentence into a matrix of word vectors.

    Args:
        sentence: Text whose tokens are separated by whitespace.
        model: Mapping from word to vector, passed on to ``get_word_vector``.
        dim: Embedding dimension, passed on to ``get_word_vector``.

    Returns:
        A numpy array built from one vector per token (in sentence order), or
        ``None`` when the sentence contains no tokens.
    """
    # split() with no argument already ignores leading/trailing whitespace.
    tokens = sentence.split()
    if len(tokens) == 0:
        return None
    rows = [get_word_vector(token, model, dim) for token in tokens]
    return np.array(rows)



# 3. Công thức cross word similarity

In [None]:
def compute_cross_word_similarity(sen1, sen2, model, dim):
    """Score how similar two sentences are via best-match word similarities.

    Both sentences are turned into word-vector matrices, the pairwise cosine
    distance ``d`` between every word of *sen1* and every word of *sen2* is
    computed, and each distance is mapped to a similarity ``(2 - d) / 2``.
    The score is the average of two directional scores: the mean best match
    of each *sen1* word, and the mean best match of each *sen2* word.

    Args:
        sen1: First sentence (whitespace-separated tokens).
        sen2: Second sentence (whitespace-separated tokens).
        model: Mapping from word to vector.
        dim: Embedding dimension.

    Returns:
        The similarity score, or ``0.0`` when either sentence has no tokens
        or the distance computation raises.
    """
    left = sentence_to_matrix(sen1, model, dim)    # one row per word of sen1
    right = sentence_to_matrix(sen2, model, dim)   # one row per word of sen2
    if left is None or right is None:
        return 0.0

    try:
        cosine_dist = cdist(left, right, 'cosine')
    except Exception:
        # Best-effort: any failure in the distance computation scores as 0.
        return 0.0

    similarity = (2 - cosine_dist) / 2
    # Best partner in the other sentence for every word, in both directions.
    forward = similarity.max(axis=1).mean()
    backward = similarity.max(axis=0).mean()
    return (forward + backward) / 2

# 4. Chạy thử nghiệm

In [None]:
def main():
    """Score consecutive sentence pairs from the gloss file and save the results.

    Loads the embedding model, reads the non-blank lines of ``GLOSS_PATH``,
    pairs them up as (1st, 2nd), (3rd, 4th), ... and computes the cross-word
    similarity of each pair. Every score is printed and the lines
    ``<sentence1>|<sentence2>|<score>`` are written to ``OUTPUT_PATH``.
    If the gloss file is missing, an error is printed and nothing is written.
    A trailing line without a partner is ignored.
    """
    model = load_w2v_model(W2V_PATH, EMBEDDING_DIM)

    try:
        with open(GLOSS_PATH, 'r', encoding='utf-8') as f:
            lines = [text for text in map(str.strip, f) if text]
    except FileNotFoundError:
        print(f"LỖI: Không tìm thấy file {GLOSS_PATH}")
        return

    print("\nBắt đầu tính toán...")
    results = []
    # Even-indexed lines are paired with the odd-indexed line that follows;
    # zip() stops at the shorter slice, dropping an unpaired last line.
    pairs = zip(lines[0::2], lines[1::2])
    for pair_no, (first, second) in enumerate(pairs, start=1):
        score = compute_cross_word_similarity(first, second, model, EMBEDDING_DIM)
        print(f"Cặp {pair_no}: {score:.4f}")
        results.append(f"{first}|{second}|{score:.6f}")

    with open(OUTPUT_PATH, 'w', encoding='utf-8') as out:
        out.writelines(f"{row}\n" for row in results)
    print(f"\n--> Hoàn tất! File kết quả: {OUTPUT_PATH}")
# Run the pipeline only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()