In [1]:
from py_vncorenlp import VnCoreNLP
import numpy as np 
from collections import defaultdict
from tqdm import tqdm

In [2]:
model = VnCoreNLP(
    save_dir=r"C:\Users\Admin\Documents\2024.2\NLP\Project\VnCoreNLP-master",
    max_heap_size='-Xmx2g',
    annotators=["wseg"]
)

In [3]:
import json
import re
# [
# {
#     "law_id": "01/2009/tt-bnn",
#     "articles": [
#         {
#             "article_id": "1",
#             "title": "Điều 1. Phạm vi áp dụng",
#             "text": "Thông tư này hướng dẫn tuần tra, canh gác bảo vệ đê Điều trong mùa lũ đối với các tuyến đê sông được phân loại, phân cấp theo quy định tại Điều 4 của Luật Đê Điều.",
#             "processed_in4": "Phạm_vi áp_dụng Thông_tư hướng_dẫn tuần_tra , canh_gác bảo_vệ đê Điều mùa lũ tuyến đê sông phân_loại , phân_cấp quy_định Điều 4 Luật Đê_Điều ."
#         },

sentences =[]
with open(r"C:\Users\Admin\Documents\2024.2\NLP\Project\dataset\legal_corpus.json", encoding = 'utf-8') as f:
    data = json.load(f)
    for doc in data:
        for article in doc['articles']:
            tokens = re.sub(r'[^\w\s_ÀÁÂÃÈÉÊÌÍÒÓÔÕÙÚĂĐĨŨƠàáâãèéêìíòóôõùúăđĩũơưăạ-ỹ]', '', article['processed_in4']).split()
            sentences.append(tokens)

In [4]:
print(sentences[1])

['Tổ_chức', 'lực_lượng', '1', 'Hàng', 'mùa', 'mưa', 'lũ', 'Uỷ_ban_nhân_dân', 'xã', 'đê', 'tổ_chức', 'lực_lượng', 'lao_động', 'địa_phương', 'tuần_tra', 'canh_gác', 'đê', 'thường_trực', 'điếm_canh', 'đê', 'dân', 'khu_vực', 'đê', 'khu_vực', 'điếm_canh', 'đê', 'báo_động', 'lũ', 'I', 'trở', 'tuyến', 'sông', 'đê', 'gọi', 'tắt', 'lực_lượng', 'tuần_tra', 'canh_gác', 'đê', '2', 'Lực_lượng', 'tuần_tra', 'canh_gác', 'đê', 'tổ_chức', 'thành', 'đội', 'Uỷ_ban_nhân_dân', 'xã', 'quyết_định', 'thành_lập', '01', '02', 'kilômét', 'đê', 'thành_lập', '01', 'đội', 'đội', '12', '18', '01', 'đội_trưởng', '01', '02', 'đội', 'phó', 'Danh_sách', 'thành_viên', 'đội', 'tuần_tra', 'canh_gác', 'đê', 'niêm_yết', 'điếm_canh', 'đê', 'địa_bàn', 'phân_công', '3', 'Khi', 'lũ', 'bão', 'diễn_biến', 'phức_tạp', 'kéo_dài', 'Uỷ_ban_nhân_dân', 'xã', 'quyết_định', 'bổ_sung', 'thành_viên', 'đội', 'tuần_tra', 'canh_gác', 'đê']


# 1. Coccurence - Matrix

In [5]:
def Vocab(sentences, min_count = 1):
    vocab = defaultdict(int)
    for sentence in sentences:
        for word in sentence:
            vocab[word] += 1
    # Only keep work with the occurence > min_count = 1
    vocab = {w : i for i, (w,c) in enumerate(vocab.items()) if c >= min_count}
    return vocab

In [6]:
def cocurrence_matrix(sentences, vocab, window_size = 5):
    matrix = defaultdict(lambda : defaultdict(float))
    for sentence in tqdm(sentences):
        sentence = [w for w in sentence if w in vocab]
        for i, word in enumerate(sentence):
            for j in range(max(0, i - window_size), min(len(sentence) - 1, i + window_size)):
                if i != j:
                    word_i = vocab[word] #map: word -> int
                    word_j = vocab[sentence[j]]
                    distance = abs(i - j)
                    matrix[word_i][word_j] += 1.0/distance #if the word is nearer, it is more important
    return matrix

# 2. Vector initialization

In [7]:
def initialize_params(vocab_size, embedding_dim):
    # we have vocab_size word, each word is represented by a vector shape (embedding_dim)
    w = np.random.rand(vocab_size, embedding_dim)/ np.sqrt(embedding_dim)
    w_tilde = np.random.rand(vocab_size, embedding_dim)/ np.sqrt(embedding_dim)
    b = np.zeros(vocab_size)
    b_tilde = np.zeros(vocab_size)

    # Gradient accumulators (AdaGrad)
    gradsq_w = np.ones_like(w)
    gradsq_w_tilde = np.ones_like(w_tilde)
    gradsq_b = np.ones_like(b)
    gradsq_b_tilde = np.ones_like(b_tilde)

    return w, w_tilde, b, b_tilde, gradsq_w, gradsq_w_tilde, gradsq_b, gradsq_b_tilde


In [8]:
def weighting_fn(x, x_max = 100, alpha = 0.75):
    return (x/x_max)**alpha if x < x_max else 1

In [9]:
import math

# 3. Training

In [10]:
def train_glove(matrix, vocab_size, embedding_dim=50, epochs=25, x_max=100, alpha=0.75, learning_rate=0.05, patience=3):
    W, W_tilde, b, b_tilde, gradsq_W, gradsq_W_tilde, gradsq_b, gradsq_b_tilde = initialize_params(vocab_size, embedding_dim)

    best_loss = float('inf')
    best_params = None
    epochs_without_improvement = 0

    for epoch in range(epochs):
        total_loss = 0
        for i in tqdm(matrix):
            for j in matrix[i]:
                X_ij = matrix[i][j]
                weight = weighting_fn(X_ij, x_max, alpha)
                inner_product = np.dot(W[i], W_tilde[j])
                loss = inner_product + b[i] + b_tilde[j] - math.log(X_ij)
                weighted_loss = weight * loss

                total_loss += 0.5 * weighted_loss ** 2

                grad = weighted_loss
                grad_W_i = grad * W_tilde[j]
                grad_W_tilde_j = grad * W[i]

                # AdaGrad update
                W[i] -= learning_rate * grad_W_i / np.sqrt(gradsq_W[i] + 1e-8)
                W_tilde[j] -= learning_rate * grad_W_tilde_j / np.sqrt(gradsq_W_tilde[j] + 1e-8)
                b[i] -= learning_rate * grad / np.sqrt(gradsq_b[i] + 1e-8)
                b_tilde[j] -= learning_rate * grad / np.sqrt(gradsq_b_tilde[j] + 1e-8)

                # Update squared gradients
                gradsq_W[i] += grad_W_i ** 2
                gradsq_W_tilde[j] += grad_W_tilde_j ** 2
                gradsq_b[i] += grad ** 2
                gradsq_b_tilde[j] += grad ** 2

        print(f"Epoch {epoch + 1}, Loss: {total_loss:.4f}")

        # Early stopping logic
        if total_loss < best_loss:
            best_loss = total_loss
            best_params = (W.copy(), W_tilde.copy(), b.copy(), b_tilde.copy())
            epochs_without_improvement = 0
        else:
            epochs_without_improvement += 1
            if epochs_without_improvement >= patience:
                print(f"Stopping early at epoch {epoch+1} due to no improvement.")
                break

    W, W_tilde, _, _ = best_params  # Restore best weights
    return W + W_tilde

In [11]:
vocab = Vocab(sentences, min_count = 1)

In [12]:
print(len(vocab))

33961


In [13]:
matrix = cocurrence_matrix(sentences, vocab, window_size = 5)

100%|██████████| 61425/61425 [00:20<00:00, 3000.30it/s]


In [None]:
vectors = train_glove(matrix, vocab_size=len(vocab), embedding_dim=100, epochs=100)

100%|██████████| 33961/33961 [03:28<00:00, 162.52it/s] 


Epoch 1, Loss: 163909.6510


100%|██████████| 33961/33961 [03:03<00:00, 184.95it/s] 


Epoch 2, Loss: 146290.2717


100%|██████████| 33961/33961 [01:02<00:00, 539.71it/s] 


Epoch 3, Loss: 135444.4122


100%|██████████| 33961/33961 [03:37<00:00, 156.38it/s] 


Epoch 4, Loss: 116118.1023


100%|██████████| 33961/33961 [04:04<00:00, 138.78it/s] 


Epoch 5, Loss: 97133.7254


100%|██████████| 33961/33961 [03:40<00:00, 154.22it/s] 


Epoch 6, Loss: 81267.8543


100%|██████████| 33961/33961 [02:33<00:00, 221.26it/s] 


Epoch 7, Loss: 69003.1394


100%|██████████| 33961/33961 [01:02<00:00, 542.81it/s] 


Epoch 8, Loss: 59653.3694


100%|██████████| 33961/33961 [01:02<00:00, 539.44it/s] 


Epoch 9, Loss: 52420.5939


100%|██████████| 33961/33961 [00:58<00:00, 583.45it/s] 


Epoch 10, Loss: 46727.9963


100%|██████████| 33961/33961 [01:00<00:00, 558.45it/s] 


Epoch 11, Loss: 42183.7444


100%|██████████| 33961/33961 [01:02<00:00, 547.43it/s] 


Epoch 12, Loss: 38513.3294


100%|██████████| 33961/33961 [01:02<00:00, 547.72it/s] 


Epoch 13, Loss: 35517.4328


100%|██████████| 33961/33961 [01:47<00:00, 315.30it/s] 


Epoch 14, Loss: 33047.3291


100%|██████████| 33961/33961 [03:47<00:00, 149.51it/s] 


Epoch 15, Loss: 30989.9750


100%|██████████| 33961/33961 [03:49<00:00, 148.21it/s] 


Epoch 16, Loss: 29258.6236


100%|██████████| 33961/33961 [03:40<00:00, 153.72it/s] 


Epoch 17, Loss: 27786.5176


100%|██████████| 33961/33961 [01:11<00:00, 473.07it/s] 


Epoch 18, Loss: 26522.2474


100%|██████████| 33961/33961 [00:59<00:00, 566.04it/s] 


Epoch 19, Loss: 25426.1286


100%|██████████| 33961/33961 [01:00<00:00, 559.09it/s] 


Epoch 20, Loss: 24467.3587


100%|██████████| 33961/33961 [01:01<00:00, 555.02it/s] 


Epoch 21, Loss: 23621.8373


100%|██████████| 33961/33961 [01:03<00:00, 534.73it/s] 


Epoch 22, Loss: 22870.5362


100%|██████████| 33961/33961 [01:00<00:00, 561.47it/s] 


Epoch 23, Loss: 22198.2952


100%|██████████| 33961/33961 [00:59<00:00, 568.46it/s] 


Epoch 24, Loss: 21592.9339


100%|██████████| 33961/33961 [01:00<00:00, 561.42it/s] 


Epoch 25, Loss: 21044.5895


100%|██████████| 33961/33961 [00:59<00:00, 567.84it/s] 


Epoch 26, Loss: 20545.2177


100%|██████████| 33961/33961 [01:01<00:00, 554.60it/s] 


Epoch 27, Loss: 20088.2105


100%|██████████| 33961/33961 [03:14<00:00, 174.81it/s] 


Epoch 28, Loss: 19668.1024


100%|██████████| 33961/33961 [01:00<00:00, 560.31it/s] 


Epoch 29, Loss: 19280.3416


100%|██████████| 33961/33961 [01:00<00:00, 559.74it/s] 


Epoch 30, Loss: 18921.1112


100%|██████████| 33961/33961 [02:42<00:00, 209.63it/s] 


Epoch 31, Loss: 18587.1886


100%|██████████| 33961/33961 [01:14<00:00, 458.76it/s] 


Epoch 32, Loss: 18275.8354


100%|██████████| 33961/33961 [01:12<00:00, 470.84it/s] 


Epoch 33, Loss: 17984.7093


100%|██████████| 33961/33961 [02:10<00:00, 259.65it/s] 


Epoch 34, Loss: 17711.7950


100%|██████████| 33961/33961 [02:18<00:00, 245.86it/s] 


Epoch 35, Loss: 17455.3476


100%|██████████| 33961/33961 [02:51<00:00, 197.92it/s] 


Epoch 36, Loss: 17213.8487


100%|██████████| 33961/33961 [03:02<00:00, 185.70it/s] 


Epoch 37, Loss: 16985.9694


100%|██████████| 33961/33961 [03:00<00:00, 187.67it/s] 


Epoch 38, Loss: 16770.5416


100%|██████████| 33961/33961 [02:57<00:00, 190.82it/s] 


Epoch 39, Loss: 16566.5330


100%|██████████| 33961/33961 [02:58<00:00, 190.69it/s] 


Epoch 40, Loss: 16373.0281


100%|██████████| 33961/33961 [02:59<00:00, 188.74it/s] 


Epoch 41, Loss: 16189.2112


100%|██████████| 33961/33961 [02:58<00:00, 190.26it/s] 


Epoch 42, Loss: 16014.3529


100%|██████████| 33961/33961 [02:55<00:00, 193.30it/s] 


Epoch 43, Loss: 15847.7987


100%|██████████| 33961/33961 [03:00<00:00, 188.47it/s] 


Epoch 44, Loss: 15688.9591


100%|██████████| 33961/33961 [03:00<00:00, 187.65it/s] 


Epoch 45, Loss: 15537.3017


100%|██████████| 33961/33961 [02:08<00:00, 265.07it/s] 


Epoch 46, Loss: 15392.3442


100%|██████████| 33961/33961 [03:05<00:00, 183.49it/s] 


Epoch 47, Loss: 15253.6481


100%|██████████| 33961/33961 [02:37<00:00, 215.12it/s] 


Epoch 48, Loss: 15120.8141


100%|██████████| 33961/33961 [02:56<00:00, 192.50it/s] 


Epoch 49, Loss: 14993.4776


100%|██████████| 33961/33961 [02:36<00:00, 217.66it/s] 


Epoch 50, Loss: 14871.3045


100%|██████████| 33961/33961 [02:56<00:00, 192.36it/s] 


Epoch 51, Loss: 14753.9882


100%|██████████| 33961/33961 [03:19<00:00, 170.50it/s] 


Epoch 52, Loss: 14641.2469


100%|██████████| 33961/33961 [03:08<00:00, 180.20it/s] 


Epoch 53, Loss: 14532.8206


100%|██████████| 33961/33961 [03:09<00:00, 179.68it/s] 


Epoch 54, Loss: 14428.4696


100%|██████████| 33961/33961 [03:06<00:00, 181.70it/s] 


Epoch 55, Loss: 14327.9715


100%|██████████| 33961/33961 [02:10<00:00, 260.16it/s] 


Epoch 56, Loss: 14231.1208


100%|██████████| 33961/33961 [01:06<00:00, 510.09it/s] 


Epoch 57, Loss: 14137.7261


100%|██████████| 33961/33961 [01:05<00:00, 521.79it/s] 


Epoch 58, Loss: 14047.6098


100%|██████████| 33961/33961 [01:04<00:00, 525.71it/s] 


Epoch 59, Loss: 13960.6062


100%|██████████| 33961/33961 [01:03<00:00, 533.67it/s] 


Epoch 60, Loss: 13876.5608


100%|██████████| 33961/33961 [01:03<00:00, 536.77it/s] 


Epoch 61, Loss: 13795.3293


100%|██████████| 33961/33961 [01:02<00:00, 541.19it/s] 


Epoch 62, Loss: 13716.7767


100%|██████████| 33961/33961 [01:04<00:00, 526.04it/s] 


Epoch 63, Loss: 13640.7764


100%|██████████| 33961/33961 [01:03<00:00, 538.70it/s] 


Epoch 64, Loss: 13567.2098


100%|██████████| 33961/33961 [01:00<00:00, 563.82it/s] 


Epoch 65, Loss: 13495.9655


100%|██████████| 33961/33961 [00:59<00:00, 570.26it/s] 


Epoch 66, Loss: 13426.9388


100%|██████████| 33961/33961 [00:59<00:00, 566.66it/s] 


Epoch 67, Loss: 13360.0310


100%|██████████| 33961/33961 [01:02<00:00, 545.64it/s] 


Epoch 68, Loss: 13295.1494


100%|██████████| 33961/33961 [02:38<00:00, 214.17it/s] 


Epoch 69, Loss: 13232.2063


100%|██████████| 33961/33961 [01:26<00:00, 392.04it/s] 


Epoch 70, Loss: 13171.1190


100%|██████████| 33961/33961 [00:58<00:00, 577.87it/s] 


Epoch 71, Loss: 13111.8097


100%|██████████| 33961/33961 [00:59<00:00, 566.49it/s] 


Epoch 72, Loss: 13054.2043


100%|██████████| 33961/33961 [01:01<00:00, 555.67it/s] 


Epoch 73, Loss: 12998.2330


100%|██████████| 33961/33961 [01:00<00:00, 559.77it/s] 


Epoch 74, Loss: 12943.8296


 10%|▉         | 3251/33961 [00:38<03:10, 161.56it/s]