# **PhoBERT**

In [None]:
import os
from getpass import getpass

from huggingface_hub import login

# Never commit access tokens to a notebook: anyone with read access to the file
# can use them. The token is taken from the HF_TOKEN environment variable, and
# if that is unset the user is prompted for it (input is not echoed or stored).
# NOTE(review): any token that was previously hard-coded in this cell should be
# treated as leaked and revoked in the Hugging Face account settings.
access_token = os.environ.get("HF_TOKEN") or getpass("Hugging Face access token: ")
login(access_token)

In [1]:
import json
import os

from qdrant_client import QdrantClient
from qdrant_client.models import Distance, PointStruct, VectorParams
from sentence_transformers import InputExample, SentenceTransformer, losses, models
from torch.utils.data import DataLoader




In [2]:
# Project layout, resolved relative to the directory the notebook is run from:
# the project root is assumed to be the parent of the notebook directory.
NOTEBOOK_DIR = os.getcwd()
PROJECT_DIR = os.path.abspath(os.path.join(NOTEBOOK_DIR, ".."))
DATASET_DIR = os.path.join(PROJECT_DIR, "dataset")
VECTORIZER_DIR = os.path.join(PROJECT_DIR, "vectorizer")

# Directory holding the VnCoreNLP jar and models. It lives outside the project,
# so it can be overridden per machine with the VNCORENLP_DIR environment
# variable; the previous hard-coded location remains the default.
MODEL_DIR = os.environ.get("VNCORENLP_DIR", "D:/VnCoreNLP")

# Processed legal corpus (one entry per law, each with its articles).
corpus_path = os.path.join(DATASET_DIR, "processed_legal_corpus.json")

### **Create sentence encoder**

The encoder is the pretrained PhoBERT model.

Mean pooling over the token embeddings produces the sentence vector.

In [None]:
# Step 1: load the PhoBERT encoder, with max_seq_length set to 128.
phobert = models.Transformer("vinai/phobert-base-v2", max_seq_length=128)

# Step 2: add a pooling layer that averages the token embeddings into a single
# sentence embedding; its width is taken from the encoder itself.
embedding_dim = phobert.get_word_embedding_dimension()
pooling = models.Pooling(word_embedding_dimension=embedding_dim, pooling_mode_mean_tokens=True)

# Step 3: chain encoder and pooling into one sentence-embedding model.
model = SentenceTransformer(modules=[phobert, pooling])

### **Train data**

In [None]:
# Read the preprocessed training set from the dataset directory.
data_path = os.path.join(DATASET_DIR, "processed_train_data.json")

with open(data_path, mode="r", encoding="utf-8") as train_file:
    data = json.load(train_file)

# Number of training records that were loaded.
print(len(data))

In [None]:
# Split sizes: the first 80% of the records are used for training and the
# remainder is counted as the test portion.
n = len(data)
train_num = int((n / 10) * 8)
test_num = n - train_num

# Wrap each training record in an InputExample.
# NOTE(review): presumably every record is a list of texts (e.g. a
# question/passage pair) as MultipleNegativesRankingLoss expects — confirm
# against the preprocessing step.
train_data = [InputExample(texts=record) for record in data[:train_num]]

train_dataloader = DataLoader(train_data, shuffle=True, batch_size=2)

print(train_num)
print(test_num)

In [None]:
# Fine-tune the sentence encoder with MultipleNegativesRankingLoss on the
# training dataloader.
train_loss = losses.MultipleNegativesRankingLoss(model)
training_objective = (train_dataloader, train_loss)

model.fit(
    train_objectives=[training_objective],
    epochs=3,
    warmup_steps=10,
    show_progress_bar=True,
)

In [None]:
# Save the fine-tuned model into the vectorizer directory, which is the same
# location the notebook later loads the model from. Saving to a different,
# cwd-relative folder meant a fresh run would not load the model just trained.
model.save(os.path.join(VECTORIZER_DIR, "phobert"))

### **Load the model**

In [3]:
# Reload the fine-tuned model for inference.
# No explicit device is passed: sentence-transformers then picks a GPU when one
# is available and falls back to CPU otherwise, so the notebook also runs on
# machines without CUDA instead of failing on a hard-coded 'cuda'.
sentence_embedded_model = SentenceTransformer(os.path.join(VECTORIZER_DIR, "phobert"))

You try to use a model that was created with version 4.1.0, however, your version is 3.4.1. This might cause unexpected behavior or errors. In that case, try to update to the latest version.





In [4]:
# Show which device the model was placed on (e.g. cuda:0 or cpu).
print(sentence_embedded_model.device)

cuda:0


In [5]:
# Load the processed legal corpus and flatten it to one entry per article.
with open(corpus_path, mode="r", encoding="utf-8") as json_file:
    data = json.load(json_file)

# Two parallel lists, aligned by position:
#   content[k] - text of the k-th article (its "segment_only" field)
#   index[k]   - "<law_id> <article_id>" key, e.g. "01/2009/tt-bnn 1"
content = []
index = []

for document in data:
    law_id = document["law_id"]
    for article in document["articles"]:
        content.append(article["segment_only"])
        index.append(" ".join((law_id, article["article_id"])))

# Both counts must match: one key per article text.
print(len(content))
print(len(index))

61425
61425


In [6]:
# Embed every article text; the result has one row per article.
array = sentence_embedded_model.encode(
    content,
    batch_size=4,
    show_progress_bar=True,
)

print(array.shape)

Batches:   0%|          | 0/15357 [00:00<?, ?it/s]

(61425, 768)


In [7]:
# 2. Kết nối Qdrant
client = QdrantClient(host="localhost", port=6333)
client.recreate_collection(
    collection_name="PhoBERT_Embedded_Law_Retrieval",
    vectors_config=VectorParams(size=array.shape[1], distance=Distance.COSINE) # Using Cosinesimilarity for searching vector
)
payloads = [{"law_id": index[i].split(" ")[0], "article_id": index[i].split(" ")[1], "doc_id": i} for i in range(len(index))]
# points = [(i, vectors[i], payloads[i]) for i in range(len(documents))]
points = [
    {
        "id": i,
        "vector": array[i],
        "payload": payloads[i]
    }
    for i in range(len(content))
]
for i in range(0, len(points), 100):
    client.upsert(collection_name="PhoBERT_Embedded_Law_Retrieval", points=points[i:i+100])

  client.recreate_collection(


## **USING**

In [8]:
import py_vncorenlp
from sklearn.preprocessing import normalize

In [9]:
# Only word segmentation is used in this notebook (see `clean_query`), so load
# just the "wseg" annotator instead of the default full pipeline, which makes
# start-up lighter. MODEL_DIR must contain the downloaded VnCoreNLP files.
vncoreNLP_model = py_vncorenlp.VnCoreNLP(annotators=["wseg"], save_dir=MODEL_DIR)

In [10]:

def clean_query(query):
    """Word-segment a raw query with VnCoreNLP and return it as one string.

    Args:
        query: Raw query text.

    Returns:
        The items returned by ``vncoreNLP_model.word_segment(query)`` joined
        with single spaces.
    """
    return " ".join(vncoreNLP_model.word_segment(query))

def vectorize_query(query):
    """Turn a raw query into a normalized embedding vector.

    The query is first word-segmented with ``clean_query``, then encoded with
    ``sentence_embedded_model`` and finally scaled with scikit-learn's
    ``normalize`` (default settings).

    Args:
        query: Raw query text.

    Returns:
        The normalized 1-D embedding of the query.
    """
    embedding = sentence_embedded_model.encode(clean_query(query))
    # `normalize` works on 2-D input: view the vector as a single row, scale
    # it, then take that row back out.
    unit_rows = normalize(embedding.reshape(1, -1))
    return unit_rows[0]
    

In [11]:
# Example question to retrieve relevant law articles for.
query = "Trách nhiệm của Bộ Khoa học và Công nghệ về quản lý và phát triển công nghiệp an ninh được quy định như thế nào?"

query_vector = vectorize_query(query)

# `client.search` is deprecated in qdrant-client (it emits a
# DeprecationWarning); `query_points` is its replacement and returns the scored
# points under `.points`.
hits = client.query_points(
    collection_name="PhoBERT_Embedded_Law_Retrieval",
    query=query_vector.tolist(),
    limit=10,
).points

# Print the ten best matches with their similarity score and article key.
for hit in hits:
    print(f"Score: {hit.score:.4f} | law_id: {hit.payload['law_id']} | article_id: {hit.payload['article_id']} ")

Score: 0.7815 | law_id: 63/2020/nđ-cp | article_id: 20 
Score: 0.7489 | law_id: 63/2020/nđ-cp | article_id: 19 
Score: 0.7371 | law_id: 63/2020/nđ-cp | article_id: 1 
Score: 0.6894 | law_id: 63/2020/nđ-cp | article_id: 21 
Score: 0.6771 | law_id: 63/2020/nđ-cp | article_id: 4 
Score: 0.6401 | law_id: 63/2020/nđ-cp | article_id: 14 
Score: 0.6203 | law_id: 63/2020/nđ-cp | article_id: 18 
Score: 0.6135 | law_id: 63/2020/nđ-cp | article_id: 7 
Score: 0.6130 | law_id: 63/2020/nđ-cp | article_id: 9 
Score: 0.5894 | law_id: 63/2020/nđ-cp | article_id: 10 


  hits = client.search(
