# **Combination between Glove and TF_IDF**

In [1]:
import json
import os
import py_vncorenlp
import numpy as np
import re
import joblib
import string

from qdrant_client import QdrantClient
from qdrant_client.models import VectorParams, Distance
from sklearn.preprocessing import normalize

In [2]:
current_dir = os.getcwd()
PROJECT_DIR = os.path.dirname(current_dir)
data_path = os.path.join(PROJECT_DIR, "dataset", "processed_legal_corpus.json")
MODEL_DIR = "D:/VnCoreNLP" # This is belong to your file location 
DATASET_DIR = os.path.join(PROJECT_DIR, "dataset")
VECTORIZER_DIR = os.path.join(PROJECT_DIR, "vectorizer")

In [3]:
with open(data_path, "r", encoding="utf-8") as json_file:
    data = json.load(json_file)

# Danh sách văn bản
content = []
index = [] # "01/2009/tt-bnn 1" (law_id + article_id)

for document in data:
    law_id = document["law_id"]
    for article in document["articles"]:
        article_id = article["article_id"]
        text = article["processed_in4"]
        clean_tokens = [token for token in text.split() if token not in string.punctuation]
        clean_text = ' '.join(clean_tokens)
        content.append(clean_text)
        index.append(law_id + " " + article_id)
print(len(content))
print(len(index))

61425
61425


In [4]:
Glove_path = os.path.join(VECTORIZER_DIR, "Glove_word_embed.json")
with open(Glove_path, 'r', encoding='utf-8') as f:
    Glove_word_embed = json.load(f)

# Load tf_idf for vectorizer
tfidf_vectorizer = joblib.load(os.path.join(VECTORIZER_DIR, 'tfidf_vectorizer.pkl'))

In [9]:
def get_weighted_sentence_vector(sentence, Glove_word_embed: dict, tfidf_vectorizer):
    tfidf_scores = tfidf_vectorizer.transform([sentence])
    feature_names = tfidf_vectorizer.get_feature_names_out()
    tfidf_dict = {
        feature_names[col]: tfidf_scores[0, col]
        for col in tfidf_scores.nonzero()[1]
    }

    words = sentence.split()
    word_vecs = []
    weights = []
    for word in words:
        if word in list(Glove_word_embed.keys()) and word in tfidf_dict:
            vec = np.array(Glove_word_embed[word], dtype=np.float64)
            weight = tfidf_dict[word]
            word_vecs.append(vec * weight)
            weights.append(weight)

    if word_vecs:
        return np.sum(word_vecs, axis=0) / np.sum(weights)
    else:
        return np.zeros(len(Glove_word_embed[list(Glove_word_embed.keys())[0]]))

In [10]:
sentence_vectors = np.array([
    get_weighted_sentence_vector(sentence, Glove_word_embed, tfidf_vectorizer)
    for sentence in content
])

normalized_vectors = normalize(sentence_vectors)
print(normalized_vectors.shape)

(61425, 100)


In [11]:
# 2. Connect to Qdrant
client = QdrantClient(host="localhost", port=6333, timeout=60.0)

# 3. Tạo collection
client.recreate_collection(
    collection_name="Glove_Law_document_retrivial",
    vectors_config=VectorParams(size=normalized_vectors.shape[1], distance=Distance.COSINE) # Using Cosinesimilarity for searching vector
)

# 4. Thêm dữ liệu
payloads = [{"law_id": doc.split(" ")[0], "article_id": doc.split(" ")[1]} for doc in index]
points = [
    {
        "id": i,
        "vector": normalized_vectors[i],
        "payload": payloads[i]
    }
    for i in range(len(content))
]

print(len(points))
print(len(payloads))

  client.recreate_collection(


61425
61425


In [13]:
# Upload the data to Qdrant collection
for i in range(0, len(points), 100):
    client.upsert(collection_name="Glove_Law_document_retrivial", points=points[i:i+100])

# **USING**

In [14]:
## Initial the segmentation model and the pattern to remove the stop word
vncorenlp_model = py_vncorenlp.VnCoreNLP(save_dir= MODEL_DIR)
with open(os.path.join(DATASET_DIR, "stopwords_processed.txt"), "r", encoding="utf-8") as f:
    stopwords_list = list(map(str.strip, f))

pattern = r"\b(" + "|".join(map(re.escape, stopwords_list)) + r")\b"

In [15]:
def clean_query(query, segmentation_model):
    query_list = segmentation_model.word_segment(query) # segment
    query = " ".join(query_list) 
    query = re.sub(pattern, "", query) # remove stop word
    query = re.sub(r"\s+", " ", query).strip() # Remove the redundancy
    query_word = [token for token in query.split() if token not in string.punctuation]
    cleaned_query = ' '.join(query_word)

    return cleaned_query

In [17]:
# 5. Nhập câu truy vấn và biến thành vector
query = "Đập phá biển báo “khu vực biên giới” bị phạt thế nào?"
cleaned_query = clean_query(query, vncorenlp_model)
query_vec = get_weighted_sentence_vector(cleaned_query, Glove_word_embed, tfidf_vectorizer)
query_vec = normalize(query_vec.reshape(1, -1))[0]

# 6. Truy vấn Qdrant
hits = client.search(
    collection_name="Glove_Law_document_retrivial",
    query_vector=query_vec,
    limit=10
)

# 7. In kết quả
for hit in hits:
    print(f"Score: {hit.score:.4f} | law_id: {hit.payload['law_id']} | article_id: {hit.payload['article_id']} ")

Score: 0.8234 | law_id: 71/2015/nđ-cp | article_id: 5 
Score: 0.8090 | law_id: 34/2014/nđ-cp | article_id: 9 
Score: 0.7946 | law_id: 09/2016/tt-bqp | article_id: 5 
Score: 0.7752 | law_id: 43/2015/tt-bqp | article_id: 5 
Score: 0.7550 | law_id: 47/2015/tt-bqp | article_id: 7 
Score: 0.7419 | law_id: 96/2020/nđ-cp | article_id: 10 
Score: 0.7145 | law_id: 43/2015/tt-bqp | article_id: 12 
Score: 0.6965 | law_id: 71/2015/nđ-cp | article_id: 17 
Score: 0.6933 | law_id: 32/2014/tt-bgtvt | article_id: 12 
Score: 0.6807 | law_id: 71/2015/nđ-cp | article_id: 3 


  hits = client.search(
