# **TF-IDF Vectorizer**

### **Set up the working environment**

In [None]:
import os
import json
import joblib
import py_vncorenlp
import re
# import numpy as np
from qdrant_client import QdrantClient
from qdrant_client.models import VectorParams, Distance
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import normalize


In [None]:
# Project layout: the notebook is expected to run from a sub-folder of the
# project, so the project root is the parent of the current working directory.
current_dir = os.getcwd()
PROJECT_DIR = os.path.dirname(current_dir)

# Folders holding the input data and the fitted vectorizer.
DATASET_DIR = os.path.join(PROJECT_DIR, "dataset")
VECTORIZER_DIR = os.path.join(PROJECT_DIR, "vectorizer")

# Pre-processed legal corpus (JSON) inside the dataset folder.
data_path = os.path.join(DATASET_DIR, "processed_legal_corpus.json")

# Machine-specific: change this to the folder where VnCoreNLP is stored locally.
MODEL_DIR = "D:/VnCoreNLP"

In [None]:
## Initialise the word-segmentation model and build the stop-word removal pattern
model = py_vncorenlp.VnCoreNLP(save_dir=MODEL_DIR)

# One stop word per line. Blank lines are skipped so they do not end up as
# empty alternatives inside the regular expression.
with open(os.path.join(DATASET_DIR, "stopwords_processed.txt"), "r", encoding="utf-8") as f:
    stopwords_list = [word for word in map(str.strip, f) if word]

# Regex alternation picks the first alternative that matches, not the longest,
# so the alternatives are de-duplicated and ordered longest-first: a short stop
# word can then never pre-empt a longer phrase that starts with it.
# `stopwords_list` itself keeps the file order.
_stopword_alternatives = sorted(dict.fromkeys(stopwords_list), key=len, reverse=True)
pattern = r"\b(" + "|".join(map(re.escape, _stopword_alternatives)) + r")\b"

**Read the corpus**

In [None]:
# Load the pre-processed corpus from disk.
with open(data_path, "r", encoding="utf-8") as json_file:
    data = json.load(json_file)

# Flatten the corpus into two parallel lists, one entry per article:
#   content[i] -> the article's pre-processed text ("processed_in4")
#   index[i]   -> "<law_id> <article_id>", e.g. "01/2009/tt-bnn 1"
content = []
index = []

for document in data:
    law_id = document["law_id"]
    for article in document["articles"]:
        article_id = article["article_id"]
        content.append(article["processed_in4"])
        index.append(" ".join((law_id, article_id)))

# Both lists should report the same length.
print(len(content), len(index), sep="\n")


## **Vectorization with TF-IDF**

In [None]:
# 1. TF-IDF vectorization
# Fit the vectorizer on every article and encode the corpus in one pass.
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(content)

# Normalise each row so that cosine similarity behaves well, then convert to a
# dense array because the upload step indexes `vectors` row by row.
# NOTE(review): .toarray() materialises a dense (documents x vocabulary) matrix,
# which can be very large for a big corpus — confirm it fits in memory.
vectors = normalize(tfidf_matrix).toarray()

# Save the fitted vectorizer. The target folder is created first so that a
# fresh checkout without a "vectorizer" directory does not fail after the fit.
os.makedirs(VECTORIZER_DIR, exist_ok=True)
joblib.dump(vectorizer, os.path.join(VECTORIZER_DIR, 'tfidf_vectorizer.pkl'))

In [None]:
# 2. Connect to the Qdrant server running on localhost, port 6333
client = QdrantClient(
    host="localhost",
    port=6333,
)

In [None]:
# 3. Create the collection
# Every stored vector has one component per TF-IDF vocabulary term, and
# similarity between vectors is measured with cosine distance.
# NOTE(review): recreate_collection presumably replaces any existing collection
# with the same name — confirm before running against data you want to keep.
tfidf_dimension = vectors.shape[1]
client.recreate_collection(
    collection_name="tfidf_search_Law_document_retrivial",
    vectors_config=VectorParams(size=tfidf_dimension, distance=Distance.COSINE),
)

In [None]:
# 4. Add the data
# Each entry of `index` has the form "<law_id> <article_id>". It is split once,
# on the first space only, so an article_id that itself contains a space is
# kept whole instead of being cut at its first word.
# NOTE(review): this assumes law_id never contains a space — confirm against
# the corpus.
payloads = []
points = []
for point_id, (doc, vector) in enumerate(zip(index, vectors)):
    law_id, article_id = doc.split(" ", 1)
    payload = {"law_id": law_id, "article_id": article_id}
    payloads.append(payload)
    points.append(
        {
            "id": point_id,
            # Send plain Python floats rather than a NumPy row.
            "vector": vector.tolist(),
            "payload": payload,
        }
    )

# Upload in batches of 100 points to keep each request small.
batch_size = 100
for start in range(0, len(points), batch_size):
    client.upsert(
        collection_name="tfidf_search_Law_document_retrivial",
        points=points[start:start + batch_size],
    )

## **Usage**

In [None]:
# 5. Enter a query and turn it into a vector
query = "Đập phá biển báo “khu vực biên giới” bị phạt thế nào?"

# Segment the query into words with the segmentation model.
query_list = model.word_segment(query)

# Re-join the segments, remove stop words, then collapse the leftover runs of
# whitespace into single spaces and trim both ends.
query = re.sub(r"\s+", " ", re.sub(pattern, "", " ".join(query_list))).strip()

# Encode with the fitted vectorizer and normalise, then take the single row as
# a 1-D dense vector.
query_vec = normalize(vectorizer.transform([query])).toarray()[0]

# 6. Query Qdrant for the 10 closest articles
hits = client.search(
    collection_name="tfidf_search_Law_document_retrivial",
    query_vector=query_vec,
    limit=10,
)

# 7. Print the results: similarity score plus the identifiers kept in the payload
for hit in hits:
    payload = hit.payload
    print(f"Score: {hit.score:.4f} | law_id: {payload['law_id']} | article_id: {payload['article_id']} ")