# **TF_IDF & BM25**

In [None]:
import os
import json
import joblib
import pickle
import py_vncorenlp
import re
import string
from qdrant_client import QdrantClient
from qdrant_client.http.models import Filter, FieldCondition, MatchAny
from rank_bm25 import BM25Okapi
from sklearn.preprocessing import normalize


In [2]:
# All project paths are resolved relative to the parent of the current
# working directory.
current_dir = os.getcwd()
PROJECT_DIR = os.path.dirname(current_dir)
DATASET_DIR = os.path.join(PROJECT_DIR, "dataset")
VECTORIZER_DIR = os.path.join(PROJECT_DIR, "vectorizer")
data_path = os.path.join(DATASET_DIR, "processed_legal_corpus.json")
# Folder that holds the VnCoreNLP model files. This is machine-specific:
# set the VNCORENLP_DIR environment variable to point at your own copy,
# otherwise the original default location is used.
MODEL_DIR = os.environ.get("VNCORENLP_DIR", "D:/VnCoreNLP")

In [3]:
# Read the pre-processed legal corpus (a list of law documents, each with
# a list of articles) from disk.
with open(data_path, "r", encoding="utf-8") as json_file:
    data = json.load(json_file)

# Parallel lists: content[i] is the cleaned article text and index[i] is
# its identifier, e.g. "01/2009/tt-bnn 1" (law_id + article_id).
content = []
index = []

for document in data:
    law_id = document["law_id"]
    for article in document["articles"]:
        article_id = article["article_id"]
        text = article["processed_in4"]
        # Keep every whitespace-separated token that is not found inside
        # string.punctuation, then glue the survivors back together.
        clean_tokens = []
        for token in text.split():
            if token in string.punctuation:
                continue
            clean_tokens.append(token)
        clean_text = ' '.join(clean_tokens)
        content.append(clean_text)
        index.append(" ".join((law_id, article_id)))

# Sanity check: both lists must have the same length.
print(len(content))
print(len(index))

61425
61425


In [None]:
# Whitespace-tokenise every article and fit the BM25 index on the result.
tokenized_corpus = list(map(str.split, content))
bm25 = BM25Okapi(tokenized_corpus)

# Persist the fitted index next to the other vectorizers so that it can be
# reloaded without refitting.
bm25_path = os.path.join(VECTORIZER_DIR, "bm25.pkl")
with open(bm25_path, "wb") as f:
    pickle.dump(bm25, f)

In [4]:
# Reload the fitted TF-IDF vectorizer and BM25 index from VECTORIZER_DIR.
# NOTE: both joblib and pickle execute code embedded in the file while
# loading, so only load artefacts that you produced yourself.
tfidf_path = os.path.join(VECTORIZER_DIR, 'tfidf_vectorizer.pkl')
bm25_path = os.path.join(VECTORIZER_DIR, "bm25.pkl")

tf_idf_vectorizer = joblib.load(tfidf_path)
with open(bm25_path, "rb") as f:
    bm25 = pickle.load(f)

In [5]:
## Initialise the word-segmentation model and build the regex that removes stop words
model = py_vncorenlp.VnCoreNLP(save_dir= MODEL_DIR)

# One stop word per line; surrounding whitespace (including the newline) is stripped.
stopwords_path = os.path.join(DATASET_DIR, "stopwords_processed.txt")
with open(stopwords_path, "r", encoding="utf-8") as f:
    stopwords_list = [line.strip() for line in f]

# Alternation of all (regex-escaped) stop words, anchored on word boundaries
# so that only whole words are matched.
escaped_stopwords = [re.escape(word) for word in stopwords_list]
pattern = r"\b(" + "|".join(escaped_stopwords) + r")\b"

In [9]:
# Client for the Qdrant instance listening on localhost:6333.
QDRANT_HOST, QDRANT_PORT = "localhost", 6333
client = QdrantClient(host=QDRANT_HOST, port=QDRANT_PORT)

In [None]:
def precessing_query(query, segmented_model):
    """Clean a raw query so that it matches the tokens used by the indexes.

    Steps: word-segment the query, delete stop words (via the module-level
    ``pattern`` regex), collapse whitespace, and drop tokens that are found
    inside ``string.punctuation``.

    Args:
        query: Raw query string.
        segmented_model: Object exposing ``word_segment(text)``; its result
            is joined with single spaces.

    Returns:
        The cleaned query as one space-separated string.
    """
    # Segment, then flatten the segmenter's output into a single string.
    segmented = " ".join(segmented_model.word_segment(query))

    # Strip stop words, then squeeze the whitespace they leave behind.
    without_stopwords = re.sub(pattern, "", segmented)
    collapsed = re.sub(r"\s+", " ", without_stopwords).strip()

    # Discard punctuation tokens.
    kept_tokens = []
    for token in collapsed.split():
        if token in string.punctuation:
            continue
        kept_tokens.append(token)

    return ' '.join(kept_tokens)

In [None]:
def get_result(query, vectorizer, bm25, segmented_model, client, N = 50,
               alpha = 0.7, collection_name = "tfidf_search_Law_document_retrivial"):
    """Rank articles for ``query`` by blending a Qdrant TF-IDF score with BM25.

    The BM25 index first selects the ``N`` best-scoring document positions.
    Qdrant is then searched with the L2-normalised TF-IDF query vector,
    restricted to points whose ``doc_id`` payload is one of those positions.
    Each hit is finally re-scored as
    ``alpha * qdrant_score + (1 - alpha) * normalised_bm25_score``.

    Prerequisites (all supplied by the caller):
        - a connected Qdrant client,
        - the loaded word-segmentation model,
        - the fitted BM25 index,
        - the fitted TF-IDF vectorizer.

    Args:
        query: Raw query string.
        vectorizer: Fitted TF-IDF vectorizer exposing ``transform``.
        bm25: Fitted BM25 index exposing ``get_scores``.
        segmented_model: Word segmenter passed on to ``precessing_query``.
        client: Qdrant client used for the vector search.
        N: Number of BM25 candidates, and maximum number of returned hits.
        alpha: Weight of the Qdrant score in the combined score; the BM25
            score receives ``1 - alpha``.
        collection_name: Qdrant collection to search.

    Returns:
        A list of dicts with keys ``id``, ``score``, ``qdrant_score``,
        ``bm25_score`` (normalised) and ``payload``, sorted by ``score`` in
        descending order. The list is empty when nothing is left of the
        query after preprocessing or when there are no BM25 candidates.
    """
    cleaned_query = precessing_query(query, segmented_model)
    query_tokens = cleaned_query.split()
    if not query_tokens:
        # Nothing left to search for (query was only stop words/punctuation).
        return []

    # Pre-filter with BM25: positions of the N highest-scoring documents.
    # sorted() is stable, so ties keep their ascending-index order.
    bm25_scores = bm25.get_scores(query_tokens)
    top_n_idx = sorted(
        range(len(bm25_scores)), key=bm25_scores.__getitem__, reverse=True
    )[:N]
    if not top_n_idx:
        return []

    # The first candidate carries the highest BM25 score of the whole corpus,
    # so it is the normalisation constant; fall back to 1 to avoid dividing
    # by zero when no document matches any query token.
    max_bm25 = bm25_scores[top_n_idx[0]] or 1

    # Unit-length TF-IDF vector, converted from a numpy array to a plain list.
    query_vec = vectorizer.transform([cleaned_query])
    query_vec = normalize(query_vec).toarray()[0].tolist()

    # NOTE(review): newer qdrant-client releases appear to deprecate
    # ``search`` in favour of ``query_points`` - confirm before upgrading.
    hits = client.search(
        collection_name=collection_name,
        query_vector=query_vec,
        limit=N,  # never ask for more hits than there are candidates
        with_payload=True,
        query_filter=Filter(
            must=[
                FieldCondition(
                    key="doc_id",
                    match=MatchAny(any=top_n_idx)  # list of candidate IDs
                )
            ]
        )
    )

    # Combine the TF-IDF and BM25 scores and re-rank the hits.
    final_results = []
    for hit in hits:
        # Similarity reported by Qdrant (presumably cosine, depending on
        # how the collection was configured).
        qdrant_score = hit.score
        doc_id = hit.id
        # Assumes the Qdrant point id equals the document's position in the
        # BM25 corpus (and its ``doc_id`` payload) - verify against the
        # indexing code.
        bm25_score_norm = bm25_scores[int(doc_id)] / max_bm25

        combined_score = alpha * qdrant_score + (1 - alpha) * bm25_score_norm

        final_results.append({
            "id": doc_id,
            "score": combined_score,
            "qdrant_score": qdrant_score,
            "bm25_score": bm25_score_norm,
            "payload": hit.payload
        })

    # Best combined score first.
    final_results.sort(key=lambda x: x["score"], reverse=True)

    return final_results

In [39]:
# Example query (Vietnamese legal question) run through the full pipeline.
query = "Thời gian, hình thức giáo dục pháp luật, giáo dục công dân trong trại giam của phạm nhân được quy định như thế nào?"
final_results = get_result(
    query,
    tf_idf_vectorizer,
    bm25,
    model,
    client,
    N=50,
)

  hits = client.search(


In [40]:
# Show the ten best-ranked articles with their combined score.
for result in final_results[:10]:
    score = result['score']
    payload = result['payload']
    print(f"Score: {score} | law_id : {payload['law_id']} | article_id : {payload['article_id']}")

Score: 0.79932785 | law_id : 02/2012/ttlt-bca-bqp-btp-bgdđt | article_id : 9
Score: 0.6885901076850319 | law_id : 02/2012/ttlt-bca-bqp-btp-bgdđt | article_id : 10
Score: 0.6641323449370763 | law_id : 02/2012/ttlt-bca-bqp-btp-bgdđt | article_id : 11
Score: 0.6089688159535045 | law_id : 02/2012/ttlt-bca-bqp-btp-bgdđt | article_id : 16
Score: 0.6066579030969421 | law_id : 133/2020/nđ-cp | article_id : 17
Score: 0.6062928282950482 | law_id : 12/2013/ttlt-bca-bqp-btc | article_id : 4
Score: 0.5758304497355422 | law_id : 02/2012/ttlt-bca-bqp-btp-bgdđt | article_id : 7
Score: 0.567360324472767 | law_id : 133/2020/nđ-cp | article_id : 12
Score: 0.5487301813158554 | law_id : 133/2020/nđ-cp | article_id : 16
Score: 0.5460582103428242 | law_id : 53/2010/qh12 | article_id : 28
