In [None]:

!pip install underthesea

Collecting underthesea
  Downloading underthesea-6.8.4-py3-none-any.whl.metadata (15 kB)
Collecting python-crfsuite>=0.9.6 (from underthesea)
  Downloading python_crfsuite-0.9.11-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.3 kB)
Collecting underthesea-core==1.0.4 (from underthesea)
  Downloading underthesea_core-1.0.4-cp310-cp310-manylinux2010_x86_64.whl.metadata (1.7 kB)
Downloading underthesea-6.8.4-py3-none-any.whl (20.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m20.9/20.9 MB[0m [31m53.9 MB/s[0m eta [36m0:00:00[0m:00:01[0m0:01[0mm
[?25hDownloading underthesea_core-1.0.4-cp310-cp310-manylinux2010_x86_64.whl (657 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m657.8/657.8 kB[0m [31m21.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading python_crfsuite-0.9.11-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.2/1.2 MB[0m [31m33.6

In [None]:
# BM25 Implementation

In [18]:
# Precompute TF values
import pandas as pd
import numpy as np
from collections import Counter
from scipy.sparse import csr_matrix, save_npz
import json

def compute_tf_matrix(corpus_csv, vocab_csv, output_prefix="bm25"):
    """Build a sparse document-term frequency matrix and save it to disk.

    Row ``i`` of the matrix corresponds to row ``i`` of ``corpus_csv``; column
    ``j`` corresponds to the vocabulary word whose index is ``j`` in the saved
    vocabulary mapping. Tokens that are not in the vocabulary are ignored.

    Args:
        corpus_csv: Path to a CSV with a ``context_tokenized`` column holding
            whitespace-separated tokens.
        vocab_csv: Path to a CSV with a ``word`` column.
        output_prefix: Prefix for the two output files,
            ``{output_prefix}_tf.npz`` and ``{output_prefix}_vocab_index.json``.

    Returns:
        None. The results are written to disk and two status lines are printed.
    """
    # keep_default_na=False stops pandas from parsing empty cells and tokens
    # such as "null", "NA" or "nan" as NaN, which astype(str) would then turn
    # into the bogus token "nan".
    df_corpus = pd.read_csv(corpus_csv, keep_default_na=False)
    df_vocab = pd.read_csv(vocab_csv, keep_default_na=False)

    # Get tokenized texts and vocab words
    tokenized_docs = df_corpus["context_tokenized"].astype(str).apply(str.split).tolist()
    vocab_words = df_vocab["word"].astype(str).tolist()

    # Build word-to-index vocab. Indices are assigned on first occurrence so
    # they stay contiguous (0..vocab_size-1) even if the vocabulary file lists
    # a word more than once; enumerate() would leave indices >= vocab_size.
    word2idx = {}
    for word in vocab_words:
        word2idx.setdefault(word, len(word2idx))
    num_docs = len(tokenized_docs)
    vocab_size = len(word2idx)

    # Counts are stored as uint16; clamp so an extreme count cannot overflow.
    max_tf = int(np.iinfo(np.uint16).max)

    # Build sparse matrix components (COO triplets)
    data, rows, cols = [], [], []
    for doc_id, tokens in enumerate(tokenized_docs):
        for word, freq in Counter(tokens).items():
            col = word2idx.get(word)
            if col is None:
                continue  # out-of-vocabulary token
            rows.append(doc_id)
            cols.append(col)
            data.append(min(freq, max_tf))

    # Build sparse TF matrix
    tf_matrix = csr_matrix((data, (rows, cols)), shape=(num_docs, vocab_size), dtype=np.uint16)

    # Save matrix and mappings
    save_npz(f"{output_prefix}_tf.npz", tf_matrix)
    with open(f"{output_prefix}_vocab_index.json", "w", encoding="utf-8") as f:
        json.dump(word2idx, f)

    print(f"✅ TF matrix saved to {output_prefix}_tf.npz")
    print(f"✅ Vocabulary index saved to {output_prefix}_vocab_index.json")

# Build and save the TF matrix for the tokenized training set, using the default "bm25" output prefix
compute_tf_matrix(
    corpus_csv="/kaggle/working/train_tokenized.csv",
    vocab_csv="vocabulary.csv",
)


✅ TF matrix saved to bm25_tf.npz
✅ Vocabulary index saved to bm25_vocab_index.json


In [19]:
import numpy as np
import pandas as pd
from scipy.sparse import load_npz
import json
from collections import Counter
from tqdm import tqdm

def load_bm25_components(
    tf_path="/kaggle/input/BM25/bm25_tf.npz",
    vocab_path="/kaggle/input/BM25/bm25_vocab_index.json",
):
    """Load the saved term-frequency matrix and vocabulary index.

    Args:
        tf_path: Path to the ``.npz`` file holding the sparse TF matrix.
        vocab_path: Path to the JSON file mapping each word to its column index.

    Returns:
        A ``(tf_matrix, word2idx)`` tuple: the sparse matrix returned by
        ``load_npz`` and the dict decoded from the JSON file.
    """
    tf_matrix = load_npz(tf_path)
    # Read as UTF-8 explicitly so the result does not depend on the platform's
    # default encoding (the vocabulary contains non-ASCII words).
    with open(vocab_path, "r", encoding="utf-8") as f:
        word2idx = json.load(f)
    return tf_matrix, word2idx

def compute_idf(tf_matrix, num_docs):
    """Return one BM25 inverse-document-frequency weight per matrix column.

    The document frequency of a column is the number of rows with a non-zero
    entry in it. The weight is ``log((N - df + 0.5) / (df + 0.5) + 1)`` with
    ``N = num_docs``.
    """
    _, term_ids = tf_matrix.nonzero()
    # minlength keeps a slot for columns that never occur in any row.
    doc_freq = np.bincount(term_ids, minlength=tf_matrix.shape[1])
    absent = num_docs - doc_freq + 0.5
    present = doc_freq + 0.5
    return np.log(absent / present + 1.0)

def compute_doc_lengths(tf_matrix):
    """Return the sum of each row of ``tf_matrix`` as a flat 1-D NumPy array."""
    row_totals = tf_matrix.sum(axis=1)
    return np.asarray(row_totals).ravel()

def bm25_score(query_tokens, tf_matrix, idf, doc_lengths, avg_doc_len, word2idx, k1=1.5, b=0.75):
    """Score every row of ``tf_matrix`` against a query with BM25.

    Args:
        query_tokens: Iterable of query tokens. Each distinct token contributes
            once; repeated tokens do not add extra weight.
        tf_matrix: Sparse matrix of term counts, indexed ``[row, term column]``.
        idf: Array of per-column IDF weights.
        doc_lengths: Array with one length per row of ``tf_matrix``.
        avg_doc_len: Mean of ``doc_lengths``.
        word2idx: Mapping from token to column index; unknown tokens are skipped.
        k1: Term-frequency saturation parameter.
        b: Length-normalisation strength.

    Returns:
        A float array with one score per row of ``tf_matrix``.
    """
    num_docs = tf_matrix.shape[0]
    scores = np.zeros(num_docs)

    # The length-normalisation term does not depend on the query term, so it
    # is computed once instead of once per term.
    if avg_doc_len > 0:
        length_norm = k1 * (1 - b + b * doc_lengths / avg_doc_len)
    else:
        # Empty or all-zero matrix: dividing by avg_doc_len would give NaN
        # scores, so fall back to a neutral length ratio of 1.
        length_norm = np.full(num_docs, float(k1))

    # dict.fromkeys de-duplicates while keeping first-seen order.
    for term in dict.fromkeys(query_tokens):
        term_idx = word2idx.get(term)
        if term_idx is None:
            continue
        term_tf = tf_matrix[:, term_idx].toarray().ravel()
        scores += idf[term_idx] * (term_tf * (k1 + 1)) / (term_tf + length_norm)
    return scores

def evaluate_bm25(test_set_path, full_corpus_path, k_values=(5, 10, 20)):
    """Evaluate BM25 retrieval and report MRR and Accuracy@k.

    Row ``i`` of the loaded TF matrix is treated as the document in row ``i``
    of ``full_corpus_path``. For each test query the rank is the position of
    the best-scoring row whose ``cid`` equals the query's ``cid``. Queries
    whose ``cid`` does not appear in the corpus file are skipped.

    Args:
        test_set_path: CSV with ``question_tokenized`` (whitespace-separated
            tokens) and ``cid`` columns.
        full_corpus_path: CSV with a ``cid`` column, in the same row order as
            the TF matrix.
        k_values: Cut-offs for Accuracy@k.

    Returns:
        ``(mrr, acc_at_k)`` where ``acc_at_k`` maps each k to a percentage.
    """
    test_df = pd.read_csv(test_set_path)
    full_df = pd.read_csv(full_corpus_path)

    tf_matrix, word2idx = load_bm25_components()
    num_docs = tf_matrix.shape[0]
    idf = compute_idf(tf_matrix, num_docs)
    doc_lengths = compute_doc_lengths(tf_matrix)
    avg_doc_len = np.mean(doc_lengths)

    # bm25_score slices one column per query term; convert once to a
    # column-oriented layout so those slices are cheap for every query.
    tf_by_term = tf_matrix.tocsc()

    cids = full_df["cid"].values
    known_cids = set(cids)

    reciprocal_ranks = []
    correct_at_k = {k: 0 for k in k_values}
    total_queries = 0

    queries = zip(test_df["question_tokenized"], test_df["cid"])
    for question, correct_cid in tqdm(queries, desc="Evaluating queries", total=len(test_df)):
        query_tokens = question.split()
        if correct_cid not in known_cids:
            continue

        scores = bm25_score(query_tokens, tf_by_term, idf, doc_lengths, avg_doc_len, word2idx)
        ranked_cids = cids[np.argsort(-scores)]

        # Position of the first ranked row carrying the correct cid, found
        # with a vectorised comparison instead of building a Python list of
        # every cid and calling list.index on it.
        hits = np.flatnonzero(ranked_cids == correct_cid)
        if hits.size:
            rank = int(hits[0]) + 1
            reciprocal_ranks.append(1.0 / rank)
            for k in k_values:
                if rank <= k:
                    correct_at_k[k] += 1
        else:
            # Only reachable when the matrix has fewer rows than the corpus file.
            reciprocal_ranks.append(0.0)

        total_queries += 1

    mrr = np.mean(reciprocal_ranks) if reciprocal_ranks else 0
    acc_at_k = {k: (correct_at_k[k] / total_queries) * 100 if total_queries > 0 else 0 for k in k_values}

    print(f"\nMean Reciprocal Rank (MRR): {mrr:.4f}")
    for k in k_values:
        print(f"Accuracy@{k}: {acc_at_k[k]:.2f}%")

    return mrr, acc_at_k

# Evaluate on the held-out questions against the tokenized training corpus.
evaluate_bm25("/kaggle/input/test_set.csv", "/kaggle/input/train_tokenized.csv", k_values=[5, 10, 20])


Evaluating queries:   0%|          | 103/23892 [00:33<2:09:43,  3.06it/s]


KeyboardInterrupt: 

In [None]:
import numpy as np
import pandas as pd
from scipy.sparse import load_npz
import json
from collections import Counter
from underthesea import word_tokenize

# Load the precomputed BM25 artefacts: the sparse TF matrix and the
# token -> column-index mapping.
tf_matrix = load_npz("/kaggle/input/BM25/bm25_tf.npz")
with open("/kaggle/input/BM25/bm25_vocab_index.json", "r") as vocab_file:
    word2idx = json.load(vocab_file)

# One matrix row per document.
num_docs = tf_matrix.shape[0]

def compute_idf(tf_matrix, num_docs):
    """Return one BM25 inverse-document-frequency weight per matrix column.

    The document frequency of a column is the number of rows with a non-zero
    entry in it. The weight is ``log((N - df + 0.5) / (df + 0.5) + 1)`` with
    ``N = num_docs``.
    """
    _, term_ids = tf_matrix.nonzero()
    # minlength keeps a slot for columns that never occur in any row.
    doc_freq = np.bincount(term_ids, minlength=tf_matrix.shape[1])
    absent = num_docs - doc_freq + 0.5
    present = doc_freq + 0.5
    return np.log(absent / present + 1.0)

def compute_doc_lengths(tf_matrix):
    """Return the sum of each row of ``tf_matrix`` as a flat 1-D NumPy array."""
    row_totals = tf_matrix.sum(axis=1)
    return np.asarray(row_totals).ravel()

def bm25_score(query_tokens, tf_matrix, idf, doc_lengths, avg_doc_len, word2idx, k1=1.5, b=0.75):
    """Score every row of ``tf_matrix`` against a query with BM25.

    Args:
        query_tokens: Iterable of query tokens. Each distinct token contributes
            once; repeated tokens do not add extra weight.
        tf_matrix: Sparse matrix of term counts, indexed ``[row, term column]``.
        idf: Array of per-column IDF weights.
        doc_lengths: Array with one length per row of ``tf_matrix``.
        avg_doc_len: Mean of ``doc_lengths``.
        word2idx: Mapping from token to column index; unknown tokens are skipped.
        k1: Term-frequency saturation parameter.
        b: Length-normalisation strength.

    Returns:
        A float array with one score per row of ``tf_matrix``.
    """
    num_docs = tf_matrix.shape[0]
    scores = np.zeros(num_docs)

    # The length-normalisation term does not depend on the query term, so it
    # is computed once instead of once per term.
    if avg_doc_len > 0:
        length_norm = k1 * (1 - b + b * doc_lengths / avg_doc_len)
    else:
        # Empty or all-zero matrix: dividing by avg_doc_len would give NaN
        # scores, so fall back to a neutral length ratio of 1.
        length_norm = np.full(num_docs, float(k1))

    # dict.fromkeys de-duplicates while keeping first-seen order.
    for term in dict.fromkeys(query_tokens):
        term_idx = word2idx.get(term)
        if term_idx is None:
            continue
        term_tf = tf_matrix[:, term_idx].toarray().ravel()
        scores += idf[term_idx] * (term_tf * (k1 + 1)) / (term_tf + length_norm)
    return scores

# Precompute IDF and doc lengths
idf = compute_idf(tf_matrix, num_docs)
doc_lengths = compute_doc_lengths(tf_matrix)
avg_doc_len = np.mean(doc_lengths)

# Load corpus and map cid to tokenized text
df_corpus = pd.read_csv("/kaggle/input/corpus_tokenized.csv", usecols=["cid", "context_tokenized"])

# Scores are indexed by TF-matrix row, and the matrix has one row per CSV row.
# Map row position -> cid WITHOUT dropping duplicates first: de-duplicating
# shifts every index after the first repeated cid, so rows would be labelled
# with the wrong document.
index_to_cid = dict(enumerate(df_corpus["cid"]))
# For a repeated cid keep its first row (reversed iteration lets the earliest
# row overwrite the later ones).
cid_to_index = {cid: idx for idx, cid in reversed(index_to_cid.items())}
cid_to_text = df_corpus.drop_duplicates("cid").set_index("cid")["context_tokenized"].to_dict()

# A row-count mismatch means the TF matrix was not built from this file, in
# which case the row -> cid mapping cannot be trusted.
if len(index_to_cid) != num_docs:
    print(
        f"WARNING: corpus has {len(index_to_cid)} rows but the TF matrix has {num_docs}; "
        "retrieved cids may not match the scored documents."
    )
del df_corpus

# ---------- BM25 Inference Function ----------

def bm25_infer(question_text, top_k=10):
    """Return the ``top_k`` highest-scoring documents for a raw question.

    The question is segmented with ``word_tokenize(..., format="text")`` and
    split on whitespace, then scored against the module-level BM25 state
    (``tf_matrix``, ``idf``, ``doc_lengths``, ``avg_doc_len``, ``word2idx``).

    Returns:
        A list of dicts with keys ``rank``, ``cid``, ``score`` and ``context``,
        best first. Rows with no entry in ``index_to_cid`` are left out, so
        ranks in the result may have gaps.
    """
    segmented = word_tokenize(question_text, format="text")
    scores = bm25_score(segmented.split(), tf_matrix, idf, doc_lengths, avg_doc_len, word2idx)
    best_rows = np.argsort(-scores)[:top_k]

    hits = []
    for position, row in enumerate(best_rows, start=1):
        cid = index_to_cid.get(row, -1)
        if cid == -1:
            # -1 is the "no cid for this row" sentinel.
            continue
        hits.append(
            {
                "rank": position,
                "cid": cid,
                "score": float(scores[row]),
                "context": cid_to_text.get(cid, "Not found"),
            }
        )
    return hits

# ---------- Example Usage ----------
question = "Người lao động được nghỉ bao nhiêu ngày phép trong năm?"
top_results = bm25_infer(question, top_k=10)

# One header line per hit, then its context and a blank separator line.
for res in top_results:
    print(
        f"[{res['rank']}] CID: {res['cid']} | Score: {res['score']:.4f}",
        f"Context: {res['context']}\n",
        sep="\n",
    )


Loading BM25 components...
Loaded TF matrix with shape: (133568, 47872)
Vocabulary size: 47872
Number of documents: 133568
Loading document corpus...

Example query: 'nội dung lồng ghép vấn đề bình đẳng giới trong xây dựng văn bản quy phạm pháp luật được quy định thế nào'
Retrieving top 20 documents...
Tokenized query: nội_dung lồng_ghép vấn_đề bình_đẳng giới trong xây_dựng văn_bản quy_phạm_pháp_luật được quy_định thế_nào

Results for 'nội dung lồng ghép vấn đề bình đẳng giới trong xây dựng văn bản quy phạm pháp luật được quy định thế nào':

[1] Document ID: 137189 (Score: 65.2875)
Text: thẻ hội_viên ban_chấp_hành trung_ương hội quy_định việc cấp_phát quản_lý và thu_hồi thẻ hội_viên

[3] Document ID: 46208 (Score: 56.0944)
Text: điều_độ tuổi thời_hạn thực_hiện nghĩa_vụ tham_gia dân_quân tự_vệ trong thời_bình công_dân nam từ đủ tuổi đến hết tuổi công_dân nữ từ đủ tuổi đến hết tuổi có nghĩa_vụ tham_gia dân_quân tự_vệ nếu tình_nguyện tham_gia dân_quân tự_vệ thì có_thể kéo_dài đến hết tuổi