In [15]:
!pip install underthesea
import os
os.environ["WANDB_MODE"] = "disabled"


Collecting underthesea
  Downloading underthesea-6.8.4-py3-none-any.whl.metadata (15 kB)
Collecting python-crfsuite>=0.9.6 (from underthesea)
  Downloading python_crfsuite-0.9.11-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.3 kB)
Collecting underthesea-core==1.0.4 (from underthesea)
  Downloading underthesea_core-1.0.4-cp311-cp311-manylinux2010_x86_64.whl.metadata (1.7 kB)
Downloading underthesea-6.8.4-py3-none-any.whl (20.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m20.9/20.9 MB[0m [31m83.5 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[?25hDownloading underthesea_core-1.0.4-cp311-cp311-manylinux2010_x86_64.whl (657 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m657.8/657.8 kB[0m [31m34.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading python_crfsuite-0.9.11-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m52.0

In [None]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from tqdm.notebook import tqdm
from scipy.sparse import csr_matrix, save_npz

# === CONFIG ===
VOCAB_PATH = '/kaggle/input/datanlpnew/vocabulary.csv'  # word, idf, pos
TRAIN_PATH = '/kaggle/input/datanlpnew/train_set.csv'
SAVE_PATH = './bm42_attention_weights.npz'
EPOCHS = 21

# --- Load vocab ---
vocab_df = pd.read_csv(VOCAB_PATH)
vocab = vocab_df['word'].tolist()
idf = vocab_df['idf'].values
pos = vocab_df['pos'].tolist()
word2idx = {w: i for i, w in enumerate(vocab)}
vocab_size = len(vocab)

# POS multiplier
pos_multiplier = np.array([1.5 if p in ('N', 'V') else 1.0 for p in pos], dtype=np.float32)

# --- Model ---
class BM42Attn(nn.Module):
    def __init__(self, init_weights):
        super().__init__()
        self.attn = nn.Parameter(torch.tensor(init_weights, dtype=torch.float32))
    def forward(self, idxs):
        return self.attn[idxs]

# --- Data ---
def get_doc_tokens(row, col, vocab_set):
    return [w for w in str(row[col]).split() if w in vocab_set]

def get_q_idxs(row, col, word2idx):
    return [word2idx[w] for w in str(row[col]).split() if w in word2idx]

def get_relevant_cids(row):
    return [int(cid) for cid in str(row['cid']).split(',') if cid.strip().isdigit()]

train_df = pd.read_csv(TRAIN_PATH)
vocab_set = set(vocab)

# Precompute all doc tokens for fast negatives (train only)
doc_tokens = {}
for _, row in train_df.iterrows():
    doc_tokens[int(str(row['cid']).split(',')[0])] = get_doc_tokens(row, 'context_tokenized', vocab_set)

# --- Training ---
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
init_weights = np.clip(np.ones(vocab_size, dtype=np.float32), 0.01, 1.99)
model = BM42Attn(init_weights).to(device)
optimizer = optim.Adam([model.attn], lr=0.03)

def bm42_score(q_idxs, doc, model, idf, pos_multiplier):
    doc_set = set(doc)
    idxs_in_doc = [i for i in q_idxs if vocab[i] in doc_set]
    idxs_in_doc_tensor = torch.tensor(idxs_in_doc, dtype=torch.long, device=device)
    if len(idxs_in_doc_tensor) == 0:
        return (model.attn[0:0] * torch.tensor([], dtype=torch.float32, device=device)).sum()
    attn = model(idxs_in_doc_tensor)
    idf_tensor = torch.tensor(idf[idxs_in_doc], dtype=torch.float32, device=device)
    pos_mult_tensor = torch.tensor(pos_multiplier[idxs_in_doc], dtype=torch.float32, device=device)
    return (attn * idf_tensor * pos_mult_tensor).sum()

for epoch in range(EPOCHS):
    model.train()
    losses = []
    for _, row in tqdm(train_df.iterrows(), total=len(train_df), desc=f"Epoch {epoch+1}"):
        q_idxs = get_q_idxs(row, 'question_tokenized', word2idx)
        if not q_idxs:
            continue
        rel_cids = get_relevant_cids(row)
        if not rel_cids:
            continue
        pos_doc = get_doc_tokens(row, 'context_tokenized', vocab_set)
        # Negative: sample a random doc not in rel_cids
        neg_cid = np.random.choice([cid for cid in doc_tokens if cid not in rel_cids])
        neg_doc = doc_tokens[neg_cid]
        pos_score = bm42_score(q_idxs, pos_doc, model, idf, pos_multiplier)
        neg_score = bm42_score(q_idxs, neg_doc, model, idf, pos_multiplier)
        margin = 1.0
        loss = torch.clamp(margin - (pos_score - neg_score), min=0)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        with torch.no_grad():
            model.attn.data.clamp_(0.01, 1.99)
        losses.append(loss.detach().cpu().item())
    print(f"Epoch {epoch+1} mean loss: {np.mean(losses):.4f}")
    # Decay learning rate every 3 epochs
    if (epoch + 1) % 3 == 0:
        for param_group in optimizer.param_groups:
            param_group['lr'] *= 0.9
        print(f"Learning rate decayed to {optimizer.param_groups[0]['lr']:.6f}")
    # Save (overwrite) after every epoch
    weights_np = model.attn.detach().cpu().numpy()
    sparse_weights = csr_matrix(weights_np.reshape(1,-1))
    save_npz(SAVE_PATH, sparse_weights)
    print(f"Saved model to {SAVE_PATH}")

print("Training complete.")

In [12]:
import pandas as pd
import numpy as np
from scipy.sparse import load_npz, csr_matrix
from tqdm.notebook import tqdm
from collections import defaultdict

# File paths
TEST_PATH = '/kaggle/input/datanlpnew/test_set.csv'
CORPUS_PATH = '/kaggle/input/datanlpnew/corpus_tokenized.csv'
VOCAB_PATH = '/kaggle/input/datanlpnew/vocabulary.csv'
BM42_WEIGHTS_PATH = '/kaggle/input/bm4222/bm42_attention_weights.npz'

# Load test set, corpus, and vocabulary
print("Loading data...")
test_df = pd.read_csv(TEST_PATH)
corpus_df = pd.read_csv(CORPUS_PATH)
vocab_df = pd.read_csv(VOCAB_PATH)

# Load BM42 attention weights
print("Loading BM42 weights...")
bm42_weights = load_npz(BM42_WEIGHTS_PATH).toarray()[0]  # Convert from sparse to dense

# Create word-to-index mapping and get IDF values
vocab = vocab_df['word'].tolist()
idf = vocab_df['idf'].values
pos = vocab_df['pos'].tolist()
word2idx = {w: i for i, w in enumerate(vocab)}

# Apply POS multiplier to attention weights
pos_multiplier = np.array([1.5 if p in ('N', 'V') else 1.0 for p in pos], dtype=np.float32)
attention_weights = bm42_weights * pos_multiplier

# Pre-compute weighted IDF values
weighted_idf = attention_weights * idf

# ---- OPTIMIZATION 1: Create inverted index ----
print("Building inverted index...")
inverted_index = defaultdict(list)
corpus_docs = {}
corpus_ids = []

for _, row in tqdm(corpus_df.iterrows(), total=len(corpus_df)):
    cid = int(row['cid'])
    corpus_ids.append(cid)
    tokens = str(row['context_tokenized']).split()
    # Filter and keep only tokens in vocabulary
    valid_tokens = [t for t in tokens if t in word2idx]
    corpus_docs[cid] = set(valid_tokens)  # Store as set for O(1) lookups
    
    # Build inverted index: for each token, store which documents contain it
    for token in set(valid_tokens):  # Use set to avoid duplicates
        inverted_index[token].append(cid)

# ---- OPTIMIZATION 2: Vectorized BM42 scoring ----
def fast_bm42_evaluate(test_df, corpus_docs, inverted_index, weighted_idf, word2idx, k_values=[5, 10, 20]):
    mrr_total = 0.0
    accuracy = {k: 0 for k in k_values}
    total_queries = 0
    
    for _, row in tqdm(test_df.iterrows(), total=len(test_df), desc="Evaluating"):
        query_tokens = str(row['question_tokenized']).split()
        query_tokens = [t for t in query_tokens if t in word2idx]
        
        if not query_tokens:
            continue
            
        # Get relevant document IDs for this query
        try:
            relevant_cids = set(int(cid) for cid in str(row['cid']).split(','))
        except ValueError:
            continue
        
        # ---- OPTIMIZATION 3: Only score documents that contain at least one query term ----
        candidate_docs = set()
        for token in query_tokens:
            if token in inverted_index:
                candidate_docs.update(inverted_index[token])
        
        # If no candidates found, skip this query
        if not candidate_docs:
            continue
            
        # Score only candidate documents (not the entire corpus)
        scores = []
        for cid in candidate_docs:
            score = 0.0
            doc_tokens = corpus_docs[cid]
            
            # Sum weighted IDF for matching tokens
            for token in query_tokens:
                if token in doc_tokens:
                    idx = word2idx[token]
                    score += weighted_idf[idx]
                    
            scores.append((cid, score))
            
        # Sort by score (descending)
        scores.sort(key=lambda x: x[1], reverse=True)
        
        # Calculate metrics
        rank = float('inf')
        for i, (cid, _) in enumerate(scores):
            if cid in relevant_cids:
                rank = i + 1
                break
                
        # Update MRR if a relevant document was found
        if rank < float('inf'):
            mrr_total += 1.0 / rank
            
        # Update accuracy@k
        for k in k_values:
            top_k_cids = [cid for cid, _ in scores[:k]]
            if any(cid in relevant_cids for cid in top_k_cids):
                accuracy[k] += 1
                
        total_queries += 1
    
    # Calculate final metrics
    mrr = mrr_total / total_queries if total_queries > 0 else 0
    accuracy = {k: acc / total_queries if total_queries > 0 else 0 for k, acc in accuracy.items()}
    
    return mrr, accuracy, total_queries

# Run evaluation with optimized function
print("Starting optimized evaluation...")
mrr, accuracy, total_queries = fast_bm42_evaluate(
    test_df, 
    corpus_docs, 
    inverted_index, 
    weighted_idf, 
    word2idx
)

# Print results
print(f"\n===== BM42 Evaluation Results (Total Queries: {total_queries}) =====")
print(f"MRR: {mrr:.4f}")
for k, acc in accuracy.items():
    print(f"Accuracy@{k}: {acc:.4f}")                                 

Loading data...
Loading BM42 weights...
Building inverted index...


  0%|          | 0/68663 [00:00<?, ?it/s]

Starting optimized evaluation...


Evaluating:   0%|          | 0/23892 [00:00<?, ?it/s]


===== BM42 Evaluation Results (Total Queries: 23892) =====
MRR: 0.3742
Accuracy@5: 0.5130
Accuracy@10: 0.6090
Accuracy@20: 0.6961


In [20]:
import pandas as pd
import numpy as np
from scipy.sparse import load_npz
from underthesea import word_tokenize
from collections import defaultdict

CORPUS_PATH = '/kaggle/input/datanlpnew/corpus_tokenized.csv'
VOCAB_PATH = '/kaggle/input/datanlpnew/vocabulary.csv'
BM42_WEIGHTS_PATH = '/kaggle/input/bm4222/bm42_attention_weights.npz'

corpus_df = pd.read_csv(CORPUS_PATH)
vocab_df = pd.read_csv(VOCAB_PATH)
bm42_weights = load_npz(BM42_WEIGHTS_PATH).toarray().flatten()

vocab = vocab_df['word'].tolist()
idf = vocab_df['idf'].values
pos = vocab_df['pos'].tolist()
word2idx = {w: i for i, w in enumerate(vocab)}
vocab_set = set(vocab)
pos_multiplier = np.array([1.5 if p in ('N', 'V') else 1.0 for p in pos], dtype=np.float32)
attention_weights = bm42_weights * pos_multiplier

inverted_index = defaultdict(list)
corpus_docs = {}
corpus_texts = {}

for _, row in corpus_df.iterrows():
    cid = int(row['cid'])
    tokens = str(row['context_tokenized']).split()
    valid_tokens = [t for t in tokens if t in vocab_set]
    corpus_docs[cid] = set(valid_tokens)
    corpus_texts[cid] = str(row['context_tokenized'])
    for token in set(valid_tokens):
        inverted_index[token].append(cid)

def search_documents(query, top_k=20):
    tokenized_query = word_tokenize(query)
    if isinstance(tokenized_query, list):
        tokenized_query = ' '.join(tokenized_query)
    print(f"Tokenized query: {tokenized_query}")
    query_tokens = [t for t in tokenized_query.split() if t in vocab_set]
    if not query_tokens:
        print("No query tokens found in vocabulary.")
        return []
    candidate_docs = set()
    for token in query_tokens:
        if token in inverted_index:
            candidate_docs.update(inverted_index[token])
    if not candidate_docs:
        print("No matching documents found.")
        return []
    scores = []
    for cid in candidate_docs:
        score = 0.0
        doc_tokens = corpus_docs[cid]
        for token in query_tokens:
            if token in doc_tokens:
                idx = word2idx[token]
                score += attention_weights[idx] * idf[idx]
        scores.append((cid, score))
    scores.sort(key=lambda x: x[1], reverse=True)
    results = []
    for cid, score in scores[:top_k]:
        results.append({
            'cid': cid,
            'score': score,
            'text': corpus_texts[cid]
        })
    return results

example_query = "phó tổng giám đốc ngân hàng chính sách xã hội được xếp lương theo bảng lương như thế nào"
print(f"\nExample query: '{example_query}'")
results = search_documents(example_query)
print(f"\nTop {len(results)} results:")
for i, doc in enumerate(results):
    print(f"\n{i+1}. Document ID: {doc['cid']}")
    print(f"   Score: {doc['score']:.4f}")
    preview = doc['text'][:200] + "..." if len(doc['text']) > 200 else doc['text']
    print(f"   Preview: {preview}")



Example query: 'phó tổng giám đốc ngân hàng chính sách xã hội được xếp lương theo bảng lương như thế nào'
Tokenized query: phó tổng giám đốc ngân hàng chính sách xã hội được xếp lương theo bảng lương như thế nào

Top 20 results:

1. Document ID: 8169
   Score: 42.1669
   Preview: điều phạm_vi và đối_tượng phạm_vi và đối_tượng áp_dụng cán_bộ công_chức viên_chức xếp lương theo bảng lương chuyên_gia cao_cấp các bảng lương chuyên_môn nghiệp_vụ thừa_hành phục_vụ làm_việc trong các ...

2. Document ID: 40804
   Score: 42.1669
   Preview: điều đối_tượng áp_dụng cán_bộ công_chức viên_chức người lao_động trong các cơ_quan tổ_chức đơn_vị của đảng nhà_nước tổ_chức chính_trị xã_hội từ trung_ương đến xã_phường thị_trấn sau đây gọi chung là c...

3. Document ID: 127461
   Score: 42.0707
   Preview: quan_điểm chỉ_đạo mục_tiêu và nội_dung cải_cách nội_dung cải_cách đối_với cán_bộ công_chức viên_chức và lực_lượng_vũ_trang khu_vực công_thiết_kế cơ_cấu tiền_lương mới gồm lương cơ_bản chiếm khoảng tổn...