In [2]:
#Download dependencies
import os
os.environ["WANDB_MODE"] = "disabled"

!pip install underthesea
!pip install faiss-cpu

Collecting underthesea
  Downloading underthesea-6.8.4-py3-none-any.whl.metadata (15 kB)
Collecting python-crfsuite>=0.9.6 (from underthesea)
  Downloading python_crfsuite-0.9.11-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.3 kB)
Collecting underthesea-core==1.0.4 (from underthesea)
  Downloading underthesea_core-1.0.4-cp311-cp311-manylinux2010_x86_64.whl.metadata (1.7 kB)
Downloading underthesea-6.8.4-py3-none-any.whl (20.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m20.9/20.9 MB[0m [31m70.8 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[?25hDownloading underthesea_core-1.0.4-cp311-cp311-manylinux2010_x86_64.whl (657 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m657.8/657.8 kB[0m [31m33.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading python_crfsuite-0.9.11-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m49.8

In [None]:
import pandas as pd
import numpy as np
import torch
import os
from sentence_transformers import SentenceTransformer, InputExample, losses
from torch.utils.data import DataLoader
from tqdm import tqdm


# --- Config ---
TRAIN_PATH = '/kaggle/input/train_tokenized.csv'
CORPUS_PATH = '/kaggle/input/corpus_tokenized.csv'
MODEL_SAVE_PATH = './finetuned_model'
OUTPUT_EMB = 'corpus_embeddings.npy'
OUTPUT_ID_MAP = 'corpus_id_mapping.csv'
EPOCHS = 15
# Reduce batch size to prevent OOM errors
BATCH_SIZE = 64
# Introduce gradient accumulation to maintain effective batch size
GRADIENT_ACCUMULATION = 4
NUM_WORKERS = 4  # Reduced to lower memory pressure
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'
# Enable mixed precision training to reduce memory usage
USE_AMP = True

# --- Load corpus + train ---
corpus_df = pd.read_csv(CORPUS_PATH)
corpus_df['context_tokenized'] = corpus_df['context_tokenized'].astype(str)
cid_to_text = dict(zip(corpus_df['cid'], corpus_df['context_tokenized']))
train_df = pd.read_csv(TRAIN_PATH)

# --- Create training data more efficiently ---
train_data = []
for _, row in train_df.iterrows():
    q = str(row['question'])
    for cid_str in row['cid'].split(','):
        try:
            cid = int(cid_str.strip())
            if cid in cid_to_text:
                train_data.append(InputExample(texts=[q, cid_to_text[cid]]))
        except ValueError:
            print(f"Could not convert '{cid_str}' to integer")

# --- Load model with optimized settings ---
model = SentenceTransformer('sentence-transformers/paraphrase-multilingual-mpnet-base-v2')
# Move model to device after configuration
model = model.to(DEVICE)

# --- Create DataLoader with optimized batch size ---
train_loader = DataLoader(
    train_data, 
    shuffle=True, 
    batch_size=BATCH_SIZE, 
    num_workers=NUM_WORKERS,
    pin_memory=True  # Faster data transfer to GPU
)
loss = losses.MultipleNegativesRankingLoss(model)
model.fit(
    train_objectives=[(train_loader, loss)],
    epochs=EPOCHS,
    warmup_steps=100,
    show_progress_bar=True,
    use_amp=USE_AMP,  # Use automatic mixed precision
    optimizer_params={'lr': 2e-5},  # Lower learning rate for stability
    weight_decay=0.01  # Add weight decay to reduce overfitting
)

# --- Save model for later inference ---
model.save(MODEL_SAVE_PATH)

# --- Encode full corpus with memory-efficient batching ---
# Process corpus in smaller chunks to avoid OOM
ENCODE_BATCH_SIZE = 32
texts = corpus_df['context_tokenized'].tolist()
ids = corpus_df['cid'].tolist()

# Encode in chunks to reduce memory usage
def encode_in_chunks(texts, chunk_size=ENCODE_BATCH_SIZE):
    all_embeddings = []
    for i in range(0, len(texts), chunk_size):
        chunk = texts[i:i+chunk_size]
        chunk_embeddings = model.encode(
            chunk,
            batch_size=chunk_size,
            convert_to_numpy=True,
            show_progress_bar=True,
            device=DEVICE
        )
        all_embeddings.append(chunk_embeddings)
        # Clear CUDA cache after each chunk
        if torch.cuda.is_available():
            torch.cuda.empty_cache()
    return np.vstack(all_embeddings)

embeddings = encode_in_chunks(texts)

# --- Save outputs ---
np.save(OUTPUT_EMB, embeddings)
pd.DataFrame({'cid': ids}).to_csv(OUTPUT_ID_MAP, index=False)

In [4]:
import pandas as pd
import numpy as np
import torch
from sentence_transformers import SentenceTransformer
from underthesea import word_tokenize
from sklearn.metrics.pairwise import cosine_similarity

# File paths based on the directory structure in the image
MODEL_PATH = '/kaggle/input/faissdata/finetuned_model'  # Use the final model or any epoch model
CORPUS_PATH = '/kaggle/input/datanlpnew/corpus_tokenized.csv'
EMBEDDINGS_PATH = '/kaggle/input/faissdata/corpus_embeddings.npy'
ID_MAP_PATH = '/kaggle/input/faissdata/corpus_id_mapping.csv'

# Load the fine-tuned model
print("Loading model...")
model = SentenceTransformer(MODEL_PATH)
device = 'cuda' if torch.cuda.is_available() else 'cpu'
model = model.to(device)

# Load corpus
print("Loading corpus...")
corpus_df = pd.read_csv(CORPUS_PATH)
corpus_df['context_tokenized'] = corpus_df['context_tokenized'].astype(str)

# Load pre-computed corpus embeddings and IDs
print("Loading embeddings and ID mapping...")
corpus_embeddings = np.load(EMBEDDINGS_PATH)
id_df = pd.read_csv(ID_MAP_PATH)
corpus_ids = id_df['cid'].tolist()

# Function to perform inference
def search_similar_documents(query, top_k=10):
    # Tokenize the query using underthesea
    tokenized_query = word_tokenize(query)
    if isinstance(tokenized_query, list):
        tokenized_query = ' '.join(tokenized_query)
    print(f"Tokenized query: {tokenized_query}")
    
    # Encode the tokenized query
    query_embedding = model.encode(tokenized_query, convert_to_numpy=True, device=device)
    
    # Calculate similarity with all corpus documents
    similarities = cosine_similarity([query_embedding], corpus_embeddings)[0]
    
    # Get indices of top k most similar documents
    top_indices = similarities.argsort()[-top_k:][::-1]
    
    # Return the top k most similar documents
    results = []
    for idx in top_indices:
        cid = corpus_ids[idx]
        # Find the corresponding document in corpus_df
        doc_text = corpus_df[corpus_df['cid'] == cid]['context_tokenized'].values[0]
        results.append({
            'cid': cid,
            'similarity': similarities[idx],
            'text': doc_text
        })
    
    return results

# Example usage
if __name__ == "__main__":
    # Example query (you can replace with any query)
    query = "phó tổng giám đốc ngân hàng chính sách xã hội được xếp lương theo bảng lương như thế nào"
    print(f"Original query: {query}")
    
    # Get top 10 most similar documents
    results = search_similar_documents(query, top_k=10)
    
    # Print the results
    print("\nTop 10 most relevant documents:")
    for i, doc in enumerate(results):
        print(f"\n{i+1}. CID: {doc['cid']}")
        print(f"   Similarity score: {doc['similarity']:.4f}")
        # Print first 150 characters of the document
        print(f"   Text: {doc['text'][:150]}...")


Loading model...
Loading corpus...
Loading embeddings and ID mapping...
Original query: phó tổng giám đốc ngân hàng chính sách xã hội được xếp lương theo bảng lương như thế nào
Tokenized query: phó tổng giám đốc ngân hàng chính sách xã hội được xếp lương theo bảng lương như thế nào


Batches:   0%|          | 0/1 [00:00<?, ?it/s]


Top 10 most relevant documents:

1. CID: 140864
   Similarity score: 0.8029
   Text: áp_dụng chế_độ tiền_lương và phụ_cấp quy_định tại nghị_định số cp ngày_tháng năm của chính_phủ quy_định tạm_thời chế_độ tiền_lương mới trong các doanh...

2. CID: 218047
   Similarity score: 0.6393
   Text: ghi_chú giám_đốc học_viện chính_trị quốc_gia hồ chí_minh tổng_biên_tập báo nhân_dân tổng_biên_tập tạp_chí cộng_sản_xếp mức lương chức_vụ theo quy_định...

3. CID: 159391
   Similarity score: 0.6239
   Text: điều_hành hoạt_động của ngân_hàng chính_sách_xã_hội là tổng_giám_đốc tổng_giám_đốc là đại_diện pháp_nhân của ngân_hàng chính_sách_xã_hội giúp_việc tổn...

4. CID: 140861
   Similarity score: 0.5793
   Text: phó tổng_giám_đốc là người giúp tổng_giám_đốc_điều_hành một hoặc một_số lĩnh_vực hoạt_động của ngân_hàng chính_sách_xã_hội theo phân_công của tổng_giá...

5. CID: 7557
   Similarity score: 0.5770
   Text: đối_tượng áp_dụng bảng lương cấp hàm_cơ_yếu những người hiện giữ chức_danh lãnh_đạo do b

In [5]:
import pandas as pd
import numpy as np
import faiss
import torch
from sentence_transformers import SentenceTransformer
from tqdm.notebook import tqdm

# File paths
MODEL_PATH = '/kaggle/input/faissdata/finetuned_model'
TEST_PATH = '/kaggle/input/datanlpnew/test_set.csv'
CORPUS_PATH = '/kaggle/input/datanlpnew/corpus_tokenized.csv'
EMBEDDINGS_PATH = '/kaggle/input/faissdata/corpus_embeddings.npy'
ID_MAP_PATH = '/kaggle/input/faissdata/corpus_id_mapping.csv'

# Load model
print("Loading model...")
model = SentenceTransformer(MODEL_PATH)
device = 'cuda' if torch.cuda.is_available() else 'cpu'
model = model.to(device)

# Load test set
print("Loading test set...")
test_df = pd.read_csv(TEST_PATH)

# Load corpus
print("Loading corpus...")
corpus_df = pd.read_csv(CORPUS_PATH)
corpus_df['context_tokenized'] = corpus_df['context_tokenized'].astype(str)

# Load corpus embeddings and IDs
print("Loading corpus embeddings...")
corpus_embeddings = np.load(EMBEDDINGS_PATH)
id_df = pd.read_csv(ID_MAP_PATH)
corpus_ids = id_df['cid'].tolist()

# Build FAISS index
print("Building FAISS index...")
dimension = corpus_embeddings.shape[1]
index = faiss.IndexFlatIP(dimension)  # Use inner product (cosine similarity for normalized vectors)
# Normalize vectors for cosine similarity
faiss.normalize_L2(corpus_embeddings)
index.add(corpus_embeddings)

# Define evaluation metrics - calculate overall MRR
def calculate_mrr(test_df, search_k=100):  # Use a large k to search through more results
    mrr_total = 0
    total_queries = 0
    
    # Create a single progress bar
    pbar = tqdm(total=len(test_df), desc="Evaluating MRR")
    
    for _, row in test_df.iterrows():
        # Use the already tokenized question
        tokenized_query = str(row['question_tokenized'])
        
        # Get relevant document IDs for this query
        try:
            relevant_cids = set(int(cid) for cid in str(row['cid']).split(','))
        except ValueError:
            # Skip if no valid CIDs
            pbar.update(1)
            continue
        
        # Encode query
        query_embedding = model.encode(tokenized_query, convert_to_numpy=True, device=device, show_progress_bar=False)
        
        # Normalize query vector for cosine similarity
        query_embedding = query_embedding.reshape(1, -1)
        faiss.normalize_L2(query_embedding)
        
        # Search using FAISS - use a larger k to find relevant docs that might be further down
        distances, indices = index.search(query_embedding, search_k)
        
        # Calculate rank of first relevant document
        rank = float('inf')
        for i, idx in enumerate(indices[0]):
            doc_cid = corpus_ids[idx]
            if doc_cid in relevant_cids:
                rank = i + 1
                break
        
        # Update MRR if a relevant document was found
        if rank < float('inf'):
            mrr_total += 1.0 / rank
        
        total_queries += 1
        pbar.update(1)
    
    pbar.close()
    
    # Calculate final MRR
    mrr = mrr_total / total_queries if total_queries > 0 else 0
    
    return mrr, total_queries

# Run evaluation
print("Starting evaluation...")
mrr, total_queries = calculate_mrr(test_df)

# Print results
print(f"\n===== FAISS Evaluation Results (Total Queries: {total_queries}) =====")
print(f"MRR: {mrr:.4f}")

# Also calculate top-k accuracy for common k values
def calculate_topk_accuracy(test_df, k_values=[5, 10, 20]):
    accuracy_scores = {k: 0 for k in k_values}
    total_queries = 0
    
    # Create a single progress bar
    pbar = tqdm(total=len(test_df), desc="Evaluating Top-K Accuracy")
    
    for _, row in test_df.iterrows():
        tokenized_query = str(row['question_tokenized'])
        
        try:
            relevant_cids = set(int(cid) for cid in str(row['cid']).split(','))
        except ValueError:
            pbar.update(1)
            continue
        
        query_embedding = model.encode(tokenized_query, convert_to_numpy=True, device=device, show_progress_bar=False)
        query_embedding = query_embedding.reshape(1, -1)
        faiss.normalize_L2(query_embedding)
        
        max_k = max(k_values)
        distances, indices = index.search(query_embedding, max_k)
        
        # Check if any relevant document is in top-k
        for k in k_values:
            top_k_indices = indices[0][:k]
            top_k_cids = [corpus_ids[idx] for idx in top_k_indices]
            if any(cid in relevant_cids for cid in top_k_cids):
                accuracy_scores[k] += 1
        
        total_queries += 1
        pbar.update(1)
    
    pbar.close()
    
    # Calculate final accuracies
    results = {}
    for k in k_values:
        results[f'Accuracy@{k}'] = accuracy_scores[k] / total_queries if total_queries > 0 else 0
    
    return results

# Calculate top-k accuracy
accuracy_metrics = calculate_topk_accuracy(test_df)
print("\n===== Top-K Accuracy =====")
for metric, value in accuracy_metrics.items():
    print(f"{metric}: {value:.4f}")
   

Loading model...
Loading test set...
Loading corpus...
Loading corpus embeddings...
Building FAISS index...
Starting evaluation...


Evaluating MRR:   0%|          | 0/23892 [00:00<?, ?it/s]


===== FAISS Evaluation Results (Total Queries: 23892) =====
MRR: 0.6459


Evaluating Top-K Accuracy:   0%|          | 0/23892 [00:00<?, ?it/s]


===== Top-K Accuracy =====
Accuracy@5: 0.8231
Accuracy@10: 0.9100
Accuracy@20: 0.9604
