In [1]:
!pip install gdown
!pip install -q pyvi



In [3]:
import torch

if torch.cuda.is_available():
    print("GPU is available!")
    print(f"Device name: {torch.cuda.get_device_name(0)}")
    print(f"Number of GPUs: {torch.cuda.device_count()}")
    device = "cuda"
else:
    print("GPU is NOT available, using CPU instead.")
    device = "cpu"

GPU is available!
Device name: Tesla P100-PCIE-16GB
Number of GPUs: 1


In [4]:
from sklearn.model_selection import train_test_split
import pandas as pd
import ast
from pyvi import ViTokenizer
from tqdm import tqdm
import numpy as np

# Word Segmentation for train set

In [None]:
tqdm.pandas()

df = pd.read_csv("/kaggle/input/soict-hackathon-2024-legal-document-retrieval/Train/train.csv")

def tokenize_text(text):
    if not isinstance(text, str):
        return ""
    return ViTokenizer.tokenize(text)

print("Tokenizing question...")
df["question"] = df["question"].progress_apply(tokenize_text)

print("Tokenizing context...")
def process_context(x):
    try:
        lst = ast.literal_eval(x)
        if isinstance(lst, list):
            return [ViTokenizer.tokenize(i) for i in lst]
        else:
            return [ViTokenizer.tokenize(str(lst))]
    except Exception:
        return [ViTokenizer.tokenize(str(x))]

df["context"] = df["context"].progress_apply(process_context)

print("Processing cid...")
def process_cid(x):
    try:
        return ast.literal_eval(x)
    except Exception:
        return [int(x)] if str(x).isdigit() else []
df["cid"] = df["cid"].progress_apply(process_cid)

df.to_csv("train_tokenized.csv", index=False)

print("Finished train_tokenized.csv")

In [None]:
tqdm.pandas()

df = pd.read_csv("/kaggle/working/train_tokenized.csv")

def clean_cid(x):
    try:
        val = ast.literal_eval(x)
        if isinstance(val, list) and len(val) > 0:
            return val[0]
        else:
            return np.nan
    except Exception:
        return np.nan

print("Cleaning cid column...")
df["cid"] = df["cid"].progress_apply(clean_cid)

df.to_csv("/kaggle/working/train_tokenized.csv", index=False)

print("Done! File saved as train_tokenized.csv")
print(df[["question","cid"]].head(10))

In [None]:
df = pd.read_csv("/kaggle/working/train_tokenized.csv")
df = df.dropna(subset=["cid"])
df["cid"] = df["cid"].astype(int)
df.to_csv("/kaggle/working/train_tokenized.csv", index=False)
print("Done! File saved as train_tokenized.csv")
print(df[["question","cid"]].head(10))

In [None]:
df = pd.read_csv("/kaggle/working/train_tokenized.csv")
df.head()

# Split train - val - test

In [5]:
df = pd.read_csv("/kaggle/input/datause/train_tokenized.csv")

train_df, test_df = train_test_split(df, test_size=0.1, random_state=42)
train_df, val_df = train_test_split(train_df, test_size=0.1, random_state=42)
train_df.to_csv("train_split.csv", index=False)
val_df.to_csv("val_split.csv", index=False)
test_df.to_csv("test_split.csv", index=False)

print(f"Train: {len(train_df)}")
print(f"Validation: {len(val_df)}")
print(f"Test: {len(test_df)}")

Train: 86575
Validation: 9620
Test: 10689


# Word Segmentation for corpus set

In [None]:
tqdm.pandas()

df = pd.read_csv("/kaggle/input/soict-hackathon-2024-legal-document-retrieval/Train/corpus.csv")

def tokenize_text(text):
    if not isinstance(text, str):
        return ""
    return ViTokenizer.tokenize(text)

print("Tokenizing corpus text...")
df["text"] = df["text"].progress_apply(tokenize_text)

df.to_csv("corpus_tokenized.csv", index=False)
print("Finished corpus_tokenized.csv")

# Finetune Bi-encoder

In [6]:
import pandas as pd
from datasets import Dataset
import ast

def create_contrastive_dataset(df):
    anchors = []
    positives = []
    for _, row in df.iterrows():
        q = str(row["question"])
        ctx = row["context"]
        if isinstance(ctx, str) and ctx.startswith("["):
            try:
                lst = ast.literal_eval(ctx)
                ctx = lst[0] if len(lst) > 0 else ""
            except:
                ctx = ""
        anchors.append(q)
        positives.append(str(ctx))
    return Dataset.from_dict({"anchor": anchors, "positive": positives})

train_df = pd.read_csv("/kaggle/working/train_split.csv").dropna(subset=["cid"])
val_df = pd.read_csv("/kaggle/working/val_split.csv").dropna(subset=["cid"])
test_df = pd.read_csv("/kaggle/working/test_split.csv").dropna(subset=["cid"])

# Convert
train_dataset = create_contrastive_dataset(train_df)
val_dataset = create_contrastive_dataset(val_df)
test_dataset = create_contrastive_dataset(test_df)

print(f"Train size: {len(train_dataset)}, Val size: {len(val_dataset)}, Test size: {len(test_dataset)}")

Train size: 86575, Val size: 9620, Test size: 10689


In [None]:
from sentence_transformers import SentenceTransformer, losses

import torch
device = "cuda" if torch.cuda.is_available() else "cpu"

# Pretrained Vietnamese Bi-Encoder
model_name = "bkai-foundation-models/vietnamese-bi-encoder"
model = SentenceTransformer(model_name, device=device)

# Loss function
loss = losses.CachedMultipleNegativesRankingLoss(model, mini_batch_size = 1024)
# loss = losses.MultipleNegativesRankingLoss(model)

In [None]:
from transformers import get_scheduler
from torch.optim import AdamW
from transformers import EarlyStoppingCallback
from sentence_transformers import SentenceTransformerTrainingArguments, SentenceTransformerTrainer

args = SentenceTransformerTrainingArguments(
    output_dir="models/BKAI",
    num_train_epochs=5,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    warmup_ratio=0.1,
    learning_rate=2e-5,
    save_strategy="steps",
    eval_strategy="steps",
    eval_steps=300,
    save_steps=300,
    save_total_limit=2,
    metric_for_best_model="eval_loss",
    logging_steps=100,
    fp16=True,
    weight_decay=0.01,
    load_best_model_at_end=True,
    # dataloader_num_workers=0,
    report_to=[]
)

optimizer = AdamW(model.parameters(), lr=args.learning_rate, weight_decay=args.weight_decay)
num_samples = len(train_dataset)
num_training_steps = (num_samples // args.per_device_train_batch_size) * args.num_train_epochs
scheduler = get_scheduler("linear", optimizer=optimizer, num_warmup_steps=int(len(train_dataset)*0.1), num_training_steps=num_training_steps)

early_stopping = EarlyStoppingCallback(early_stopping_patience=3, early_stopping_threshold=0.005)

In [None]:
trainer = SentenceTransformerTrainer(
    model=model,
    args=args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    loss=loss,
    optimizers=(optimizer, scheduler),
    callbacks=[early_stopping]
)

trainer.train()

# Eval Metrics

In [None]:
import torch
from sentence_transformers import SentenceTransformer, losses
device = "cuda" if torch.cuda.is_available() else "cpu"

model = SentenceTransformer("/kaggle/input/bkai-checkpoint-2100/pytorch/default/1", device=device)

In [None]:
corpus_df = pd.read_csv("/kaggle/input/datause/corpus_tokenized.csv")
corpus_texts = corpus_df["text"].tolist()
corpus_cids = corpus_df["cid"].tolist()
corpus_embeddings = model.encode(corpus_texts, convert_to_tensor=True, device=device)

In [None]:
import torch

torch.save({
    "embeddings": corpus_embeddings,
    "cids": corpus_cids,
    "texts": corpus_texts
}, "/kaggle/working/corpus_embeddings.pt")

In [None]:
import torch
from torch.nn import functional as F
import pandas as pd

def evaluate_with_failures(model, test_df, corpus_embeddings, corpus_cids, k_list=[1,5,10,15,20,30,40,50,100,200], device="cuda"):
    model.eval()
    questions = test_df["question"].tolist()
    true_cids = test_df["cid"].tolist()

    q_emb = model.encode(questions, convert_to_tensor=True, device=device, show_progress_bar=True)
    q_emb = F.normalize(q_emb, p=2, dim=1)
    corpus_embeddings = F.normalize(corpus_embeddings, p=2, dim=1)

    scores = torch.matmul(q_emb, corpus_embeddings.T)

    recalls = {k: 0 for k in k_list}
    mrr_total = 0.0
    failed_cases = []

    for i, cid in enumerate(true_cids):
        topk_idx = torch.topk(scores[i], max(k_list)).indices
        topk_cids = [corpus_cids[idx] for idx in topk_idx.cpu().numpy()]

        # Recall@k
        for k in k_list:
            if cid in topk_cids[:k]:
                recalls[k] += 1

        # MRR
        if cid in topk_cids:
            rank = topk_cids.index(cid) + 1
            mrr_total += 1.0 / rank
        else:
            failed_cases.append({
                "question": questions[i],
                "true_cid": cid,
                "top100_cids": topk_cids[:100]
            })

    n = len(test_df)
    recalls = {k: v / n for k, v in recalls.items()}
    mrr = mrr_total / n

    return recalls, mrr, failed_cases

In [None]:
recalls, mrr, failed_cases = evaluate_with_failures(
    model, test_df, corpus_embeddings, corpus_cids, device=device
)

print("Test Metrics:")
for k, v in recalls.items():
    print(f"Recall@{k}: {v:.4f}")
print(f"MRR: {mrr:.4f}")

failed_df = pd.DataFrame(failed_cases)
failed_df.to_csv("failed_recall100.csv", index=False)
print(f"Failed cases @100: {len(failed_df)} saved to failed_recall100.csv")

# Use multi-query to increase recall 

In [None]:
data = torch.load("/kaggle/input/corpus-embedding/corpus_embeddings.pt", map_location="cuda")
corpus_embeddings = data["embeddings"]
corpus_cids = data["cids"]
corpus_texts = data["texts"]

print("Loaded embeddings:", corpus_embeddings.shape)

In [None]:
# import os
# import openai
# import torch
# import pandas as pd
# from torch.nn import functional as F
# from sentence_transformers import SentenceTransformer

# cosine_threshold = 0.6
# k_list = [1,5,10,15,20,30,40,50,100]

# api_key = input("Enter your OpenAI API key: ")
# openai.api_key = api_key

# test_df = pd.read_csv("/kaggle/working/test_split.csv").dropna(subset=["cid"])
# questions = test_df["question"].tolist()
# true_cids = test_df["cid"].tolist()

# q_emb = model.encode(questions, convert_to_tensor=True, device=device)
# q_emb = F.normalize(q_emb, p=2, dim=1)
# scores = torch.matmul(q_emb, corpus_embeddings.T)

# recalls = {k: 0 for k in k_list}
# mrr_total = 0.0
# low_confidence_questions = []

# for i, cid in enumerate(true_cids):
#     topk_idx = torch.topk(scores[i], max(k_list)).indices
#     topk_cids = [corpus_cids[idx] for idx in topk_idx.cpu().numpy()]
#     max_cosine = scores[i].max().item()

#     # Recall@k
#     for k in k_list:
#         if cid in topk_cids[:k]:
#             recalls[k] += 1

#     # MRR
#     if cid in topk_cids:
#         rank = topk_cids.index(cid) + 1
#         mrr_total += 1.0 / rank

#     if max_cosine < cosine_threshold:
#         low_confidence_questions.append({
#             "question": questions[i],
#             "true_cid": cid,
#             "top100_cids": topk_cids[:100],
#             "max_cosine": max_cosine
#         })

# def generate_paraphrases(query, n=3):
#     try:
#         response = openai.ChatCompletion.create(
#             model="gpt-4o-",
#             messages=[{"role": "user", "content": f"Paraphrase the following question in {n} different ways: {query}"}],
#             temperature=0.7
#         )
#         texts = response.choices[0].message.content.strip().split("\n")
#         return [t for t in texts if t]
#     except Exception as e:
#         print("GPT error:", e)
#         return []

# for item in low_confidence_questions:
#     paraphrases = generate_paraphrases(item["question"], n=3)
#     item["paraphrases"] = paraphrases

# all_queries = questions.copy()
# for item in low_confidence_questions:
#     all_queries.extend(item["paraphrases"])

# all_q_emb = model.encode(all_queries, convert_to_tensor=True, device=device)
# all_q_emb = F.normalize(all_q_emb, p=2, dim=1)
# scores_updated = torch.matmul(all_q_emb, corpus_embeddings.T)

# recalls_updated = {k: 0 for k in k_list}
# mrr_total_updated = 0.0

# for i, cid in enumerate(true_cids):
#     idxs = [i]
#     parap_count = len(low_confidence_questions[i]["paraphrases"]) if i < len(low_confidence_questions) else 0
#     idxs.extend(range(len(all_queries)-parap_count, len(all_queries)))

#     best_score = -1
#     best_rank = None
#     for idx in idxs:
#         topk_idx = torch.topk(scores_updated[idx], max(k_list)).indices
#         topk_cids = [corpus_cids[idx2] for idx2 in topk_idx.cpu().numpy()]

#         for k in k_list:
#             if cid in topk_cids[:k]:
#                 recalls_updated[k] += 1

#         if cid in topk_cids:
#             rank = topk_cids.index(cid) + 1
#             if best_score < 1.0/rank:
#                 best_score = 1.0/rank
#                 best_rank = rank

#     if best_rank is not None:
#         mrr_total_updated += best_score

# n = len(true_cids)
# recalls_updated = {k: v / n for k, v in recalls_updated.items()}
# mrr_updated = mrr_total_updated / n

# print("Updated Test Metrics after multi-query generation:")
# for k, v in recalls_updated.items():
#     print(f"Recall@{k}: {v:.4f}")
# print(f"MRR: {mrr_updated:.4f}")

# # pd.DataFrame(low_confidence_questions).to_csv("low_confidence_questions.csv", index=False)

# Finetune Cross-encoder

In [7]:
import torch
from sentence_transformers import SentenceTransformer, losses
device = "cuda" if torch.cuda.is_available() else "cpu"

model = SentenceTransformer("/kaggle/input/bkai-checkpoint-2100/pytorch/default/1", device=device)

2025-10-15 15:48:15.641091: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1760543295.825154     132 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1760543295.875089     132 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [8]:
import pandas as pd
train = pd.read_csv("/kaggle/input/datause/train_tokenized.csv")
train.head()

Unnamed: 0,question,context,cid,qid
0,Người học ngành quản_lý khai_thác công_trình t...,"['Khả_năng học_tập , nâng cao_trình_độ \n - Kh...",62492,161615
1,Nội_dung lồng_ghép vấn_đề bình_đẳng giới trong...,['Nội_dung lồng_ghép vấn_đề bình_đẳng giới tro...,151154,80037
2,Sản_phẩm phần_mềm có được hưởng ưu_đãi về thời...,"['"" Điều 20 . Ưu_đãi về thời_gian miễn thuế , ...",75071,124074
3,Điều_kiện để giáo_viên trong cơ_sở giáo_dục mầ...,"['Điều_kiện được hưởng \n Cán_bộ quản_lý , giá...",225897,146841
4,Nguyên_tắc áp_dụng phụ_cấp ưu_đãi nghề y_tế th...,['Nguyên_tắc áp_dụng \n 1 . Trường_hợp công_ch...,68365,6176


In [9]:
import torch

def load_corpus_embeddings(pt_path, device="cuda"):
    data = torch.load(pt_path, map_location=device)
    
    corpus_embeddings = data["embeddings"]
    corpus_cids = data["cids"]
    corpus_texts = data["texts"]
    
    print("Loaded embeddings:", corpus_embeddings.shape)
    print("Number of CIDs:", len(corpus_cids))
    print("Number of texts:", len(corpus_texts))
    
    return corpus_embeddings, corpus_cids, corpus_texts

embeddings, cids, texts = load_corpus_embeddings("/kaggle/input/corpus-embedding/corpus_embeddings.pt")

Loaded embeddings: torch.Size([261597, 768])
Number of CIDs: 261597
Number of texts: 261597


In [10]:
corpus = pd.read_csv("/kaggle/input/datause/corpus_tokenized.csv")
corpus.head()

Unnamed: 0,text,cid
0,"Thông_tư này hướng_dẫn tuần_tra , canh_gác bảo...",0
1,"1 . Hàng năm trước mùa mưa , lũ , Ủy_ban nhân_...",1
2,Tiêu_chuẩn của các thành_viên thuộc lực_lượng ...,2
3,"Nhiệm_vụ của lực_lượng tuần_tra , canh_gác đê ...",3
4,"Phù_hiệu của lực_lượng tuần_tra , canh_gác đê ...",4


In [11]:
import random
from tqdm import tqdm
from sentence_transformers import util

train['negative_cid'] = None
train['negative_context'] = None
batch_size = 32

for start_idx in tqdm(range(0, len(train), batch_size)):
    end_idx = min(start_idx + batch_size, len(train))
    batch = train.iloc[start_idx:end_idx]

    questions = batch['question'].tolist()
    query_embeddings = model.encode(questions, show_progress_bar=False)

    batch_hits = util.semantic_search(query_embeddings, embeddings, top_k=50)

    for i, (idx, row) in enumerate(batch.iterrows()):
        if isinstance(row['cid'], (list, tuple)):
            positive_cids = set(row['cid'])
        else:
            positive_cids = {row['cid']}

        candidates = []
        for hit in batch_hits[i]:
            cid = cids[hit['corpus_id']]
            if cid not in positive_cids:
                candidates.append(cid)

        if len(candidates) > 0:
            num_samples = min(10, len(candidates))
            neg_samples = random.sample(candidates, num_samples)
            train.at[idx, 'negative_cid'] = neg_samples
            train.at[idx, 'negative_context'] = [corpus.loc[corpus['cid'] == cid, 'text'].values[0] for cid in neg_samples]

100%|██████████| 3341/3341 [11:08<00:00,  5.00it/s]


In [12]:
train.to_csv("/kaggle/working/train_cross_encoder.csv", index=False)

In [13]:
df = pd.read_csv("/kaggle/working/train_cross_encoder.csv")
df.head()

Unnamed: 0,question,context,cid,qid,negative_cid,negative_context
0,Người học ngành quản_lý khai_thác công_trình t...,"['Khả_năng học_tập , nâng cao_trình_độ \n - Kh...",62492,161615,"[505847, 603422, 592734, 503390, 476410, 92590...",['Điều 1 . Ban_hành kèm theo Thông_tư này quy_...
1,Nội_dung lồng_ghép vấn_đề bình_đẳng giới trong...,['Nội_dung lồng_ghép vấn_đề bình_đẳng giới tro...,151154,80037,"[461115, 133919, 622165, 15113, 591096, 26109,...",['Khoản 3.1 - Nâng cao nhận_thức về công_tác p...
2,Sản_phẩm phần_mềm có được hưởng ưu_đãi về thời...,"['"" Điều 20 . Ưu_đãi về thời_gian miễn thuế , ...",75071,124074,"[233023, 631699, 629610, 561309, 498073, 73603...","['Ưu_đãi miễn , giảm thuế thu_nhập doanh_nghiệ..."
3,Điều_kiện để giáo_viên trong cơ_sở giáo_dục mầ...,"['Điều_kiện được hưởng \n Cán_bộ quản_lý , giá...",225897,146841,"[56406, 16908, 637221, 603297, 578013, 14311, ...",['Quyết_định này Quy_định về một_số chính_sách...
4,Nguyên_tắc áp_dụng phụ_cấp ưu_đãi nghề y_tế th...,['Nguyên_tắc áp_dụng \n 1 . Trường_hợp công_ch...,68365,6176,"[54637, 172835, 223959, 63970, 44729, 159194, ...","['"" Điều 3 . Nguyên_tắc áp_dụng \n 1 . Cán_bộ ..."


In [14]:
import pandas as pd
import ast

columns_to_convert = ["context", "cid", "negative_cid", "negative_context"]
for col in columns_to_convert:
    df[col] = df[col].apply(lambda x: ast.literal_eval(x) if isinstance(x, str) else x)

df['cid'] = df['cid'].apply(lambda x: x if isinstance(x, list) else [x])
df['negative_cid'] = df['negative_cid'].apply(lambda x: x if isinstance(x, list) else [x])

df.to_csv("/kaggle/working/train_cross_encoder.csv", index=False)
df.head()

Unnamed: 0,question,context,cid,qid,negative_cid,negative_context
0,Người học ngành quản_lý khai_thác công_trình t...,"[Khả_năng học_tập , nâng cao_trình_độ \n - Khố...",[62492],161615,"[505847, 603422, 592734, 503390, 476410, 92590...",[Điều 1 . Ban_hành kèm theo Thông_tư này quy_đ...
1,Nội_dung lồng_ghép vấn_đề bình_đẳng giới trong...,[Nội_dung lồng_ghép vấn_đề bình_đẳng giới tron...,[151154],80037,"[461115, 133919, 622165, 15113, 591096, 26109,...",[Khoản 3.1 - Nâng cao nhận_thức về công_tác ph...
2,Sản_phẩm phần_mềm có được hưởng ưu_đãi về thời...,"["" Điều 20 . Ưu_đãi về thời_gian miễn thuế , g...",[75071],124074,"[233023, 631699, 629610, 561309, 498073, 73603...","[Ưu_đãi miễn , giảm thuế thu_nhập doanh_nghiệp..."
3,Điều_kiện để giáo_viên trong cơ_sở giáo_dục mầ...,"[Điều_kiện được hưởng \n Cán_bộ quản_lý , giáo...",[225897],146841,"[56406, 16908, 637221, 603297, 578013, 14311, ...",[Quyết_định này Quy_định về một_số chính_sách ...
4,Nguyên_tắc áp_dụng phụ_cấp ưu_đãi nghề y_tế th...,[Nguyên_tắc áp_dụng \n 1 . Trường_hợp công_chứ...,[68365],6176,"[54637, 172835, 223959, 63970, 44729, 159194, ...","["" Điều 3 . Nguyên_tắc áp_dụng \n 1 . Cán_bộ ,..."


In [2]:
from sklearn.model_selection import train_test_split
import pandas as pd

df = pd.read_csv("/kaggle/input/train-cross-encoder/train_cross_encoder.csv")

train_data, eval_data = train_test_split(df, test_size=0.1, random_state=42)
print(f"Train size: {len(train_data)}, Eval size: {len(eval_data)}")

Train size: 96195, Eval size: 10689


In [None]:
# from sentence_transformers.readers import InputExample
# from sentence_transformers.cross_encoder.evaluation import CEBinaryClassificationEvaluator

# #multi loss
# def create_contrastive_dataset(filtered_data):
#     train_samples = []

#     for index, row in filtered_data.iterrows():
#         query = row['question']
#         i = 0

#         for doc_positive in row['context']:
#             train_samples.append(InputExample(
#                     texts=[query, doc_positive],
#                     label=1  # positive pair
#                 ))

#         for doc_negative in row['negative_context']:
#             if i < 3: 
#                 train_samples.append(InputExample(
#                     texts=[query, doc_negative],
#                     label=0 # negative pair
#                 ))
#                 i+=1

#     return train_samples

# train_dataset = create_contrastive_dataset(train_data)
# print(f"Train dataset size: {len(train_dataset)}")
# validation_dataset = create_contrastive_dataset(eval_data)
# print(f"Val dataset size: {len(validation_dataset)}")

In [3]:
from sentence_transformers.readers import InputExample
from sentence_transformers.cross_encoder.evaluation import CEBinaryClassificationEvaluator

def create_contrastive_dataset_v1(filtered_data, max_negatives_per_positive=2):
    train_samples = []
    for index, row in filtered_data.iterrows():
        query = row['question']
        
        positive_contexts = row['context'] if isinstance(row['context'], list) else [row['context']]
        negative_contexts = row['negative_context'] if isinstance(row['negative_context'], list) else [row['negative_context']]
        
        for doc_positive in positive_contexts:
            train_samples.append(InputExample(
                texts=[query, doc_positive],
                label=1
            ))
            
            for doc_negative in negative_contexts[:max_negatives_per_positive]:
                train_samples.append(InputExample(
                    texts=[query, doc_negative],
                    label=0
                ))
    
    return train_samples

train_dataset = create_contrastive_dataset_v1(train_data)
print(f"Train dataset size: {len(train_dataset)}")
validation_dataset = create_contrastive_dataset_v1(eval_data)
print(f"Val dataset size: {len(validation_dataset)}")

Train dataset size: 192390
Val dataset size: 21378


In [4]:
from sentence_transformers import SentenceTransformer,CrossEncoder, SentenceTransformerTrainer, losses, SentenceTransformerTrainingArguments
from sentence_transformers.training_args import BatchSamplers
from torch.optim import AdamW
import torch
from torch import nn
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model = CrossEncoder('itdainb/PhoRanker', max_length=256, num_labels=1)
model.model.to(device)

config.json:   0%|          | 0.00/814 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/540M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

bpe.codes: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

README.md: 0.00B [00:00, ?B/s]

RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(64001, 768, padding_idx=1)
      (position_embeddings): Embedding(258, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
         

In [5]:
from sentence_transformers.cross_encoder.evaluation import CEBinaryClassificationEvaluator
from torch.utils.data import DataLoader

train_dataloader = DataLoader(train_dataset, batch_size=4, shuffle=True)
evaluator = CEBinaryClassificationEvaluator.from_input_examples(validation_dataset, name="Quora-dev")
#loss_fn = nn.BCEWithLogitsLoss(pos_weight=torch.tensor([10.0])).to(device)
loss_fn = nn.BCEWithLogitsLoss().to(device)

In [6]:
from transformers import logging

logging.set_verbosity_error()

In [7]:
# from transformers import EarlyStoppingCallback

# early_stop = EarlyStoppingCallback(
#     monitor='val_loss',
#     patience=3,
#     min_delta=0.0001
# )

In [None]:
import os
os.environ["WANDB_DISABLED"] = "true"

model.fit(
    train_dataloader=train_dataloader,
    evaluator=evaluator,
    epochs=2,
    loss_fct = loss_fn,
    warmup_steps=int(len(train_dataset) * 0.1),
    optimizer_params={'lr': 3e-4, 'weight_decay': 0.01},
    evaluation_steps=1000,
    save_best_model=True,
    output_path='models/PhoRanker_3_rand_2',
    show_progress_bar=True,
    callback=None,
)

Step,Training Loss,Validation Loss,Quora-dev Accuracy,Quora-dev Accuracy Threshold,Quora-dev F1,Quora-dev F1 Threshold,Quora-dev Precision,Quora-dev Recall,Quora-dev Average Precision
1000,0.2919,No log,0.946721,0.0076,0.946271,0.0076,0.954329,0.938348,0.990273
2000,0.2345,No log,0.942277,0.981106,0.941962,0.981106,0.947129,0.936851,0.987568
3000,0.2472,No log,0.946393,0.974146,0.946263,0.973831,0.948576,0.943961,0.991472


Batches:   0%|          | 0/669 [00:00<?, ?it/s]

Batches:   0%|          | 0/669 [00:00<?, ?it/s]

Batches:   0%|          | 0/669 [00:00<?, ?it/s]

Batches:   0%|          | 0/669 [00:00<?, ?it/s]

# Create data for test

In [None]:
import requests
from bs4 import BeautifulSoup
import openai
import json
import time
import random

openai.api_key = "***"

URL = "https://thuvienphapluat.vn/van-ban/Giao-duc/Thong-tu-21-2025-TT-BGDDT-che-do-tra-tien-luong-day-them-gio-nha-giao-trong-cac-co-so-giao-duc-cong-lap-673797.aspx"
res = requests.get(URL)
soup = BeautifulSoup(res.text, "html.parser")

content_div = soup.find("div", {"id": "divNoiDung"})
text = content_div.get_text(separator="\n").strip()

def chunk_text(text, max_words=700):
    words = text.split()
    for i in range(0, len(words), max_words):
        yield " ".join(words[i:i+max_words])

chunks = list(chunk_text(text))

def generate_qa_block(context, cid_start, qid_start):
    prompt = f"""
Tạo khoảng 5 cặp dữ liệu dạng JSON với cấu trúc:
[
  {{
    "question": "...",
    "context": "...",
    "cid": number,
    "qid": number
  }},
  ...
]

Trong đó:
- question là câu hỏi ngắn, rõ, liên quan nội dung bên dưới
- context là phần trích ngắn từ đoạn văn, chứa đủ thông tin trả lời
- cid và qid là số duy nhất (cid cho context, qid cho từng câu hỏi)
Đây là đoạn văn:
{context}
"""
    response = openai.ChatCompletion.create(
        model="gpt-4o-mini",
        messages=[{"role": "user", "content": prompt}],
        temperature=0.7,
    )
    try:
        data = json.loads(response.choices[0].message.content)
    except:
        data = []
    return data

dataset = []
cid_counter = 60000
qid_counter = 160000

for chunk in chunks[:200]:
    cid_counter += 1
    qid_counter += 1
    data = generate_qa_block(chunk, cid_counter, qid_counter)
    for d in data:
        d["cid"] = cid_counter
        d["qid"] = qid_counter
        qid_counter += 1
        dataset.append(d)
    time.sleep(1)

with open("qa_dataset.jsonl", "w", encoding="utf-8") as f:
    for item in dataset:
        f.write(json.dumps(item, ensure_ascii=False) + "\n")

print(f"{len(dataset)}.")

# Final output MRR@

In [None]:
import torch
import pandas as pd
from sentence_transformers import SentenceTransformer, util
import numpy as np
import ast

data = torch.load("/kaggle/input/corpus-embedding/corpus_embeddings.pt", map_location="cuda")
corpus_embeddings = data["embeddings"]
corpus_cids = data["cids"]
corpus_texts = data["texts"]
print("Loaded embeddings:", corpus_embeddings.shape)

test = pd.read_csv("/kaggle/working/test_split.csv")
test['cid'] = test['cid'].apply(lambda x: ast.literal_eval(x) if isinstance(x, str) else [x])

# Bi-encoder
bi_encoder = SentenceTransformer("/kaggle/input/bkai-checkpoint-2100/pytorch/default/1")

# Cross-encoder
cross_encoder = SentenceTransformer("/kaggle/working/models/PhoRanker_3_rand_2")

questions = test['question'].tolist()
question_embeddings = bi_encoder.encode(questions, convert_to_tensor=True, device='cuda')

top_k = 100
hits = util.semantic_search(question_embeddings, corpus_embeddings, top_k=top_k)

reranked_hits = []
for i, row in enumerate(test.itertuples()):
    q = row.question
    top_candidates = hits[i][:10]

    cross_inputs = []
    cids_list = []
    for hit in top_candidates:
        cid = corpus_cids[hit['corpus_id']]
        context = corpus_texts[hit['corpus_id']]
        cross_inputs.append([q, context])
        cids_list.append(cid)

    scores = cross_encoder.predict(cross_inputs, convert_to_tensor=True).cpu().numpy()
    sorted_indices = np.argsort(-scores)
    sorted_cids = [cids_list[idx] for idx in sorted_indices]
    reranked_hits.append(sorted_cids)

mrr_total = 0.0
for i, row in enumerate(test.itertuples()):
    pos_cids = row.cid if isinstance(row.cid, list) else [row.cid]
    retrieved = reranked_hits[i]

    rank = 0
    for idx, cid in enumerate(retrieved, start=1):
        if cid in pos_cids:
            rank = idx
            break
    if rank > 0:
        mrr_total += 1.0 / rank

mrr_score = mrr_total / len(test)
print("MRR:", mrr_score)

MRR: 0.7682
