In [None]:
# Install Dependencies

!pip uninstall -y numpy
!pip install --force-reinstall numpy==1.26.4

!pip install torch==2.2.1 torchvision==0.17.1 torchaudio==2.2.1 sentence-transformers==2.2.2
!pip install pandas==2.0.0
!pip install transformers==4.41.0 scikit-learn==1.2.0
!pip install huggingface-hub==0.25.2
!pip install nltk==3.8.1 rouge-score==0.1.2 bert-score==0.3.13 -q
!pip install tqdm==4.66.5 -q
!pip install lime==0.2.0.1

Found existing installation: numpy 2.0.2
Uninstalling numpy-2.0.2:
  Successfully uninstalled numpy-2.0.2
Collecting numpy==1.26.4
  Downloading numpy-1.26.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (61 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.0/61.0 kB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading numpy-1.26.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (18.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m18.3/18.3 MB[0m [31m101.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: numpy
Successfully installed numpy-1.26.4
Collecting torch==2.2.1
  Downloading torch-2.2.1-cp311-cp311-manylinux1_x86_64.whl.metadata (26 kB)
Collecting torchvision==0.17.1
  Downloading torchvision-0.17.1-cp311-cp311-manylinux1_x86_64.whl.metadata (6.6 kB)
Collecting torchaudio==2.2.1
  Downloading torchaudio-2.2.1-cp311-cp311-manylinux1_x86_64.whl.metadata (6.4 kB)
Collecting sentenc

In [None]:
# Setup and Imports

import pandas as pd
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import (
    BartForConditionalGeneration, BartTokenizer,
    DPRContextEncoder, DPRQuestionEncoder,
    DPRContextEncoderTokenizer, DPRQuestionEncoderTokenizer
)
from sentence_transformers import SentenceTransformer
from sklearn.model_selection import train_test_split
import os
from google.colab import drive
from tqdm import tqdm
from nltk.translate.bleu_score import sentence_bleu
from rouge_score import rouge_scorer
import bert_score
import lime
import lime.lime_text
import string
import nltk
import json

nltk.download('wordnet')
nltk.download('punkt')

# Mount Google Drive
drive.mount('/content/drive')

# Configuration (aligned with previous steps)
class Config:
    BASE_PATH = "/content/drive/MyDrive/LJMU-Datasets"
    DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    BART_MODEL_NAME = "facebook/bart-base"
    DPR_CTX_MODEL_NAME = "facebook/dpr-ctx_encoder-single-nq-base"
    DPR_QUESTION_MODEL_NAME = "facebook/dpr-question_encoder-single-nq-base"
    BATCH_SIZE = 8
    MAX_EPOCHS = 3
    NUM_WORKERS = 4
    MAX_LENGTH = 256
    SUBSET_SIZE = 500
    HOTPOTQA_MAX_SAMPLES = 1000
    WIKIDATA_SUBSET_SIZE = 30000

CONFIG = Config()
print(f"Using device: {CONFIG.DEVICE}")

# Clear GPU memory
torch.cuda.empty_cache()

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


Mounted at /content/drive
Using device: cuda


In [None]:
# Load Artifacts from Step 3 and Rebuild DataLoaders

import pickle

save_path = '/content/drive/MyDrive/bert_retrieval_artifacts_v3'

# Load raw datasets
qa_train_path = os.path.join(CONFIG.BASE_PATH, "qa_train_v3.csv")
qa_val_path = os.path.join(CONFIG.BASE_PATH, "qa_val_v3.csv")
triple_train_path = os.path.join(CONFIG.BASE_PATH, "triple_train_v3.csv")

qa_train_df = pd.read_csv(qa_train_path)
qa_val_df = pd.read_csv(qa_val_path)
triple_train_df = pd.read_csv(triple_train_path)

# Split triple_train_df into train and validation sets (80/20)
triple_train_df, triple_val_df = train_test_split(triple_train_df, train_size=0.8, random_state=42)
print(f"Triple Train Size: {len(triple_train_df)}, Triple Val Size: {len(triple_val_df)}")

# Load all_candidates
with open(os.path.join(save_path, 'all_candidates_v3.pkl'), 'rb') as f:
    all_candidates = pickle.load(f)

# Define RetrievalDataset class
class RetrievalDataset(Dataset):
    def __init__(self, df: pd.DataFrame, bart_tokenizer: BartTokenizer, dpr_question_tokenizer: DPRQuestionEncoderTokenizer,
                 max_length: int = 256, task: str = "qa", candidate_objects: list = None):
        self.bart_tokenizer = bart_tokenizer
        self.dpr_question_tokenizer = dpr_question_tokenizer
        self.max_length = max_length
        self.task = task
        self.data = df
        self.candidate_objects = candidate_objects

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        row = self.data.iloc[idx]
        question = row["question"]
        context = row["context"]
        answer = row["answer"]

        if self.task == "qa":
            bart_input_text = f"question: {question} context: {context}"
        else:
            bart_input_text = f"question: {question} context: {context}"
        bart_inputs = self.bart_tokenizer(
            bart_input_text,
            return_tensors="pt",
            max_length=self.max_length,
            truncation=True,
            padding="max_length"
        )
        bart_labels = self.bart_tokenizer(
            answer,
            return_tensors="pt",
            max_length=self.max_length,
            truncation=True,
            padding="max_length"
        )

        dpr_inputs = self.dpr_question_tokenizer(
            question,
            return_tensors="pt",
            max_length=self.max_length,
            truncation=True,
            padding="max_length"
        )

        item = {
            "task": self.task,
            "bart_input_ids": bart_inputs["input_ids"].squeeze(),
            "bart_attention_mask": bart_inputs["attention_mask"].squeeze(),
            "bart_labels": bart_labels["input_ids"].squeeze(),
            "dpr_input_ids": dpr_inputs["input_ids"].squeeze(),
            "dpr_attention_mask": dpr_inputs["attention_mask"].squeeze(),
            "question": question,
            "answer": answer
        }

        if self.task == "triple" and self.candidate_objects:
            label_idx = self.candidate_objects.index(answer) if answer in self.candidate_objects else -1
            item["label_idx"] = label_idx

        return item

# Create DataLoaders from raw data
bart_tokenizer = BartTokenizer.from_pretrained(CONFIG.BART_MODEL_NAME)
dpr_question_tokenizer = DPRQuestionEncoderTokenizer.from_pretrained(CONFIG.DPR_QUESTION_MODEL_NAME)

qa_train_dataset = RetrievalDataset(qa_train_df, bart_tokenizer, dpr_question_tokenizer, task="qa", candidate_objects=all_candidates)
qa_val_dataset = RetrievalDataset(qa_val_df, bart_tokenizer, dpr_question_tokenizer, task="qa", candidate_objects=all_candidates)
triple_train_dataset = RetrievalDataset(triple_train_df, bart_tokenizer, dpr_question_tokenizer, task="triple", candidate_objects=all_candidates)
triple_val_dataset = RetrievalDataset(triple_val_df, bart_tokenizer, dpr_question_tokenizer, task="triple", candidate_objects=all_candidates)

qa_train_loader = DataLoader(qa_train_dataset, batch_size=CONFIG.BATCH_SIZE, shuffle=True, num_workers=CONFIG.NUM_WORKERS)
qa_val_loader = DataLoader(qa_val_dataset, batch_size=CONFIG.BATCH_SIZE, shuffle=False, num_workers=CONFIG.NUM_WORKERS)
triple_train_loader = DataLoader(triple_train_dataset, batch_size=CONFIG.BATCH_SIZE, shuffle=True, num_workers=CONFIG.NUM_WORKERS)
triple_val_loader = DataLoader(triple_val_dataset, batch_size=CONFIG.BATCH_SIZE, shuffle=False, num_workers=CONFIG.NUM_WORKERS)

print(f"Created QA DataLoaders: QA Train={len(qa_train_dataset)}, QA Val={len(qa_val_dataset)}")
print(f"Created Triple DataLoaders: Triple Train={len(triple_train_dataset)}, Triple Val={len(triple_val_dataset)}")

# Load BART models and tokenizer
bart_qa_model = BartForConditionalGeneration.from_pretrained(CONFIG.BART_MODEL_NAME).to(CONFIG.DEVICE)
bart_qa_model.load_state_dict(torch.load(os.path.join(save_path, 'bart_qa_v3.pt')))
bart_triple_model = BartForConditionalGeneration.from_pretrained(CONFIG.BART_MODEL_NAME).to(CONFIG.DEVICE)
bart_triple_model.load_state_dict(torch.load(os.path.join(save_path, 'bart_triple_v3.pt')))
bart_tokenizer = BartTokenizer.from_pretrained(CONFIG.BART_MODEL_NAME)
bart_qa_model.eval()
bart_triple_model.eval()

# Load DPR models and tokenizers (post-RL fine-tuning)
ctx_encoder_qa = DPRContextEncoder.from_pretrained(os.path.join(CONFIG.BASE_PATH, "dpr_ctx_encoder_rl_qa_v3")).to(CONFIG.DEVICE)
question_encoder_qa = DPRQuestionEncoder.from_pretrained(os.path.join(CONFIG.BASE_PATH, "dpr_question_encoder_rl_qa_v3")).to(CONFIG.DEVICE)
ctx_encoder_triple = DPRContextEncoder.from_pretrained(os.path.join(CONFIG.BASE_PATH, "dpr_ctx_encoder_rl_triple_v3")).to(CONFIG.DEVICE)
question_encoder_triple = DPRQuestionEncoder.from_pretrained(os.path.join(CONFIG.BASE_PATH, "dpr_question_encoder_rl_triple_v3")).to(CONFIG.DEVICE)
ctx_tokenizer = DPRContextEncoderTokenizer.from_pretrained(CONFIG.DPR_CTX_MODEL_NAME)
question_tokenizer = DPRQuestionEncoderTokenizer.from_pretrained(CONFIG.DPR_QUESTION_MODEL_NAME)
candidate_embeddings_qa = torch.load(os.path.join(save_path, 'dpr_candidate_embeddings_v3.pt')).to(CONFIG.DEVICE)
candidate_embeddings_triple = torch.load(os.path.join(save_path, 'dpr_candidate_embeddings_triple_v3.pt')).to(CONFIG.DEVICE)

# Load sentence transformer
sentence_transformer = SentenceTransformer('all-MiniLM-L6-v2')

print("Artifacts loaded from Step 3 and DataLoaders rebuilt.")

Triple Train Size: 4000, Triple Val Size: 1000


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]



config.json:   0%|          | 0.00/1.72k [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/493 [00:00<?, ?B/s]

Created QA DataLoaders: QA Train=1400, QA Val=100
Created Triple DataLoaders: Triple Train=4000, Triple Val=1000


model.safetensors:   0%|          | 0.00/558M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/492 [00:00<?, ?B/s]

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'DPRQuestionEncoderTokenizer'. 
The class this function is called from is 'DPRContextEncoderTokenizer'.


.gitattributes:   0%|          | 0.00/1.23k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.5k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

data_config.json:   0%|          | 0.00/39.3k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

model.onnx:   0%|          | 0.00/90.4M [00:00<?, ?B/s]

model_O1.onnx:   0%|          | 0.00/90.4M [00:00<?, ?B/s]

model_O2.onnx:   0%|          | 0.00/90.3M [00:00<?, ?B/s]

model_O3.onnx:   0%|          | 0.00/90.3M [00:00<?, ?B/s]

model_O4.onnx:   0%|          | 0.00/45.2M [00:00<?, ?B/s]

model_qint8_arm64.onnx:   0%|          | 0.00/23.0M [00:00<?, ?B/s]

model_qint8_avx512.onnx:   0%|          | 0.00/23.0M [00:00<?, ?B/s]

model_qint8_avx512.onnx:   0%|          | 0.00/23.0M [00:00<?, ?B/s]

model_quint8_avx2.onnx:   0%|          | 0.00/23.0M [00:00<?, ?B/s]

openvino_model.bin:   0%|          | 0.00/90.3M [00:00<?, ?B/s]

openvino_model.xml:   0%|          | 0.00/211k [00:00<?, ?B/s]

openvino_model_qint8_quantized.bin:   0%|          | 0.00/22.9M [00:00<?, ?B/s]

openvino_model_qint8_quantized.xml:   0%|          | 0.00/368k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

train_script.py:   0%|          | 0.00/13.2k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

Artifacts loaded from Step 3 and DataLoaders rebuilt.


In [None]:
# Define Helper Functions

# Normalize text for evaluation
def normalize_text(text: str) -> str:
    text = str(text).lower().strip()
    text = text.translate(str.maketrans("", "", string.punctuation))
    articles = {'a', 'an', 'the'}
    words = text.split()
    words = [word for word in words if word not in articles]
    return ' '.join(words)

# Compute BLEU score
def compute_bleu(generated: str, reference: str) -> float:
    return sentence_bleu([reference.split()], generated.split())

# Compute ROUGE-L score
def compute_rouge_l(generated: str, reference: str) -> float:
    scorer = rouge_scorer.RougeScorer(['rougeL'], use_stemmer=True)
    return scorer.score(reference, generated)['rougeL'].fmeasure

# Compute BERTScore
def compute_bertscore(generated: str, reference: str) -> float:
    return bert_score.score([generated], [reference], lang="en", verbose=False)[2].mean().item()

print("Helper functions defined.")

Helper functions defined.


In [None]:
# Explainability with LIME and Custom Permutation SHAP

# Function to compute DPR similarity scores for a given question embedding and candidates
def compute_dpr_similarities_from_embedding(question_embedding, candidates, ctx_encoder, tokenizer):
    with torch.no_grad():
        question_embedding = torch.tensor(question_embedding, dtype=torch.float32).to(CONFIG.DEVICE).unsqueeze(0)  # Shape: (1, 768)

        candidate_inputs = tokenizer(
            candidates,
            return_tensors="pt",
            padding=True,
            truncation=True,
            max_length=CONFIG.MAX_LENGTH
        )
        candidate_inputs = {k: v.to(CONFIG.DEVICE) for k, v in candidate_inputs.items()}
        candidate_embeddings = ctx_encoder(**candidate_inputs).pooler_output  # Shape: (num_candidates, 768)

        similarities = torch.matmul(question_embedding, candidate_embeddings.T).squeeze(0)  # Shape: (num_candidates,)
        return similarities.cpu().numpy()

# Function to convert text to embeddings using DPR
def text_to_dpr_embedding(texts, question_encoder, tokenizer):
    if isinstance(texts, str):
        texts = [texts]
    dpr_inputs = tokenizer(
        texts,
        return_tensors="pt",
        padding=True,
        truncation=True,
        max_length=CONFIG.MAX_LENGTH
    )
    dpr_inputs = {k: v.to(CONFIG.DEVICE) for k, v in dpr_inputs.items()}
    with torch.no_grad():
        dpr_embeddings = question_encoder(**dpr_inputs).pooler_output  # Shape: (n_texts, 768)
    return dpr_embeddings.cpu().numpy()

# Wrapper function for LIME (works with raw text)
def dpr_predict(texts, question_encoder, candidates, ctx_encoder, tokenizer):
    embeddings = text_to_dpr_embedding(texts, question_encoder, tokenizer)
    similarities_list = []
    for emb in embeddings:
        similarities = compute_dpr_similarities_from_embedding(emb, candidates, ctx_encoder, tokenizer)
        similarities_list.append(similarities)
    return np.array(similarities_list)

# Wrapper function for LIME to handle perturbed samples
def lime_dpr_predict(texts, question_encoder, candidates, ctx_encoder, tokenizer):
    if not isinstance(texts, list):
        texts = [texts]
    num_samples = len(texts)
    embeddings = text_to_dpr_embedding(texts, question_encoder, tokenizer)
    similarities_list = []
    for emb in embeddings:
        similarities = compute_dpr_similarities_from_embedding(emb, candidates, ctx_encoder, tokenizer)
        similarities_list.append(similarities)
    similarities_array = np.array(similarities_list)  # Shape: (num_samples, num_candidates)
    probs = (similarities_array - similarities_array.min()) / (similarities_array.max() - similarities_array.min())
    return np.vstack([1 - probs, probs]).T  # Shape: (num_samples, 2)

# Custom permutation-based SHAP approximation
def custom_permutation_shap(question, candidates, question_encoder, ctx_encoder, tokenizer, num_permutations=100):
    baseline_similarities = dpr_predict([question], question_encoder, candidates, ctx_encoder, tokenizer)[0]
    feature_importance = np.zeros(len(candidates))
    original_idx = np.argmax(baseline_similarities)

    for _ in range(num_permutations):
        permuted_candidates = candidates.copy()
        np.random.shuffle(permuted_candidates)
        permuted_similarities = dpr_predict([question], question_encoder, permuted_candidates, ctx_encoder, tokenizer)[0]
        permuted_idx = np.argmax(permuted_similarities)
        importance = abs(baseline_similarities[original_idx] - permuted_similarities[permuted_idx])
        feature_importance += importance
    return feature_importance / num_permutations

# Select examples for explainability (10 QA, 10 triple)
qa_examples = qa_val_df.sample(n=10, random_state=42)
triple_examples = triple_val_df.sample(n=10, random_state=42)

# LIME Explainability (Primary Method)
print("Computing LIME explanations...")
lime_explainer = lime.lime_text.LimeTextExplainer(class_names=["similarity_score"])

lime_explanations_qa = []
lime_explanations_triple = []

# LIME for QA examples
for i, row in qa_examples.iterrows():
    question = row["question"]
    explanation = lime_explainer.explain_instance(
        question,
        lambda texts: lime_dpr_predict(texts, question_encoder_qa, all_candidates[:100], ctx_encoder_qa, question_tokenizer),
        num_features=5,
        num_samples=100
    )
    lime_explanations_qa.append((question, explanation.as_list()))
    print(f"LIME computed for QA example {i+1}/10")

# LIME for triple examples
for i, row in triple_examples.iterrows():
    question = row["question"]
    explanation = lime_explainer.explain_instance(
        question,
        lambda texts: lime_dpr_predict(texts, question_encoder_triple, all_candidates[:100], ctx_encoder_triple, question_tokenizer),
        num_features=5,
        num_samples=100
    )
    lime_explanations_triple.append((question, explanation.as_list()))
    print(f"LIME computed for triple example {i+1}/10")

# Custom Permutation SHAP (Secondary Method)
print("Computing custom permutation SHAP explanations...")
permutation_shap_qa = []
permutation_shap_triple = []

# Permutation SHAP for QA examples
for i, row in qa_examples.iterrows():
    question = row["question"]
    importance = custom_permutation_shap(question, all_candidates[:100], question_encoder_qa, ctx_encoder_qa, question_tokenizer)
    permutation_shap_qa.append((question, importance))
    print(f"Permutation SHAP computed for QA example {i+1}/10")

# Permutation SHAP for triple examples
for i, row in triple_examples.iterrows():
    question = row["question"]
    importance = custom_permutation_shap(question, all_candidates[:100], question_encoder_triple, ctx_encoder_triple, question_tokenizer)
    permutation_shap_triple.append((question, importance))
    print(f"Permutation SHAP computed for triple example {i+1}/10")

# Save explainability results
explainability_results = {
    "lime_qa": lime_explanations_qa,
    "lime_triple": lime_explanations_triple,
    "permutation_shap_qa": [(q, v.tolist()) for q, v in permutation_shap_qa],
    "permutation_shap_triple": [(q, v.tolist()) for q, v in permutation_shap_triple]
}

explainability_path = os.path.join(CONFIG.BASE_PATH, "explainability_results_v3.json")
with open(explainability_path, "w") as f:
    json.dump(explainability_results, f)
print(f"Saved explainability results at {explainability_path}")

# Print sample explanations
print("Sample LIME Explanations (QA):")
for question, explanation in lime_explanations_qa[:2]:
    print(f"Question: {question}")
    print(f"LIME Explanation: {explanation}\n")

print("Sample LIME Explanations (Triple):")
for question, explanation in lime_explanations_triple[:2]:
    print(f"Question: {question}")
    print(f"LIME Explanation: {explanation}\n")

print("Sample Permutation SHAP Explanations (QA):")
for question, importance in permutation_shap_qa[:2]:
    print(f"Question: {question}")
    print(f"Permutation SHAP Importance: {importance[:5]}...\n")

print("Sample Permutation SHAP Explanations (Triple):")
for question, importance in permutation_shap_triple[:2]:
    print(f"Question: {question}")
    print(f"Permutation SHAP Importance: {importance[:5]}...\n")

Computing LIME explanations...
LIME computed for QA example 84/10
LIME computed for QA example 54/10
LIME computed for QA example 71/10
LIME computed for QA example 46/10
LIME computed for QA example 45/10
LIME computed for QA example 40/10
LIME computed for QA example 23/10
LIME computed for QA example 81/10
LIME computed for QA example 11/10
LIME computed for QA example 1/10
LIME computed for triple example 3708/10
LIME computed for triple example 829/10
LIME computed for triple example 2665/10
LIME computed for triple example 1048/10
LIME computed for triple example 3198/10
LIME computed for triple example 4201/10
LIME computed for triple example 1506/10
LIME computed for triple example 997/10
LIME computed for triple example 2751/10
LIME computed for triple example 4446/10
Computing custom permutation SHAP explanations...
Permutation SHAP computed for QA example 84/10
Permutation SHAP computed for QA example 54/10
Permutation SHAP computed for QA example 71/10
Permutation SHAP comp

In [None]:
# (Part 1): Quantitative Validation - Evaluate BART

# Evaluate BART on both QA and triple tasks
def evaluate_bart(model, val_loader, task: str = "qa"):
    print(f"Evaluating BART for {task}...")
    model.eval()
    bleu_scores, rouge_scores, bert_scores = [], [], []
    sample_outputs = []
    scorer = rouge_scorer.RougeScorer(['rougeL'], use_stemmer=True)

    with torch.no_grad():
        for batch in tqdm(val_loader, desc=f"Evaluating {task}"):
            input_ids = batch["bart_input_ids"].to(CONFIG.DEVICE)
            attention_mask = batch["bart_attention_mask"].to(CONFIG.DEVICE)
            references = batch["answer"]
            generated_ids = model.generate(
                input_ids=input_ids,
                attention_mask=attention_mask,
                max_new_tokens=100,
                num_beams=20,
                temperature=0.5,
                no_repeat_ngram_size=2
            )
            generated_texts = [bart_tokenizer.decode(g_ids, skip_special_tokens=True) for g_ids in generated_ids]
            for gen, ref in zip(generated_texts, references):
                gen = normalize_text(gen)
                ref = normalize_text(ref)
                bleu = compute_bleu(gen, ref)
                rouge = compute_rouge_l(gen, ref)
                bert_f1 = compute_bertscore(gen, ref)
                bleu_scores.append(bleu)
                rouge_scores.append(rouge)
                bert_scores.append(bert_f1)
                sample_outputs.append((gen, ref))
            del input_ids, attention_mask, generated_ids
            torch.cuda.empty_cache()
    avg_bleu = np.mean(bleu_scores)
    avg_rouge = np.mean(rouge_scores)
    avg_bert = np.mean(bert_scores)
    print(f"BART {task} Evaluation:")
    print(f"Average BLEU: {avg_bleu:.4f}")
    print(f"Average ROUGE-L: {avg_rouge:.4f}")
    print(f"Average BERTScore F1: {avg_bert:.4f}")
    print(f"Sample Outputs (First 5) for {task}:")
    for gen, ref in sample_outputs[:5]:
        print(f"Generated: {gen}")
        print(f"Reference: {ref}\n")
    return avg_bleu, avg_rouge, avg_bert

# Evaluate BART on QA and triple tasks
bart_qa_bleu, bart_qa_rouge, bart_qa_bert = evaluate_bart(bart_qa_model, qa_val_loader, task="qa")
bart_triple_bleu, bart_triple_rouge, bart_triple_bert = evaluate_bart(bart_triple_model, triple_val_loader, task="triple")

Evaluating BART for qa...


Evaluating qa:   0%|          | 0/13 [00:00<?, ?it/s]Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model check

BART qa Evaluation:
Average BLEU: 0.0000
Average ROUGE-L: 0.0000
Average BERTScore F1: 0.7867
Sample Outputs (First 5) for qa:
Generated: answer
Reference: 80

Generated: answer
Reference: unanswerable

Generated: answer
Reference: external senses

Generated: answer
Reference: wellington college

Generated: answer
Reference: 200 million kilowatt hours

Evaluating BART for triple...


Evaluating triple:   0%|          | 0/125 [00:00<?, ?it/s]Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model 

BART triple Evaluation:
Average BLEU: 0.0000
Average ROUGE-L: 0.0495
Average BERTScore F1: 0.8222
Sample Outputs (First 5) for triple:
Generated: sce
Reference: enschede

Generated: omine
Reference: menominee

Generated: oul
Reference: seoul

Generated: oth
Reference: gotha

Generated: orzh
Reference: voronezh






In [None]:
# (Part 2): Quantitative Validation - Evaluate DPR

# Evaluate DPR on both QA and triple tasks at k=1, 5, 10
def evaluate_dpr_k(ctx_encoder, question_encoder, val_loader, candidates, k_values=[1, 5, 10], task: str = "qa"):
    ctx_encoder.eval()
    question_encoder.eval()
    mrr_k, precision_k = {k: [] for k in k_values}, {k: [] for k in k_values}
    eval_candidates = candidates
    print(f"Task: {task}, Using candidate pool size: {len(eval_candidates)}")
    candidate_inputs = ctx_tokenizer(eval_candidates, return_tensors="pt", padding=True, truncation=True, max_length=CONFIG.MAX_LENGTH)
    candidate_inputs = {k: v.to(CONFIG.DEVICE) for k, v in candidate_inputs.items()}
    with torch.no_grad():
        candidate_embeddings = ctx_encoder(**candidate_inputs).pooler_output
    with torch.no_grad():
        for batch in tqdm(val_loader, desc=f"Evaluating DPR {task}"):
            question_inputs = {
                "input_ids": batch["dpr_input_ids"].to(CONFIG.DEVICE),
                "attention_mask": batch["dpr_attention_mask"].to(CONFIG.DEVICE)
            }
            references = batch["answer"]
            question_embeddings = question_encoder(**question_inputs).pooler_output
            similarities = torch.matmul(question_embeddings, candidate_embeddings.T)
            rankings = torch.argsort(similarities, dim=1, descending=True)
            for i, (ranking, ref) in enumerate(zip(rankings, references)):
                ref_idx = eval_candidates.index(ref) if ref in eval_candidates else -1
                if ref_idx == -1:
                    continue
                for k in k_values:
                    top_k_indices = ranking[:k]
                    rank = (top_k_indices == ref_idx).nonzero(as_tuple=True)[0].item() + 1 if ref_idx in top_k_indices else len(eval_candidates)
                    mrr_k[k].append(1.0 / rank)
                    precision_k[k].append(1.0 if ref_idx in top_k_indices else 0.0)
            del question_inputs, similarities, rankings
            torch.cuda.empty_cache()
    results = {}
    for k in k_values:
        avg_mrr = np.mean(mrr_k[k])
        avg_precision = np.mean(precision_k[k])
        results[f"mrr_at_{k}"] = avg_mrr
        results[f"precision_at_{k}"] = avg_precision
        print(f"DPR Evaluation ({task}) at k={k}:")
        print(f"MRR@{k}: {avg_mrr:.4f}")
        print(f"Precision@{k}: {avg_precision:.4f}")
    return results

# Evaluate DPR on QA and triple tasks
dpr_qa_metrics = evaluate_dpr_k(ctx_encoder_qa, question_encoder_qa, qa_val_loader, all_candidates, task="qa")
dpr_triple_metrics = evaluate_dpr_k(ctx_encoder_triple, question_encoder_triple, triple_val_loader, all_candidates, task="triple")

Task: qa, Using candidate pool size: 2024


Evaluating DPR qa: 100%|██████████| 13/13 [00:01<00:00,  9.96it/s]


DPR Evaluation (qa) at k=1:
MRR@1: 0.3103
Precision@1: 0.3100
DPR Evaluation (qa) at k=5:
MRR@5: 0.3901
Precision@5: 0.5200
DPR Evaluation (qa) at k=10:
MRR@10: 0.3993
Precision@10: 0.5900
Task: triple, Using candidate pool size: 2024


Evaluating DPR triple: 100%|██████████| 125/125 [00:04<00:00, 25.18it/s]

DPR Evaluation (triple) at k=1:
MRR@1: 0.3653
Precision@1: 0.3650
DPR Evaluation (triple) at k=5:
MRR@5: 0.4492
Precision@5: 0.5830
DPR Evaluation (triple) at k=10:
MRR@10: 0.4589
Precision@10: 0.6580





In [None]:
# (Part 3): Quantitative Validation - Evaluate DPR-based Ensemble

# Evaluate DPR-based Ensemble on both QA and triple tasks at k=1, 5, 10
def ensemble_evaluate_dpr_k(ctx_encoder, question_encoder, val_loader, candidates, k_values=[1, 5, 10], top_k: int = 30, task: str = "qa"):
    print(f"Evaluating DPR-based ensemble for {task}...")
    ctx_encoder.eval()
    question_encoder.eval()
    mrr_k, precision_k = {k: [] for k in k_values}, {k: [] for k in k_values}
    candidate_inputs = ctx_tokenizer(candidates, return_tensors="pt", padding=True, truncation=True, max_length=CONFIG.MAX_LENGTH)
    candidate_inputs = {k: v.to(CONFIG.DEVICE) for k, v in candidate_inputs.items()}
    with torch.no_grad():
        candidate_embeddings = ctx_encoder(**candidate_inputs).pooler_output
    with torch.no_grad():
        for batch in tqdm(val_loader, desc=f"Ensemble Evaluating {task}"):
            question_inputs = {
                "input_ids": batch["dpr_input_ids"].to(CONFIG.DEVICE),
                "attention_mask": batch["dpr_attention_mask"].to(CONFIG.DEVICE)
            }
            references = batch["answer"]
            question_embeddings = question_encoder(**question_inputs).pooler_output
            similarities = torch.matmul(question_embeddings, candidate_embeddings.T)
            rankings = torch.argsort(similarities, dim=1, descending=True)
            top_k_indices = rankings[:, :top_k]  # Shape: (batch_size, top_k)
            batch_size = top_k_indices.size(0)
            for i in range(batch_size):
                top_k_candidate_indices = top_k_indices[i].cpu().numpy()
                top_k_candidates = [candidates[idx] for idx in top_k_candidate_indices]
                ref = references[i]
                ref_idx = top_k_candidates.index(ref) if ref in top_k_candidates else -1
                if ref_idx == -1:
                    continue
                for k in k_values:
                    top_k_subset = top_k_indices[i, :k]
                    # Use torch.where to find the rank safely
                    matches = torch.where(top_k_subset == top_k_candidate_indices[ref_idx])[0]
                    if matches.numel() > 0:
                        rank = matches[0].item() + 1
                    else:
                        rank = len(top_k_candidates)
                    mrr_k[k].append(1.0 / rank)
                    precision_k[k].append(1.0 if rank == 1 else 0.0)
            del question_inputs, similarities, rankings, top_k_indices
            torch.cuda.empty_cache()
    results = {}
    for k in k_values:
        avg_mrr = np.mean(mrr_k[k]) if mrr_k[k] else 0.0
        avg_precision = np.mean(precision_k[k]) if precision_k[k] else 0.0
        results[f"mrr_at_{k}"] = avg_mrr
        results[f"precision_at_{k}"] = avg_precision
        print(f"DPR-based Ensemble Evaluation ({task}) at k={k}:")
        print(f"MRR@{k}: {avg_mrr:.4f}")
        print(f"Precision@{k}: {avg_precision:.4f}")
    return results

# Evaluate ensemble on QA and triple tasks
ensemble_qa_metrics = ensemble_evaluate_dpr_k(ctx_encoder_qa, question_encoder_qa, qa_val_loader, all_candidates, task="qa")
ensemble_triple_metrics = ensemble_evaluate_dpr_k(ctx_encoder_triple, question_encoder_triple, triple_val_loader, all_candidates, task="triple")

Evaluating DPR-based ensemble for qa...


Ensemble Evaluating qa: 100%|██████████| 13/13 [00:01<00:00, 10.35it/s]


DPR-based Ensemble Evaluation (qa) at k=1:
MRR@1: 0.4225
Precision@1: 0.4026
DPR-based Ensemble Evaluation (qa) at k=5:
MRR@5: 0.5171
Precision@5: 0.4026
DPR-based Ensemble Evaluation (qa) at k=10:
MRR@10: 0.5262
Precision@10: 0.4026
Evaluating DPR-based ensemble for triple...


Ensemble Evaluating triple: 100%|██████████| 125/125 [00:04<00:00, 25.95it/s]

DPR-based Ensemble Evaluation (triple) at k=1:
MRR@1: 0.4845
Precision@1: 0.4668
DPR-based Ensemble Evaluation (triple) at k=5:
MRR@5: 0.5827
Precision@5: 0.4668
DPR-based Ensemble Evaluation (triple) at k=10:
MRR@10: 0.5919
Precision@10: 0.4668





In [None]:
# (Part 4): Quantitative Validation - Save Results

# Save quantitative results
quantitative_results = {
    "bart_qa": {"bleu": bart_qa_bleu, "rouge": bart_qa_rouge, "bertscore": bart_qa_bert},
    "bart_triple": {"bleu": bart_triple_bleu, "rouge": bart_triple_rouge, "bertscore": bart_triple_bert},
    "dpr_qa": dpr_qa_metrics,
    "dpr_triple": dpr_triple_metrics,
    "ensemble_qa": ensemble_qa_metrics,
    "ensemble_triple": ensemble_triple_metrics
}

quantitative_path = os.path.join(CONFIG.BASE_PATH, "quantitative_results_v3.json")
with open(quantitative_path, "w") as f:
    json.dump(quantitative_results, f)
print(f"Saved quantitative results at {quantitative_path}")

Saved quantitative results at /content/drive/MyDrive/LJMU-Datasets/quantitative_results_v3.json


In [None]:
# Cell 7: Qualitative Analysis (Human Assessment)

# Select samples for qualitative analysis (5 QA, 5 triple)
qa_samples = qa_val_df.sample(n=5, random_state=42)
triple_samples = triple_val_df.sample(n=5, random_state=42)

# Generate predictions for qualitative analysis
def generate_predictions(model, val_loader, task: str = "qa"):
    model.eval()
    predictions = []
    with torch.no_grad():
        for batch in val_loader:
            print(f"Batch keys: {batch.keys()}")
            input_ids = batch["bart_input_ids"].to(CONFIG.DEVICE)
            attention_mask = batch["bart_attention_mask"].to(CONFIG.DEVICE)
            references = batch["answer"]
            questions = batch["question"]
            generated_ids = model.generate(
                input_ids=input_ids,
                attention_mask=attention_mask,
                max_new_tokens=100,
                num_beams=20,
                temperature=0.5,
                no_repeat_ngram_size=2
            )
            generated_texts = [bart_tokenizer.decode(g_ids, skip_special_tokens=True).lower().strip() for g_ids in generated_ids]
            for q, gen, ref in zip(questions, generated_texts, references):
                predictions.append({"question": q, "generated": gen, "reference": ref})
            del input_ids, attention_mask, generated_ids
            torch.cuda.empty_cache()
    return predictions

# Create DataLoaders with adjusted batch size
qa_sample_loader = DataLoader(
    RetrievalDataset(qa_samples, bart_tokenizer, question_tokenizer, task="qa", candidate_objects=all_candidates),
    batch_size=1,
    shuffle=False,
    num_workers=CONFIG.NUM_WORKERS
)
triple_sample_loader = DataLoader(
    RetrievalDataset(triple_samples, bart_tokenizer, question_tokenizer, task="triple", candidate_objects=all_candidates),
    batch_size=1,
    shuffle=False,
    num_workers=CONFIG.NUM_WORKERS
)

# Generate predictions for QA and triple samples
qa_predictions = generate_predictions(bart_qa_model, qa_sample_loader, task="qa")
triple_predictions = generate_predictions(bart_triple_model, triple_sample_loader, task="triple")

# Perform qualitative analysis
qualitative_analysis = {
    "qa": [],
    "triple": []
}

# Assess QA predictions
for pred in qa_predictions:
    question = pred["question"]
    generated = pred["generated"]
    reference = pred["reference"]
    coherence = "Coherent" if len(generated.split()) > 3 and generated != reference else "Incoherent"
    reliability = "Reliable" if generated == reference else "Unreliable"
    interpretability = "Interpretable" if len(generated.split()) > 3 else "Not Interpretable"
    qualitative_analysis["qa"].append({
        "question": question,
        "generated": generated,
        "reference": reference,
        "coherence": coherence,
        "reliability": reliability,
        "interpretability": interpretability
    })

# Assess triple predictions
for pred in triple_predictions:
    question = pred["question"]
    generated = pred["generated"]
    reference = pred["reference"]
    coherence = "Coherent" if len(generated.split()) > 1 and generated != reference else "Incoherent"
    reliability = "Reliable" if generated == reference else "Unreliable"
    interpretability = "Interpretable" if len(generated.split()) > 1 else "Not Interpretable"
    qualitative_analysis["triple"].append({
        "question": question,
        "generated": generated,
        "reference": reference,
        "coherence": coherence,
        "reliability": reliability,
        "interpretability": interpretability
    })

# Save qualitative analysis
qualitative_path = os.path.join(CONFIG.BASE_PATH, "qualitative_analysis_v3.json")
with open(qualitative_path, "w") as f:
    json.dump(qualitative_analysis, f)
print(f"Saved qualitative analysis at {qualitative_path}")

# Print sample qualitative analysis
print("Sample Qualitative Analysis (QA):")
for entry in qualitative_analysis["qa"][:2]:
    print(f"Question: {entry['question']}")
    print(f"Generated: {entry['generated']}")
    print(f"Reference: {entry['reference']}")
    print(f"Coherence: {entry['coherence']}")
    print(f"Reliability: {entry['reliability']}")
    print(f"Interpretability: {entry['interpretability']}\n")

print("Sample Qualitative Analysis (Triple):")
for entry in qualitative_analysis["triple"][:2]:
    print(f"Question: {entry['question']}")
    print(f"Generated: {entry['generated']}")
    print(f"Reference: {entry['reference']}")
    print(f"Coherence: {entry['coherence']}")
    print(f"Reliability: {entry['reliability']}")
    print(f"Interpretability: {entry['interpretability']}\n")

Batch keys: dict_keys(['task', 'bart_input_ids', 'bart_attention_mask', 'bart_labels', 'dpr_input_ids', 'dpr_attention_mask', 'question', 'answer'])
Batch keys: dict_keys(['task', 'bart_input_ids', 'bart_attention_mask', 'bart_labels', 'dpr_input_ids', 'dpr_attention_mask', 'question', 'answer'])
Batch keys: dict_keys(['task', 'bart_input_ids', 'bart_attention_mask', 'bart_labels', 'dpr_input_ids', 'dpr_attention_mask', 'question', 'answer'])
Batch keys: dict_keys(['task', 'bart_input_ids', 'bart_attention_mask', 'bart_labels', 'dpr_input_ids', 'dpr_attention_mask', 'question', 'answer'])
Batch keys: dict_keys(['task', 'bart_input_ids', 'bart_attention_mask', 'bart_labels', 'dpr_input_ids', 'dpr_attention_mask', 'question', 'answer'])
Batch keys: dict_keys(['task', 'bart_input_ids', 'bart_attention_mask', 'bart_labels', 'dpr_input_ids', 'dpr_attention_mask', 'question', 'answer', 'label_idx'])
Batch keys: dict_keys(['task', 'bart_input_ids', 'bart_attention_mask', 'bart_labels', 'dpr_i