In [1]:
from datasets import load_dataset
from sentence_transformers import SentenceTransformer
import torch
import numpy as np
import pandas as pd
import faiss
import os
from tqdm import tqdm
import textwrap

# Load test data
test_data = load_dataset("neural-bridge/rag-dataset-12000", split="test")
test_contexts = test_data["context"]
chunk_size = 300  # Approx characters per chunk (or use token-based)

# Load model
model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")

# Storage
os.makedirs("chunks_test_faiss_store", exist_ok=True)
all_chunks = []
chunk_text_to_original = []

# Chunk all contexts
print("Chunking and encoding contexts...")
for original_context in tqdm(test_contexts):
    if not isinstance(original_context, str):
        continue
    # Simple char-based chunking (or use token-based for better accuracy)
    chunks = textwrap.wrap(original_context, width=chunk_size, break_long_words=False)
    all_chunks.extend(chunks)
    chunk_text_to_original.extend([original_context] * len(chunks))

# Encode all chunks
batch_size = 32
context_embeddings = []

for i in tqdm(range(0, len(all_chunks), batch_size)):
    batch = all_chunks[i:i + batch_size]
    emb = model.encode(batch, convert_to_numpy=True, show_progress_bar=False)
    context_embeddings.append(emb)

context_embeddings = np.vstack(context_embeddings)

# Build and save FAISS index
dimension = context_embeddings.shape[1]
index = faiss.IndexFlatL2(dimension)
index.add(context_embeddings)
faiss.write_index(index, "chunks_test_faiss_store/context_index.faiss")

# Save chunk mapping
pd.DataFrame({
    "chunk": all_chunks,
    "original_context": chunk_text_to_original
}).to_csv("chunks_test_faiss_store/context_mapping.csv", index=False)

print(" chunked FAISS index and mapping saved to 'chunks_test_faiss_store/'")

  from .autonotebook import tqdm as notebook_tqdm


Chunking and encoding contexts...


100%|██████████| 2400/2400 [00:01<00:00, 1786.28it/s]
100%|██████████| 902/902 [00:43<00:00, 20.80it/s]


 chunked FAISS index and mapping saved to 'chunks_test_faiss_store/'


In [2]:
from datasets import load_dataset
import pandas as pd
import numpy as np
import faiss
from tqdm import tqdm
import torch
from sentence_transformers import SentenceTransformer

# Load fine-tuned or base model
model = SentenceTransformer("/Users/likhit/Desktop/Projects/RAG/1fineeeeminilm_proj512_only_dense")

# Load test dataset
test_data = load_dataset("neural-bridge/rag-dataset-12000", split="test")
test_questions = test_data["question"]
test_contexts = test_data["context"]

# Filter invalid entries
filtered = [(q, c) for q, c in zip(test_questions, test_contexts)
            if isinstance(q, str) and q.strip() and isinstance(c, str) and c.strip()]
test_questions, test_contexts = zip(*filtered)

# Load FAISS index and chunk-to-original mapping
index = faiss.read_index("/Users/likhit/Desktop/Projects/RAG/model/retriever/testmodel/chunks_test_faiss_store/context_index.faiss")
context_df = pd.read_csv("/Users/likhit/Desktop/Projects/RAG/model/retriever/testmodel/chunks_test_faiss_store/context_mapping.csv")

# context_mapping.csv must have at least two columns: "chunk", "original_context"
if "original_context" not in context_df.columns:
    raise ValueError("context_mapping.csv must include a column named 'original_context'.")

# Compute evaluation metrics
def compute_metrics(k=5):
    correct = 0
    precision_scores = []
    recall_scores = []
    total = len(test_questions)

    for question, true_context in tqdm(zip(test_questions, test_contexts), total=total):
        query_vec = model.encode([question], convert_to_numpy=True)
        query_vec = np.expand_dims(query_vec, axis=0) if query_vec.ndim == 1 else query_vec

        _, indices = index.search(query_vec, k)
        retrieved_originals = [
            context_df.iloc[i]["original_context"].strip() for i in indices[0]
        ]

        match = [1 if true_context.strip() == r else 0 for r in retrieved_originals]

        precision = sum(match) / k
        recall = 1.0 if any(match) else 0.0
        accuracy = 1 if any(match) else 0

        precision_scores.append(precision)
        recall_scores.append(recall)
        correct += accuracy

    return {
        f"Precision@{k}": round(np.mean(precision_scores), 4),
        f"Recall@{k}": round(np.mean(recall_scores), 4),
        f"Accuracy@{k}": round(correct / total, 4)
    }

# Run evaluation
metrics = compute_metrics(k=3)
print("📊 Evaluation Metrics:", metrics)

100%|██████████| 2399/2399 [00:36<00:00, 66.23it/s]

📊 Evaluation Metrics: {'Precision@3': 0.7648, 'Recall@3': 0.9387, 'Accuracy@3': 0.9387}





In [6]:
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("/Users/likhit/Desktop/Projects/RAG/lora_finetune_retriever")

from transformers import AutoModel
from peft import get_peft_model, LoraConfig, PeftModel, TaskType
import torch

# Use same config as training
lora_config = LoraConfig(
    task_type=TaskType.FEATURE_EXTRACTION,
    r=8,
    lora_alpha=16,
    lora_dropout=0.1,
    bias="none"
)

# Load base model
base_model = AutoModel.from_pretrained("sentence-transformers/all-MiniLM-L6-v2")

# Apply LoRA
model = get_peft_model(base_model, lora_config)

# Load LoRA weights
model.load_adapter("/Users/likhit/Desktop/Projects/RAG/lora_finetune_retriever" , "LORA")

# Set to eval mode
model.eval()

PeftModelForFeatureExtraction(
  (base_model): LoraModel(
    (model): BertModel(
      (embeddings): BertEmbeddings(
        (word_embeddings): Embedding(30522, 384, padding_idx=0)
        (position_embeddings): Embedding(512, 384)
        (token_type_embeddings): Embedding(2, 384)
        (LayerNorm): LayerNorm((384,), eps=1e-12, elementwise_affine=True)
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (encoder): BertEncoder(
        (layer): ModuleList(
          (0-5): 6 x BertLayer(
            (attention): BertAttention(
              (self): BertSdpaSelfAttention(
                (query): lora.Linear(
                  (base_layer): Linear(in_features=384, out_features=384, bias=True)
                  (lora_dropout): ModuleDict(
                    (default): Dropout(p=0.1, inplace=False)
                    (LORA): Dropout(p=0.1, inplace=False)
                  )
                  (lora_A): ModuleDict(
                    (default): Linear(in_features=384, out_f

In [10]:

device = torch.device("mps" if torch.backends.mps.is_available() else "cuda" if torch.cuda.is_available() else "cpu")
print(f"✅ Using device: {device}")
model.to(device)

def compute_metrics2(k=5):
    correct = 0
    precision_scores = []
    recall_scores = []
    total = len(test_questions)

    for question, true_context in tqdm(zip(test_questions, test_contexts), total=total):
        # Encode with LoRA model
        with torch.no_grad():
            tokens = tokenizer(question, return_tensors="pt", padding=True, truncation=True).to(device)
            q_emb = model(**tokens).last_hidden_state[:, 0].cpu().numpy()

        # Ensure 2D shape for FAISS
        q_emb = np.expand_dims(q_emb, axis=0) if q_emb.ndim == 1 else q_emb

        _, indices = index.search(q_emb, k)

        retrieved = [
            context_df.iloc[i]["original_context"].strip() for i in indices[0]
        ]

        match = [1 if true_context.strip() == r else 0 for r in retrieved]

        precision = sum(match) / k
        recall = 1.0 if any(match) else 0.0
        accuracy = 1 if any(match) else 0

        precision_scores.append(precision)
        recall_scores.append(recall)
        correct += accuracy

    return {
        f"Precision@{k}": round(np.mean(precision_scores), 4),
        f"Recall@{k}": round(np.mean(recall_scores), 4),
        f"Accuracy@{k}": round(correct / total, 4)
    }

# Run evaluation
metrics = compute_metrics2(k=5)
print("📊 Evaluation Metrics:", metrics)

✅ Using device: mps


100%|██████████| 2399/2399 [00:32<00:00, 73.59it/s]

📊 Evaluation Metrics: {'Precision@5': 0.5542, 'Recall@5': 0.9033, 'Accuracy@5': 0.9033}



