[![Open in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/Joykw1/NLP_RAG_project/blob/main/Code/Single_Passage_Retrieval_RAG.ipynb)

In [None]:
# Downloads

!pip install datasets rank_bm25 bitsandbytes

In [None]:
# Imports
from datasets import load_dataset
from rank_bm25 import BM25Okapi
import random
from tqdm import tqdm

In [None]:
# Load the tokenizer and the 8-bit quantized causal LM (adapted from tutorial 6).

import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig

# Hugging Face model id shared by the tokenizer and the model.
checkpoint = "Qwen/Qwen2.5-1.5B-Instruct"

# 8-bit weights via bitsandbytes to reduce VRAM usage on a small GPU.
bnb_config = BitsAndBytesConfig(load_in_8bit=True)

tokenizer = AutoTokenizer.from_pretrained(checkpoint)

# Place the quantized model on the GPU.
model = AutoModelForCausalLM.from_pretrained(
    checkpoint,
    device_map="cuda",
    quantization_config=bnb_config,
)

In [None]:
# Load one language configuration of the XQuAD dataset.
#
# Previously the "en", "de" and "ru" configurations were loaded one after the
# other into the same variable, so only the last one ("ru") was ever used and
# the first two loads were wasted work. Pick the language explicitly instead;
# the default keeps the configuration that was effectively in use.
XQUAD_LANGUAGE = "ru"  # other configurations used in this project: "en", "de"

dataset = load_dataset("xquad", f"xquad.{XQUAD_LANGUAGE}")

# Print some info about the dataset
dataset

In [None]:
# Single-passage-retrieval RAG evaluation (adapted from tutorial 6).
#
# For every validation question: retrieve the top_k passages with BM25, put
# them in a chat prompt, let the model answer, and count the answer as correct
# when the first reference answer appears verbatim in the generated text.

SEED = 0                       # fixes the passage order so runs are repeatable
top_k = 2                      # number of retrieved passages placed in the prompt
MAX_NEW_TOKENS = 64            # generation budget for the answer itself
SAMPLE_INDICES = (1, 50, 99)   # questions whose prompt/response are printed in full

validation = dataset['validation']

# De-duplicate the passages. dict.fromkeys keeps first-seen order, whereas the
# iteration order of a set of strings can change between kernel sessions.
contextCol = list(dict.fromkeys(validation['context']))
random.Random(SEED).shuffle(contextCol)  # randomize order of the list, reproducibly

questions = validation['question']
answers = validation['answers']

# Tokenize documents and build the BM25 index once: it does not depend on the
# query, so rebuilding it inside the loop only repeated the same work.
tokenized_contexts = [doc.split() for doc in contextCol]
bm25 = BM25Okapi(tokenized_contexts)

correct_count = 0
num_questions = len(questions)

# Test single passage retrieval on every validation question
for i in tqdm(range(num_questions)):
  query = questions[i]
  gold_answer = answers[i]['text'][0]

  # Retrieve top_k passages using BM25
  tokenized_query = query.split()
  top_docs = bm25.get_top_n(tokenized_query, contextCol, n=top_k)

  # Apply chat template
  context = "\n".join(top_docs)
  chat = [
    {"role": "system", "content": "You are a helpful assistant"},
    {"role": "user", "content": f"Context: {context}\nQuery: {query}"},
  ]

  # add_generation_prompt=True appends the tokens that open the assistant turn,
  # so the model writes an answer instead of continuing the user message.
  prompt = tokenizer.apply_chat_template(chat, tokenize=False, add_generation_prompt=True)

  show_example = i in SAMPLE_INDICES
  if show_example: print("\nPrompt:", prompt)

  # Generate response.
  # The template already inserted its special tokens, so do not add them again.
  # Calling the tokenizer also yields the attention mask that generate() expects.
  inputs = tokenizer(prompt, return_tensors='pt', add_special_tokens=False).to(model.device)
  input_ids = inputs['input_ids']
  # max_new_tokens bounds only the generated part; without it generate() falls
  # back to a small default total length that the prompt alone can exceed.
  # Greedy decoding (do_sample=False) keeps the score repeatable, since the
  # checkpoint's default generation config may enable sampling.
  output_ids = model.generate(**inputs, max_new_tokens=MAX_NEW_TOKENS, do_sample=False)

  # generate() returns prompt + continuation. Decode only the continuation:
  # otherwise the retrieved context inside the prompt would be searched too and
  # any successful retrieval would count as a correct answer.
  response = tokenizer.decode(output_ids[0][input_ids.shape[1]:], skip_special_tokens=True)

  if gold_answer in response:
    correct_count += 1
  else:
    print(f"\nWrong answer, correct answer was: {gold_answer},\nResponse:, {response}")
  if show_example: print("\nResponse:", response)

print()

print(correct_count, " correct out of ", num_questions)