In [1]:
!pip install datasets scikit-learn nltk rank_bm25
import nltk
nltk.download('punkt')
nltk.download('stopwords')

from datasets import load_dataset
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from rank_bm25 import BM25Okapi
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import string

Collecting rank_bm25
  Downloading rank_bm25-0.2.2-py3-none-any.whl.metadata (3.2 kB)
Collecting fsspec<=2024.12.0,>=2023.1.0 (from fsspec[http]<=2024.12.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.12.0-py3-none-any.whl.metadata (11 kB)
Downloading rank_bm25-0.2.2-py3-none-any.whl (8.6 kB)
Downloading fsspec-2024.12.0-py3-none-any.whl (183 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m183.9/183.9 kB[0m [31m5.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: fsspec, rank_bm25
  Attempting uninstall: fsspec
    Found existing installation: fsspec 2025.3.2
    Uninstalling fsspec-2025.3.2:
      Successfully uninstalled fsspec-2025.3.2
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
gcsfs 2024.10.0 requires fsspec==2024.10.0, but you have fsspec 2024.12.0 which is incompatible.
torch 2.5.1+cu124 requires nvidia-c

[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
dataset = load_dataset("squad", split="train")
documents = [example['context'] for example in dataset][:1000]  # Use subset for faster processing

README.md:   0%|          | 0.00/7.62k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/14.5M [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/1.82M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/87599 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/10570 [00:00<?, ? examples/s]

In [3]:
def preprocess(text):
    text = text.lower()
    text = text.translate(str.maketrans('', '', string.punctuation))
    tokens = word_tokenize(text)
    stop_words = set(stopwords.words('english'))
    return [token for token in tokens if token not in stop_words and token.isalpha()]

processed_docs = [preprocess(doc) for doc in documents]

In [4]:
# Convert tokenized docs back to strings
text_docs = [' '.join(doc) for doc in processed_docs]

tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(text_docs)

def tfidf_retrieve(query, top_k=5):
    processed_query = ' '.join(preprocess(query))
    query_vec = tfidf_vectorizer.transform([processed_query])
    cos_sim = cosine_similarity(query_vec, tfidf_matrix).flatten()
    top_indices = cos_sim.argsort()[-top_k:][::-1]
    return [documents[i] for i in top_indices]

In [5]:
bm25 = BM25Okapi(processed_docs)

def bm25_retrieve(query, top_k=5):
    tokenized_query = preprocess(query)
    scores = bm25.get_scores(tokenized_query)
    top_indices = sorted(range(len(scores)), key=lambda i: scores[i], reverse=True)[:top_k]
    return [documents[i] for i in top_indices]

In [6]:
query = "Who discovered penicillin?"

print("TF-IDF Results:")
for result in tfidf_retrieve(query):
    print(result[:150] + "...")

print("\nBM25 Results:")
for result in bm25_retrieve(query):
    print(result[:150] + "...")

TF-IDF Results:
Beyoncé attended St. Mary's Elementary School in Fredericksburg, Texas, where she enrolled in dance classes. Her singing talent was discovered when da...
Beyoncé attended St. Mary's Elementary School in Fredericksburg, Texas, where she enrolled in dance classes. Her singing talent was discovered when da...
Beyoncé attended St. Mary's Elementary School in Fredericksburg, Texas, where she enrolled in dance classes. Her singing talent was discovered when da...
Beyoncé attended St. Mary's Elementary School in Fredericksburg, Texas, where she enrolled in dance classes. Her singing talent was discovered when da...
Beyoncé attended St. Mary's Elementary School in Fredericksburg, Texas, where she enrolled in dance classes. Her singing talent was discovered when da...

BM25 Results:
Beyoncé attended St. Mary's Elementary School in Fredericksburg, Texas, where she enrolled in dance classes. Her singing talent was discovered when da...
Beyoncé attended St. Mary's Elementary School

In [7]:
#Integration with LLM
from transformers import pipeline

qa_pipeline = pipeline("question-answering", model="bert-large-uncased-whole-word-masking-finetuned-squad")

def answer_with_llm(query):
    contexts = bm25_retrieve(query)  # Get relevant contexts
    results = []
    for context in contexts:
        result = qa_pipeline(question=query, context=context)
        results.append((result['answer'], result['score']))
    return sorted(results, key=lambda x: x[1], reverse=True)

llm_answers = answer_with_llm(query)
print("\nLLM Answers from Retrieved Contexts:")
for ans, score in llm_answers:
    print(f"Answer: {ans} (Confidence: {score:.2f})")

2025-04-26 13:03:32.884583: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1745672613.126947      31 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1745672613.192385      31 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


config.json:   0%|          | 0.00/443 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/1.34G [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-large-uncased-whole-word-masking-finetuned-squad were not used when initializing BertForQuestionAnswering: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Device set to use cuda:0



LLM Answers from Retrieved Contexts:
Answer: Darlette Johnson (Confidence: 0.26)
Answer: Darlette Johnson (Confidence: 0.26)
Answer: Darlette Johnson (Confidence: 0.26)
Answer: Darlette Johnson (Confidence: 0.26)
Answer: Darlette Johnson (Confidence: 0.26)
