In [1]:
import nltk
from nltk.corpus import wordnet

nltk.download('wordnet')

def expand_query(query):
    expanded_terms = set(query.split())
    for word in query.split():
        for syn in wordnet.synsets(word):
            for lemma in syn.lemmas():
                expanded_terms.add(lemma.name())
    return ' '.join(expanded_terms)

query = "data science"
expanded_query = expand_query(query)
print("Expanded Query:", expanded_query)


[nltk_data] Downloading package wordnet to /home/ubuntu/nltk_data...


Expanded Query: skill information scientific_discipline science datum data_point data


In [4]:
from rank_bm25 import BM25Okapi
from transformers import DPRQuestionEncoder, DPRQuestionEncoderTokenizer, DPRContextEncoder, DPRContextEncoderTokenizer
import faiss
import numpy as np

# Sample documents
documents = ["Data science is a field that uses scientific methods", 
             "Machine learning is a subfield of artificial intelligence", 
             "Statistics is a branch of mathematics"]

# BM25 retrieval
tokenized_docs = [doc.split() for doc in documents]
bm25 = BM25Okapi(tokenized_docs)
bm25_scores = bm25.get_scores(expanded_query.split())

# DPR retrieval
question_encoder = DPRQuestionEncoder.from_pretrained("facebook/dpr-question_encoder-single-nq-base")
question_tokenizer = DPRQuestionEncoderTokenizer.from_pretrained("facebook/dpr-question_encoder-single-nq-base")
context_encoder = DPRContextEncoder.from_pretrained("facebook/dpr-ctx_encoder-single-nq-base")
context_tokenizer = DPRContextEncoderTokenizer.from_pretrained("facebook/dpr-ctx_encoder-single-nq-base")

# Encode query
query_embedding = question_encoder(question_tokenizer(expanded_query, return_tensors="pt")['input_ids']).pooler_output.detach().numpy()

# Encode documents
context_embeddings = []
for doc in documents:
    inputs = context_tokenizer(doc, return_tensors="pt")['input_ids']
    outputs = context_encoder(inputs).pooler_output.detach().numpy()
    context_embeddings.append(outputs)
context_embeddings = np.vstack(context_embeddings)

# Create FAISS index
index = faiss.IndexFlatIP(query_embedding.shape[1])
index.add(context_embeddings)

# Retrieve using FAISS
_, dpr_scores = index.search(query_embedding, len(documents))

# Combine scores
combined_scores = bm25_scores + dpr_scores.flatten()

# Retrieve top documents
top_k = 3
top_indices = np.argsort(combined_scores)[::-1][:top_k]
retrieved_docs = [documents[i] for i in top_indices]

print("Retrieved Documents:", retrieved_docs)


ImportError: 
DPRQuestionEncoder requires the PyTorch library but it was not found in your environment. Checkout the instructions on the
installation page: https://pytorch.org/get-started/locally/ and follow the ones that match your environment.
Please note that you may need to restart your runtime after installation.
