In [None]:
!pip install -U langchain-community

In [None]:
!pip install wikipedia

In [None]:
!pip install faiss-cpu

In [46]:
# Package imports
from langchain.chains import RetrievalQA
from langchain.prompts import PromptTemplate
from langchain.vectorstores import FAISS
from langchain.document_loaders import WikipediaLoader
from langchain.embeddings import HuggingFaceEmbeddings
from transformers import AutoTokenizer, AutoModelForQuestionAnswering, pipeline
from transformers import logging as transformers_logging
from sentence_transformers import SentenceTransformer, util
import torch

# Suppress unnecessary warning messages from Hugging Face: raise the
# transformers log level so that only errors are reported
transformers_logging.set_verbosity_error()

# Dynamic Keyword Extraction
def extract_keywords(query):
    """
    Extracts named entities (keywords) from a query using a Named Entity
    Recognition (NER) model.

    Keeps entities whose group is ORGANIZATION (ORG), LOCATION (LOC),
    PERSON (PER), or DATE (DATE).

    Args:
        query: The natural-language question to scan for entities.

    Returns:
        A list with the text of each matching entity, in the order the
        NER pipeline reported them. Empty when the query is blank or no
        entity of a wanted group is found.
    """
    # Nothing to tag in an empty/blank query, so skip loading the model
    if not query or not query.strip():
        return []

    # aggregation_strategy="simple" merges word-piece tokens into whole
    # entities and reports the label WITHOUT the B-/I- prefix under the
    # 'entity_group' key. Without aggregation the labels look like
    # "B-ORG" / "I-PER", never match the set below, and this function
    # would always return an empty list.
    keyword_extractor = pipeline("ner", model="dslim/bert-base-NER",
                                 aggregation_strategy="simple")

    # Extract (already grouped) named entities from the query
    entities = keyword_extractor(query)

    # NOTE(review): the dslim/bert-base-NER model card appears to list only
    # LOC/ORG/PER/MISC labels - confirm whether DATE can ever match.
    wanted_groups = {"ORG", "LOC", "PER", "DATE"}

    # Filter and return entities matching the specified types
    return [entity['word'] for entity in entities
            if entity['entity_group'] in wanted_groups]

# Refined Retrieval Process
def retrieve_relevant_context(vector_store, query, num_docs=5):
    """
    Fetches the text most relevant to a query from a vector store.

    The query is extended with the keywords returned by extract_keywords
    before searching, and the page contents of the matching documents are
    joined with single spaces.

    Args:
        vector_store: Object exposing as_retriever(search_kwargs=...).
        query: The question to search for.
        num_docs: Number of documents requested from the retriever ("k").

    Returns:
        One string containing the page content of every retrieved document.
    """
    # Build the search string: original query followed by its keywords
    keyword_suffix = " ".join(extract_keywords(query))
    augmented_query = query + " " + keyword_suffix

    # Ask the store for a retriever limited to num_docs results
    top_k_retriever = vector_store.as_retriever(search_kwargs={"k": num_docs})
    matches = top_k_retriever.get_relevant_documents(augmented_query)

    # Flatten the hits into a single block of text
    return " ".join(match.page_content for match in matches)

# Chunking with Relevance Filtering
def chunk_and_score_context(context, query, max_length=512, top_k=3):
    """
    Splits the retrieved context into smaller chunks and scores each chunk's
    relevance to the query.
    Uses SentenceTransformer embeddings and cosine similarity for scoring.

    Args:
        context: The text to split. Chunks are fixed-width character
            windows, so a chunk may begin or end in the middle of a word.
        query: The question each chunk is compared against.
        max_length: Chunk width in characters; must be positive.
        top_k: Maximum number of chunks to return; must be non-negative.
            Defaults to 3.

    Returns:
        A list of (chunk, score) tuples sorted by score in descending
        order, at most top_k long. Empty when context is empty or
        top_k is 0.

    Raises:
        ValueError: If max_length is not positive or top_k is negative.
    """
    # Validate up front: a negative max_length would otherwise make range()
    # yield nothing and silently hide the caller's mistake
    if max_length <= 0:
        raise ValueError("max_length must be a positive integer")
    if top_k < 0:
        raise ValueError("top_k must be non-negative")

    # Nothing to score: return before paying for the model load
    if not context or top_k == 0:
        return []

    # Load a pre-trained model for computing embeddings and similarity
    model = SentenceTransformer('all-MiniLM-L6-v2')

    # Split the context into chunks of max_length
    chunks = [context[i:i + max_length] for i in range(0, len(context), max_length)]

    # Compute the query embedding
    query_embedding = model.encode(query, convert_to_tensor=True)

    # Encode every chunk in one batched call instead of one call per chunk
    chunk_embeddings = model.encode(chunks, convert_to_tensor=True)

    # cos_sim returns a (1, num_chunks) matrix; row 0 holds the query's
    # similarity to each chunk
    scores = util.cos_sim(query_embedding, chunk_embeddings)[0].tolist()
    scored_chunks = list(zip(chunks, scores))

    # Return the top_k chunks sorted by relevance in descending order
    return sorted(scored_chunks, key=lambda x: x[1], reverse=True)[:top_k]

# Handle Ambiguity with Confidence and Evidence
def answer_query_with_sources(query, model, retriever):
    """
    Produces an answer for the query together with its supporting evidence.

    Context is fetched through retrieve_relevant_context, ranked with
    chunk_and_score_context, and the ranked chunks are joined into the
    context handed to the question-answering callable.

    Args:
        query: The question to answer.
        model: Callable invoked as model(question=..., context=...) whose
            result provides 'answer' and 'score' entries.
        retriever: Passed as the first argument to
            retrieve_relevant_context.

    Returns:
        A dict with the keys "answer", "confidence" and "sources", where
        "sources" is the ranked list of (chunk, score) tuples.
    """
    # Fetch the raw context, then keep only its best-scoring chunks
    raw_context = retrieve_relevant_context(retriever, query)
    ranked_chunks = chunk_and_score_context(raw_context, query)

    # Drop the scores and stitch the chunk texts back together
    passages = [text for text, _score in ranked_chunks]
    qa_context = " ".join(passages)

    # Run the question-answering model over the stitched context
    qa_output = model(question=query, context=qa_context)

    response = {}
    response["answer"] = qa_output['answer']
    response["confidence"] = qa_output['score']
    response["sources"] = ranked_chunks
    return response

# Initialize Components
def initialize_model_and_retriever(query):
    """
    Initializes the question-answering model, retrieves Wikipedia documents,
    and creates a FAISS vector store for retrieval.

    Args:
        query: Search string passed to WikipediaLoader.

    Returns:
        A (qa_pipeline, vector_store) tuple. vector_store is None when the
        Wikipedia load fails or yields no documents; the pipeline is
        returned in either case.
    """
    # Load a pre-trained question-answering model and tokenizer
    model_id = "deepset/bert-large-uncased-whole-word-masking-squad2"
    tokenizer = AutoTokenizer.from_pretrained(model_id)
    model = AutoModelForQuestionAnswering.from_pretrained(model_id)

    # Use the first GPU when CUDA is available and fall back to CPU (-1)
    # otherwise; a hard-coded device=0 fails on machines without a GPU
    device = 0 if torch.cuda.is_available() else -1

    # Set up a question-answering pipeline
    qa_pipeline = pipeline("question-answering", model=model,
                           tokenizer=tokenizer, device=device)

    # Load Wikipedia documents relevant to the query
    loader = WikipediaLoader(query=query, lang="en")
    try:
        documents = loader.load()
    except Exception as e:
        # Deliberately broad: any loader failure is reported and then
        # handled as "no documents" so the caller can exit cleanly
        print(f"[ERROR] WikipediaLoader failed with error: {e}")
        documents = []

    if not documents:
        print("[WARNING] No documents retrieved from Wikipedia.")
        return qa_pipeline, None

    # Create a FAISS vector store using document embeddings
    embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
    vector_store = FAISS.from_documents(documents, embeddings)

    return qa_pipeline, vector_store

# Main Function
def main():
    """
    Runs the question-answering demo end to end.

    Uses a fixed query, builds the model and vector store for it, and
    prints the answer, its confidence and the supporting source chunks.
    Prints an error and stops when no vector store could be built.
    """
    # MODEL QUERY ####################################
    question = "When did the Battle of Waterloo occur?"
    print(f"[INFO] Searching Wikipedia for: {question}")
    ##################################################

    # Build the QA pipeline and the vector store for this question
    components = initialize_model_and_retriever(question)
    qa_model, store = components

    if store is not None:
        # Answer the question using the retrieved material
        print("[INFO] Running question-answering...")
        outcome = answer_query_with_sources(question, qa_model, store)

        # Report the answer, its confidence and each supporting chunk
        answer_text = outcome['answer']
        confidence = outcome['confidence']
        print(f"[INFO] Answer: {answer_text}")
        print(f"[INFO] Confidence: {confidence}")
        print("[INFO] Sources:")
        for scored_chunk in outcome['sources']:
            print(scored_chunk)
    else:
        print("[ERROR] Retrieval failed. Unable to proceed with the question-answering task.")

# Run the pipeline only when this file is executed as a script, not when
# it is imported as a module
if __name__ == "__main__":
    main()

[INFO] Searching Wikipedia for: When did the Battle of Waterloo occur?
[INFO] Running question-answering...
[INFO] Answer: Sunday 18 June 1815
[INFO] Confidence: 0.6014962196350098
[INFO] Sources:
('piness, when she once more embraced the married state it was to marry the new landlord tavern; from which time it obtained the title it now bears.\n\n\n== See also ==\nList of Waterloo Battlefield locations\n\n\n== Notes ==\n\n\n== References ==\nGillespie-Payne, Jonathan (2003), Waterloo: In the Footsteps of the Commanders, Pen and Sword, p. 166, ISBN 978-1-4738-2060-9\nRomberg, J. B. (1820), New picture of Brussels, p. 185 The Battle of Waterloo was fought on Sunday 18 June 1815, near Waterloo (at that time in the Un', 0.6367213726043701)
("metres (9.3 mi) south of Brussels, and about 2 kilometres (1.2 mi) from the town of Waterloo. The site of the battlefield today is dominated by the monument of the Lion's Mound, a large artificial hill constructed from earth taken from the battlefield 