# In [None]:
import wikipedia
import numpy as np
from transformers import AutoTokenizer, AutoModelForQuestionAnswering, pipeline
from sentence_transformers import SentenceTransformer
import faiss
import sys

def get_wikipedia_content(topic):
    """
    Looks up a Wikipedia page by title and returns its text.

    Every failure is reported on stdout and signalled to the caller with a
    ``None`` return value instead of a raised exception.

    Args:
        topic (str): Title of the Wikipedia page to look up.

    Returns:
        str or None: The page's ``content`` attribute, or None when the page
        is missing, the title is ambiguous, or any other error occurs.
    """
    content = None  # stays None unless the lookup succeeds
    try:
        content = wikipedia.page(topic).content
    except wikipedia.exceptions.PageError:
        print("Page not found. Please check the topic name.")
    except wikipedia.exceptions.DisambiguationError as ambiguity:
        print(f"Ambiguous topic. Please be more specific. Options: {ambiguity.options}")
    except Exception as error:
        # Last-resort boundary: report the error and fall through to None.
        print(f"Unexpected error occurred: {error}")
    return content

def split_text(text, tokenizer, chunk_size=256, chunk_overlap=20):
    """
    Splits long text into overlapping token chunks for processing.

    The text is tokenized once, then consecutive windows of at most
    ``chunk_size`` tokens are converted back to strings. Each window starts
    ``chunk_size - chunk_overlap`` tokens after the previous one, so
    neighbouring chunks share ``chunk_overlap`` tokens.

    Args:
        text (str): The full input text to be chunked.
        tokenizer: Object providing ``tokenize(text)`` and
            ``convert_tokens_to_string(tokens)`` (e.g. a Hugging Face
            tokenizer).
        chunk_size (int): Maximum number of tokens per chunk. Must be > 0.
        chunk_overlap (int): Number of tokens shared by consecutive chunks.
            Must satisfy ``0 <= chunk_overlap < chunk_size``.

    Returns:
        List[str]: The chunked text segments, in document order. Empty when
        the text produces no tokens.

    Raises:
        ValueError: If ``chunk_size`` or ``chunk_overlap`` is out of range.
    """
    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be positive, got {chunk_size}")
    if not 0 <= chunk_overlap < chunk_size:
        # An overlap >= chunk_size would keep the window from ever advancing
        # (an infinite loop); a negative one would silently skip tokens.
        raise ValueError(
            f"chunk_overlap must be in [0, chunk_size), got {chunk_overlap}"
        )

    tokens = tokenizer.tokenize(text)
    total = len(tokens)
    stride = chunk_size - chunk_overlap  # guaranteed >= 1 by the checks above

    chunks = []
    for start in range(0, total, stride):
        end = min(start + chunk_size, total)
        chunks.append(tokenizer.convert_tokens_to_string(tokens[start:end]))
        if end == total:
            # The last window already reaches the end of the text; stop so we
            # do not emit trailing chunks fully contained in this one.
            break
    return chunks

def embed_chunks(chunks, model):
    """
    Turns text chunks into embedding vectors with the given model.

    Args:
        chunks (List[str]): Text chunks to encode.
        model (SentenceTransformer): Model whose ``encode`` method is called.

    Returns:
        np.ndarray: Whatever ``model.encode`` returns when asked for NumPy
        output (presumably one row per chunk).
    """
    vectors = model.encode(chunks, convert_to_numpy=True)
    return vectors

def build_faiss_index(embeddings):
    """
    Builds a FAISS flat L2 index over a matrix of embeddings.

    Args:
        embeddings (np.ndarray): 2-D array with one embedding vector per row.

    Returns:
        faiss.IndexFlatL2: Index containing every row of ``embeddings``.

    Raises:
        ValueError: If ``embeddings`` is not a 2-D array with at least one
            column (for example, the result of encoding an empty chunk list).
    """
    # FAISS operates on C-contiguous float32 matrices; coerce once here so
    # callers may pass float64 or non-contiguous arrays.
    vectors = np.ascontiguousarray(embeddings, dtype=np.float32)
    if vectors.ndim != 2 or vectors.shape[1] == 0:
        raise ValueError(
            "embeddings must be a 2-D array of shape (n_vectors, dimension); "
            f"got shape {vectors.shape}"
        )

    dimension = vectors.shape[1]  # Dimension of embedding vectors
    index = faiss.IndexFlatL2(dimension)  # L2 distance (Euclidean)
    index.add(vectors)  # Add embeddings to the index
    return index

def retrieve_top_k_chunks(query, chunks, embedding_model, index, k=3):
    """
    Retrieves the top-k most relevant text chunks for a given query.

    Args:
        query (str): User's question.
        chunks (List[str]): Original text chunks, in the same order as the
            vectors that were added to ``index``.
        embedding_model (SentenceTransformer): Embedding model for the query.
        index (faiss.Index): FAISS index built from document chunks.
        k (int): Maximum number of similar chunks to retrieve.

    Returns:
        List[str]: Up to ``k`` chunks in the order returned by the index
        search. Fewer than ``k`` items (possibly none) are returned when
        fewer chunks are available.
    """
    # Never request more neighbours than there are chunks: FAISS pads missing
    # results with id -1, and chunks[-1] would silently return the last chunk.
    k = min(k, len(chunks))
    if k <= 0:
        return []

    query_embedding = np.asarray(
        embedding_model.encode([query], convert_to_numpy=True),
        dtype=np.float32,
    )
    _, indices = index.search(query_embedding, k)

    # Defensive filter in case the index holds fewer vectors than `chunks`.
    return [chunks[i] for i in indices[0] if 0 <= i < len(chunks)]

def answer_question(question, context, qa_pipeline):
    """
    Runs the QA pipeline on a question/context pair and returns the answer text.

    Args:
        question (str): The question to answer.
        context (str): Text the answer should be extracted from.
        qa_pipeline (Pipeline): Callable accepting ``question`` and ``context``
            keyword arguments and returning a mapping with an ``'answer'`` key.

    Returns:
        str: The value stored under ``'answer'`` in the pipeline's result.
    """
    prediction = qa_pipeline(question=question, context=context)
    answer_text = prediction['answer']
    return answer_text

def main():
    """
    Runs one interactive session: fetch an article, index it, answer a question.

    Reads the topic and the question from stdin, prints progress and the
    final answer to stdout, and exits with status 1 when no article text
    could be retrieved.
    """
    # Fetch the article for the topic the user asks about.
    topic = input("Enter a topic to learn about: ")
    article_text = get_wikipedia_content(topic)
    if not article_text:
        # Nothing to index, so there is no point loading any model.
        print("Failed to retrieve content. Exiting.")
        sys.exit(1)

    # The same checkpoint supplies the chunking tokenizer and the encoder.
    encoder_name = "sentence-transformers/all-mpnet-base-v2"
    chunk_tokenizer = AutoTokenizer.from_pretrained(encoder_name)
    encoder = SentenceTransformer(encoder_name)

    # Chunk the article, embed every chunk, and index the vectors.
    passages = split_text(article_text, chunk_tokenizer)
    print(f"[INFO] Text split into {len(passages)} chunks.")
    passage_vectors = embed_chunks(passages, encoder)
    index = build_faiss_index(passage_vectors)

    # Retrieve the passages closest to the user's question and merge them
    # into a single context string for the reader model.
    question = input("Ask a question about the topic: ")
    top_passages = retrieve_top_k_chunks(question, passages, encoder, index, k=3)
    context = " ".join(top_passages)

    # Build the extractive QA reader.
    reader_name = "deepset/roberta-base-squad2"
    reader_model = AutoModelForQuestionAnswering.from_pretrained(reader_name)
    reader_tokenizer = AutoTokenizer.from_pretrained(reader_name)
    reader = pipeline(
        "question-answering",
        model=reader_model,
        tokenizer=reader_tokenizer,
    )

    # Extract and show the answer.
    answer = answer_question(question, context, reader)
    print(f"\nAnswer: {answer}")

# Start the interactive session only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()