In [None]:
!pip install torch
!pip install transformers wikipedia-api


In [None]:
import wikipediaapi
from transformers import BertForQuestionAnswering, BertTokenizer
import torch


# Checkpoint identifier shared by the tokenizer and the question-answering model
model_name = "bert-large-uncased-whole-word-masking-finetuned-squad"

# Load the tokenizer first, then the pre-trained BERT QA model, both from the
# same checkpoint so that token ids line up with the model's vocabulary.
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertForQuestionAnswering.from_pretrained(model_name)


def fetch_wikipedia_text(title, language='en'):
    """Return the plain-text content of a Wikipedia page, or None on failure.

    Args:
        title: Title of the Wikipedia page to fetch.
        language: Wikipedia language edition to query. Defaults to 'en',
            which matches the previously hard-coded behaviour.

    Returns:
        The page's ``text`` attribute if the page exists; ``None`` if the
        page does not exist or if any exception is raised while fetching.
        In both failure cases a message is printed instead of raising.
    """
    try:
        wiki = wikipediaapi.Wikipedia(
            user_agent='42 Wikipedia Scraper/1.0',  # User agent must be specified
            language=language,
            extract_format=wikipediaapi.ExtractFormat.WIKI,
        )
        page = wiki.page(title)

        # Guard clause: report a missing page and bail out early.
        if not page.exists():
            print(f"Error: Wikipedia page '{title}' not found.")
            return None

        return page.text
    except Exception as e:
        # Deliberately best-effort: callers rely on None (not an exception)
        # to detect that no article text is available.
        print(f"Error fetching Wikipedia page: {e}")
        return None

In [66]:

def answer_question(question, text):
    """Extract the answer to ``question`` from the passage ``text``.

    The question and passage are encoded as one sequence pair with the
    module-level ``tokenizer`` and scored by the module-level ``model``.
    The answer is the token span running from the highest-scoring start
    position to the highest-scoring end position.

    Note: the tokenizer is called with ``truncation=True``, so only the part
    of the input that fits within the tokenizer's length limit is considered;
    an answer located further into a long passage cannot be found.

    Args:
        question: The question to answer.
        text: The passage to search. May be ``None`` or blank (for example
            when fetching the article failed).

    Returns:
        The decoded answer string with ``[CLS]``/``[SEP]`` markers removed.
        An empty string is returned when ``text`` is missing/blank or when
        the predicted end position precedes the predicted start position.
    """
    # Without a passage the tokenizer would encode the question on its own and
    # the model would "answer" with a fragment of the question itself.
    if not text or not text.strip():
        return ""

    # Tokenize input question and text as a sequence pair
    inputs = tokenizer(question, text, add_special_tokens=True, return_tensors="pt", truncation=True)

    # Inference only: no gradients needed
    with torch.no_grad():
        outputs = model(**inputs)

    # Positions with the highest start / end scores, as plain Python ints so
    # they can be used as slice bounds.
    start_index = int(torch.argmax(outputs.start_logits))
    end_index = int(torch.argmax(outputs.end_logits)) + 1  # +1 to include the last token

    # The two argmaxes are independent, so the span can come out inverted.
    if end_index <= start_index:
        return ""

    # Single-example batch: take row 0 rather than squeeze(), which would
    # collapse a length-1 sequence to a scalar.
    tokens = tokenizer.convert_ids_to_tokens(inputs["input_ids"][0].tolist())

    # Combine the selected tokens to form the answer
    answer = tokenizer.convert_tokens_to_string(tokens[start_index:end_index])

    # Post-process answer (remove special tokens and additional spaces)
    return answer.replace("[CLS]", "").replace("[SEP]", "").strip()

In [69]:
# Fetch Wikipedia text (replace the title below with your desired topic)
article_title = "Phrases from The Hitchhiker's Guide to the Galaxy"
article_text = fetch_wikipedia_text(article_title)

# Example usage
question = "What is the Answer to the Ultimate Question of Life, the Universe, and Everything?"

# fetch_wikipedia_text returns None when the page is missing or the request
# failed; do not feed that into the model as if it were a passage.
if article_text is None:
    print(f"No article text available for '{article_title}'; skipping question answering.")
else:
    answer = answer_question(question, article_text)
    print(">===============================<")
    print("Question:", question)
    print("Answer:", answer)
    print(">===============================<")

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Question: What is the Answer to the Ultimate Question of Life, the Universe, and Everything?
Answer: 42
