In [None]:
!pip install torch
!pip install transformers wikipedia-api

In [82]:
import wikipediaapi
from transformers import BertForQuestionAnswering, BertTokenizer
import torch

model_name = "bert-large-uncased-whole-word-masking-finetuned-squad"
model = BertForQuestionAnswering.from_pretrained(model_name)
tokenizer = BertTokenizer.from_pretrained(model_name)

def fetch_wikipedia_text(title):
    try:
        wiki = wikipediaapi.Wikipedia(
            user_agent='42 Wikipedia Scraper/1.0',
            language='en',
            extract_format=wikipediaapi.ExtractFormat.WIKI
        )
        page = wiki.page(title)
        if page.exists():
            return page.text
        else:
            print(f"Error: Wikipedia page '{title}' not found.")
            return None
    except Exception as e:
        print(f"Error fetching Wikipedia page: {e}")
        return None

Some weights of the model checkpoint at bert-large-uncased-whole-word-masking-finetuned-squad were not used when initializing BertForQuestionAnswering: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [84]:
def answer_question(question, text):
    inputs = tokenizer(question, text, add_special_tokens=True, return_tensors="pt", truncation=True)
    with torch.no_grad():
        outputs = model(**inputs)
    start_scores = outputs.start_logits
    end_scores = outputs.end_logits
    answer_start = torch.argmax(start_scores)
    answer_end = torch.argmax(end_scores) + 1
    tokens = tokenizer.convert_ids_to_tokens(inputs["input_ids"].squeeze().tolist())
    answer = tokenizer.convert_tokens_to_string(tokens[answer_start:answer_end])
    answer = answer.replace("[CLS]", "").replace("[SEP]", "").strip()
    return answer

In [88]:
train_article_title = "Phrases from The Hitchhiker's Guide to the Galaxy"
article_text = fetch_wikipedia_text(train_article_title)

question = "waht is The Answer to the Ultimate Question of Life, the Universe, and Everything?"
answer = answer_question(question, article_text)


print("---------------------------------------------------")
print("Question:", question)
print("Answer:", answer)
print("---------------------------------------------------")

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


---------------------------------------------------
Question: waht is The Answer to the Ultimate Question of Life, the Universe, and Everything?
Answer: 42
---------------------------------------------------
