# Extractive Question Answering using 🤗 Transformers

From: https://huggingface.co/transformers/task_summary.html#extractive-question-answering

In [None]:
from transformers import pipeline

In [None]:
# Passage that the question-answering pipeline below is queried against.
context = r"""
Extractive Question Answering is the task of extracting an answer from a text given a question. An example of a
question answering dataset is the SQuAD dataset, which is entirely based on that task. If you would like to fine-tune
a model on a SQuAD task, you may leverage the examples/question-answering/run_squad.py script.
"""

# Build the question-answering pipeline. No model is named here, so the
# choice of checkpoint is left to the library.
nlp = pipeline(task="question-answering")

In [None]:
# Ask the pipeline each question against the same context and report the
# predicted answer text, its score, and its start/end offsets.
for query in (
    "What is extractive question answering?",
    "What is a good example of a question answering dataset?",
):
    result = nlp(question=query, context=context)
    print(f"Answer: '{result['answer']}', score: {round(result['score'], 4)}, start: {result['start']}, end: {result['end']}")

In [None]:
from transformers import AutoTokenizer, AutoModelForQuestionAnswering
import torch

In [None]:
# One checkpoint name shared by the tokenizer and the model so the two
# cannot drift apart.
checkpoint = "bert-large-uncased-whole-word-masking-finetuned-squad"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForQuestionAnswering.from_pretrained(checkpoint, return_dict=True)

# Passage the answers are extracted from.
text = r"""
🤗 Transformers (formerly known as pytorch-transformers and pytorch-pretrained-bert) provides general-purpose
architectures (BERT, GPT-2, RoBERTa, XLM, DistilBert, XLNet…) for Natural Language Understanding (NLU) and Natural
Language Generation (NLG) with over 32+ pretrained models in 100+ languages and deep interoperability between
TensorFlow 2.0 and PyTorch.
"""

# Questions to ask about the passage above.
questions = [
    "How many pretrained models are available in 🤗 Transformers?",
    "What does 🤗 Transformers provide?",
    "🤗 Transformers provides interoperability between which frameworks?",
]

def get_answer(question, text, tokenizer, model):
    """Extract the most likely answer span for ``question`` from ``text``.

    The question and the text are encoded together as a single sequence, the
    model is run once, and the tokens from the highest-scoring start position
    through the highest-scoring end position (inclusive) are decoded back
    into a string. Only the first sequence of the encoded batch is used.

    Args:
        question: Question to answer.
        text: Passage the answer is extracted from.
        tokenizer: Callable tokenizer. Its result must be a mapping that
            contains an ``"input_ids"`` tensor, and it must provide
            ``convert_ids_to_tokens`` and ``convert_tokens_to_string``.
        model: Callable model accepting the tokenizer output as keyword
            arguments; its output must expose ``start_logits`` and
            ``end_logits``.

    Returns:
        Whatever ``tokenizer.convert_tokens_to_string`` produces for the
        selected tokens. When the predicted end position precedes the
        predicted start position the selected token list is empty.
    """
    inputs = tokenizer(question, text, add_special_tokens=True, return_tensors="pt")
    input_ids = inputs["input_ids"].tolist()[0]

    # Pure inference: no gradients are needed, so skip autograd bookkeeping.
    with torch.no_grad():
        model_output = model(**inputs)

    # argmax without ``dim`` indexes the flattened logits; int() turns the
    # 0-d tensors into plain Python ints usable as slice bounds.
    answer_start = int(torch.argmax(model_output.start_logits))
    # +1 because the slice end is exclusive and the end token must be kept.
    answer_end = int(torch.argmax(model_output.end_logits)) + 1

    answer_tokens = tokenizer.convert_ids_to_tokens(input_ids[answer_start:answer_end])
    return tokenizer.convert_tokens_to_string(answer_tokens)


# Answer every question against the current passage and print each
# question followed by its extracted answer.
for question in questions:
    answer = get_answer(question=question, text=text, tokenizer=tokenizer, model=model)
    print(f"Question: {question}")
    print(f"Answer: {answer}")

In [None]:
# New passage: replaces the previous `text` for the questions below.
text = r"""
But that doesn't make it a good fit for every data processing need. 
And when you outgrow Excel, a really good option for a next step is Python and 
the data science tech stack: Pandas, Jupyter, and friends.
"""

# Questions to ask about the passage above.
questions = [
    "What is the data science tech stack?",
    "What is good option for a next step when you outgrow Excel?",
]

# Print each question followed by the answer extracted from the passage.
for question in questions:
    answer = get_answer(question=question, text=text, tokenizer=tokenizer, model=model)
    print(f"Question: {question}")
    print(f"Answer: {answer}")

In [None]:
# New passage: replaces the previous `text` for the questions below.
text = r"""
Note this is only works around the way this bug crashes NumPy 
(technically, in OpenBLAS which is shipped with NumPy), 
and may not fix all your problems related to this bug, 
Microsoft’s help is needed to do that.
"""

# Questions to ask about the passage above.
questions = [
    "What is shipped with NumPy?",
    "Whose help is needed to fix all your problems?",
]

# Print each question followed by the answer extracted from the passage.
for question in questions:
    answer = get_answer(question=question, text=text, tokenizer=tokenizer, model=model)
    print(f"Question: {question}")
    print(f"Answer: {answer}")