In [33]:
pip install transformers datasets evaluate

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.


In [34]:
pip install torch tqdm scikit-learn

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.


In [9]:
from datasets import load_dataset

data_files = {
    "train": "train-v2.0.json",
    "validation": "dev-v2.0.json"
}

dataset = load_dataset("json", data_files=data_files, field="data")


Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

In [10]:
def flatten_squad(dataset_split):
    context_list = []
    question_list = []
    answer_list = []
    id_list = []

    for article in dataset_split:
        for paragraph in article['paragraphs']:
            context = paragraph['context']
            for qa in paragraph['qas']:
                question = qa['question']
                qid = qa['id']

                if qa['is_impossible']:
                    answer = {"text": "", "answer_start": 0}
                else:
                    answer = qa['answers'][0]  # Take first answer

                context_list.append(context)
                question_list.append(question)
                answer_list.append(answer)
                id_list.append(qid)

    return {
        "context": context_list,
        "question": question_list,
        "answers": answer_list,
        "id": id_list
    }

train_data = flatten_squad(dataset["train"])
val_data = flatten_squad(dataset["validation"])


In [11]:
from transformers import BertTokenizerFast

tokenizer = BertTokenizerFast.from_pretrained("bert-base-uncased")


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [12]:
def preprocess_data(contexts, questions, answers):
    inputs = tokenizer(
        questions,
        contexts,
        truncation="only_second",
        max_length=384,
        stride=128,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length"
    )

    sample_mapping = inputs.pop("overflow_to_sample_mapping")
    offset_mapping = inputs.pop("offset_mapping")

    start_positions = []
    end_positions = []

    for i, offsets in enumerate(offset_mapping):
        input_ids = inputs["input_ids"][i]
        cls_index = input_ids.index(tokenizer.cls_token_id)

        sequence_ids = inputs.sequence_ids(i)
        sample_idx = sample_mapping[i]  # this is the index to the original example
        answer = answers[sample_idx]
        start_char = answer["answer_start"]
        end_char = start_char + len(answer["text"])

        # Find start and end token indices
        token_start_index = 0
        while sequence_ids[token_start_index] != 1:
            token_start_index += 1

        token_end_index = len(input_ids) - 1
        while sequence_ids[token_end_index] != 1:
            token_end_index -= 1

        # Handle no-answer case
        if answer["text"] == "":
            start_positions.append(cls_index)
            end_positions.append(cls_index)
        else:
            if not (offsets[token_start_index][0] <= start_char and offsets[token_end_index][1] >= end_char):
                start_positions.append(cls_index)
                end_positions.append(cls_index)
            else:
                for idx in range(token_start_index, token_end_index + 1):
                    if offsets[idx][0] <= start_char and offsets[idx][1] >= start_char:
                        start_pos = idx
                    if offsets[idx][0] <= end_char and offsets[idx][1] >= end_char:
                        end_pos = idx
                start_positions.append(start_pos)
                end_positions.append(end_pos)

    inputs["start_positions"] = start_positions
    inputs["end_positions"] = end_positions

    return inputs


In [13]:
train_encodings = preprocess_data(train_data["context"], train_data["question"], train_data["answers"])
val_encodings = preprocess_data(val_data["context"], val_data["question"], val_data["answers"])


In [14]:
import torch

class QADataset(torch.utils.data.Dataset):
    def __init__(self, encodings):
        self.encodings = encodings

    def __getitem__(self, idx):
        return {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}

    def __len__(self):
        return len(self.encodings["input_ids"])

train_dataset = QADataset(train_encodings)
val_dataset = QADataset(val_encodings)


In [15]:
from transformers import BertForQuestionAnswering

model = BertForQuestionAnswering.from_pretrained("bert-base-uncased")


model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['qa_outputs.weight', 'qa_outputs.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [16]:
from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir="./bert-qa-checkpoints",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=3e-5,
    num_train_epochs=2,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    weight_decay=0.01,
    save_total_limit=2,
    load_best_model_at_end=True,
    logging_dir="./logs",
    logging_steps=100
)


In [21]:
import evaluate
import numpy as np

squad_metric = evaluate.load("squad_v2")

def compute_metrics(pred):
    return squad_metric.compute(
        predictions=pred.predictions,
        references=pred.label_ids
    )


Downloading builder script: 0.00B [00:00, ?B/s]

Downloading extra modules: 0.00B [00:00, ?B/s]

In [22]:
from transformers import Trainer

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
)


In [20]:
trainer.train()


You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss
1,0.9256,1.004801
2,0.5668,1.214311


TrainOutput(global_step=32940, training_loss=0.9286204977330832, metrics={'train_runtime': 21299.2532, 'train_samples_per_second': 12.372, 'train_steps_per_second': 1.547, 'total_flos': 5.164033933049242e+16, 'train_loss': 0.9286204977330832, 'epoch': 2.0})

In [1]:
from transformers import BertForQuestionAnswering, BertTokenizerFast, pipeline

model_path = "./bert-qa-checkpoints/checkpoint-32940"

model = BertForQuestionAnswering.from_pretrained(model_path)
tokenizer = BertTokenizerFast.from_pretrained(model_path)


In [4]:
from transformers import TrainingArguments, Trainer

training_args = TrainingArguments(
    output_dir="./tmp-eval",    # Can be any temp folder
    per_device_eval_batch_size=8
)

trainer = Trainer(
    model=model,
    args=training_args,
    tokenizer=tokenizer
)


In [23]:
import collections
from tqdm import tqdm
import numpy as np

# Get raw predictions (start_logits, end_logits)
raw_predictions = trainer.predict(val_dataset)
start_logits, end_logits = raw_predictions.predictions


You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


In [29]:
def postprocess_qa_predictions(
    examples,
    features,
    raw_predictions,
    n_best_size=20,
    max_answer_length=30
):
    all_start_logits, all_end_logits = raw_predictions
    example_id_to_index = {k: i for i, k in enumerate(examples["id"])}
    features_per_example = collections.defaultdict(list)

    for i, sample_idx in enumerate(features["overflow_to_sample_mapping"]):
        features_per_example[sample_idx].append(i)

    predictions = collections.OrderedDict()

    for example_index, example_id in enumerate(examples["id"]):
        context = examples["context"][example_index]
        feature_indices = features_per_example[example_index]

        min_null_score = None
        valid_answers = []

        for feature_index in feature_indices:
            start_logits = all_start_logits[feature_index]
            end_logits = all_end_logits[feature_index]
            offset_mapping = features["offset_mapping"][feature_index]
            input_ids = features["input_ids"][feature_index]

            cls_index = input_ids.index(tokenizer.cls_token_id)
            feature_null_score = start_logits[cls_index] + end_logits[cls_index]
            if min_null_score is None or feature_null_score < min_null_score:
                min_null_score = feature_null_score

            start_indexes = np.argsort(start_logits)[-1: -n_best_size - 1: -1].tolist()
            end_indexes = np.argsort(end_logits)[-1: -n_best_size - 1: -1].tolist()

            for start_index in start_indexes:
                for end_index in end_indexes:
                    if (
                        start_index >= len(offset_mapping)
                        or end_index >= len(offset_mapping)
                        or offset_mapping[start_index] is None
                        or offset_mapping[end_index] is None
                    ):
                        continue
                    if end_index < start_index or end_index - start_index + 1 > max_answer_length:
                        continue

                    start_char = offset_mapping[start_index][0]
                    end_char = offset_mapping[end_index][1]
                    valid_answers.append({
                        "score": start_logits[start_index] + end_logits[end_index],
                        "text": context[start_char: end_char]
                    })

        if valid_answers:
            best_answer = sorted(valid_answers, key=lambda x: x["score"], reverse=True)[0]
        else:
            best_answer = {"text": ""}

        if min_null_score is not None and ("score" not in best_answer or min_null_score < best_answer["score"]):
            predictions[example_id] = ""
        else:
            predictions[example_id] = best_answer["text"]


    return predictions


In [30]:
val_encodings = tokenizer(
    val_data["question"],
    val_data["context"],
    truncation="only_second",
    max_length=384,
    stride=128,
    return_overflowing_tokens=True,
    return_offsets_mapping=True,
    padding="max_length"
)

val_encodings["overflow_to_sample_mapping"] = val_encodings.pop("overflow_to_sample_mapping")
val_encodings["offset_mapping"] = val_encodings.pop("offset_mapping")
val_encodings["input_ids"] = val_encodings["input_ids"]


In [31]:
predictions = postprocess_qa_predictions(val_data, val_encodings, (start_logits, end_logits))

# Save to predictions.json
import json
with open("predictions.json", "w") as f:
    json.dump(predictions, f)


In [36]:
!python evaluate-v2.0.py dev-v2.0.json predictions.json


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


{
  "exact": 50.07159100480081,
  "f1": 50.07159100480081,
  "total": 11873,
  "HasAns_exact": 0.0,
  "HasAns_f1": 0.0,
  "HasAns_total": 5928,
  "NoAns_exact": 100.0,
  "NoAns_f1": 100.0,
  "NoAns_total": 5945
}


In [47]:
from transformers import BertForQuestionAnswering, BertTokenizerFast, pipeline

model_path = "./bert-qa-checkpoints/checkpoint-32940"

model = BertForQuestionAnswering.from_pretrained(model_path).to("cuda")
tokenizer = BertTokenizerFast.from_pretrained(model_path)

qa_pipeline = pipeline("question-answering", model=model, tokenizer=tokenizer, device=0)


In [48]:
def get_answer(question, context):
    return qa_pipeline({'question': question, 'context': context})['answer']

context = "The Eiffel Tower is located in Paris."
question = "Where is the Eiffel Tower?"

print("Answer:", get_answer(question, context))


Answer: Paris.


In [50]:
question = "Who is Harry Potter?"

print("Answer:", get_answer(question, context))

Answer: Eiffel Tower


In [51]:
context = "Quick, name a book series more popular than Harry Potter. ... Maybe Game of Thrones? Truth is, the popularity of the Potter-verse is pretty much unmatched. But who's surprised? The story has it all: lovable heroes, terrifying villains, frickin' magic... And to top it all off, enough twists, turns and itty-bitty details to rival Hogwarts itself. No wonder, then, that the series is as popular today as it was back when The Philosopher's Stone (that's Sorcerer's Stone for the Americans) first introduced us to mugwumps and muggles more than 20 years ago. We've been obsessed with the wizarding world of Harry P for two decades now, and counting. But how much do you know about Harry & Co? Do you know what inspired J.K. Rowling to make Quidditch? Or which hex Snape invented himself? Or how about the lifespan of a Basilisk? The answers to these questions and more are collected together here, in one handy, exhaustive guide. Look no further, Potter Pal. Here's everything you ever need to know about the Harry Potter Universe. Harry Potter Universe  Facts 1. Try, Try, And Try AgainWhy is it that all the greats are rejected at first? Between Walt Disney, Oprah Winfrey, even freaking Elvis... are we trying to not be entertained? Case in point: J.K. Rowling’s first Harry Potter manuscript was rejected by 12 publishing houses, before being picked up by Bloomsbury. It got so bad for a while, she even tried submitting her magical magnum opus under a pen name:  Robert Galbraith. After that tactic failed, she almost gave up. She even complained to a friend, saying, They don’t even want me in a beard! But look at her now! Those publishers must be feeling more regret than the wizard who tried kissing a Blast-Ended Skrewt."

In [53]:
question = "Who wrote Harry Potter?"

print("Answer:", get_answer(question, context))

Answer: J.K. Rowling’
