In [2]:
!pip install transformers datasets evaluate



In [8]:
from datasets import load_dataset
from transformers import AutoTokenizer

In [11]:
data = load_dataset("squad", split="train")

Found cached dataset squad (/Users/ngjinyuan/.cache/huggingface/datasets/squad/plain_text/1.0.0/d6ec3ceb99ca480ce37cdd35555d6cb2511d223b9150cce08a837ef62ffea453)


In [12]:
data = data.train_test_split(test_size=0.2)

In [13]:
data["train"][0]

{'id': '5729710a6aef051400154ed2',
 'title': 'Zinc',
 'context': 'Several dozen radioisotopes have been characterized. 65Zn, which has a half-life of 243.66 days, is the most long-lived radioisotope, followed by 72Zn with a half-life of 46.5 hours. Zinc has 10 nuclear isomers. 69mZn has the longest half-life, 13.76 h. The superscript m indicates a metastable isotope. The nucleus of a metastable isotope is in an excited state and will return to the ground state by emitting a photon in the form of a gamma ray. 61Zn has three excited states and 73Zn has two. The isotopes 65Zn, 71Zn, 77Zn and 78Zn each have only one excited state.',
 'question': 'How many excited states does 73Zn have?',
 'answers': {'text': ['two'], 'answer_start': [491]}}

In [14]:
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

In [15]:
def preprocess_function(examples):
    questions = [q.strip() for q in examples["question"]]
    inputs = tokenizer(
        questions,
        examples["context"],
        max_length=384,
        truncation="only_second",
        return_offsets_mapping=True,
        padding="max_length",
    )
    
    offset_mapping = inputs.pop("offset_mapping")
    answers = examples["answers"]
    start_positions = []
    end_positions = []

    for i, offset in enumerate(offset_mapping):
        answer = answers[i]
        start_char = answer["answer_start"][0]
        end_char = answer["answer_start"][0] + len(answer["text"][0])
        sequence_ids = inputs.sequence_ids(i)

        # Find the start and end of the context
        idx = 0
        while sequence_ids[idx] != 1:
            idx += 1
        context_start = idx
        while sequence_ids[idx] == 1:
            idx += 1
        context_end = idx - 1

        # If the answer is not fully inside the context, label it (0, 0)
        if offset[context_start][0] > end_char or offset[context_end][1] < start_char:
            start_positions.append(0)
            end_positions.append(0)
        else:
            # Otherwise it's the start and end token positions
            idx = context_start
            while idx <= context_end and offset[idx][0] <= start_char:
                idx += 1
            start_positions.append(idx - 1)

            idx = context_end
            while idx >= context_start and offset[idx][1] >= end_char:
                idx -= 1
            end_positions.append(idx + 1)

    inputs["start_positions"] = start_positions
    inputs["end_positions"] = end_positions
    
    return inputs

In [16]:
tokenized_squad = data.map(preprocess_function, batched=True, remove_columns=data["train"].column_names)

Map:   0%|          | 0/70079 [00:00<?, ? examples/s]

Map:   0%|          | 0/17520 [00:00<?, ? examples/s]

In [14]:
from transformers import DefaultDataCollator
data_collator = DefaultDataCollator()

In [None]:
from transformers import AutoModelForQuestionAnswering, TrainingArguments, Trainer
model = AutoModelForQuestionAnswering.from_pretrained("distilbert-base-uncased")

Some weights of DistilBertForQuestionAnswering were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
pip install transformers[torch]

Note: you may need to restart the kernel to use updated packages.


In [13]:
training_args = TrainingArguments(
    output_dir="model",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_squad["train"],
    eval_dataset=tokenized_squad["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
)

NameError: name 'TrainingArguments' is not defined

In [None]:
trainer.train()

                                                 
 33%|███▎      | 250/750 [01:45<03:04,  2.72it/s]

{'eval_loss': 2.747321367263794, 'eval_runtime': 7.3285, 'eval_samples_per_second': 136.453, 'eval_steps_per_second': 8.597, 'epoch': 1.0}


 67%|██████▋   | 500/750 [03:17<01:31,  2.73it/s]

{'loss': 2.8973, 'learning_rate': 6.666666666666667e-06, 'epoch': 2.0}


                                                 
 67%|██████▋   | 500/750 [03:27<01:31,  2.73it/s]

{'eval_loss': 1.9689843654632568, 'eval_runtime': 7.738, 'eval_samples_per_second': 129.233, 'eval_steps_per_second': 8.142, 'epoch': 2.0}


                                                 
100%|██████████| 750/750 [05:06<00:00,  2.44it/s]

{'eval_loss': 1.8223029375076294, 'eval_runtime': 7.4578, 'eval_samples_per_second': 134.087, 'eval_steps_per_second': 8.447, 'epoch': 3.0}
{'train_runtime': 306.9964, 'train_samples_per_second': 39.088, 'train_steps_per_second': 2.443, 'train_loss': 2.4586016031901043, 'epoch': 3.0}





TrainOutput(global_step=750, training_loss=2.4586016031901043, metrics={'train_runtime': 306.9964, 'train_samples_per_second': 39.088, 'train_steps_per_second': 2.443, 'train_loss': 2.4586016031901043, 'epoch': 3.0})

In [None]:
question = "How many programming languages does BLOOM support?"
context = "BLOOM has 176 billion parameters and can generate text in 46 languages natural languages and 13 programming languages."

In [None]:
from transformers import pipeline

question_answerer = pipeline("question-answering", model="model/checkpoint-500/")
question_answerer(question=question, context=context)

{'score': 0.1773025244474411,
 'start': 58,
 'end': 95,
 'answer': '46 languages natural languages and 13'}