In [1]:
!pip install transformers datasets evaluate sacrebleu

Collecting evaluate
  Downloading evaluate-0.4.0-py3-none-any.whl (81 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m81.4/81.4 kB[0m [31m2.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting sacrebleu
  Downloading sacrebleu-2.3.1-py3-none-any.whl (118 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m118.9/118.9 kB[0m [31m9.4 MB/s[0m eta [36m0:00:00[0m
Collecting portalocker
  Downloading portalocker-2.7.0-py2.py3-none-any.whl (15 kB)
Installing collected packages: portalocker, sacrebleu, evaluate
Successfully installed evaluate-0.4.0 portalocker-2.7.0 sacrebleu-2.3.1
[0m

In [2]:
import torch
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

## Data Loading

In [4]:
from datasets import load_dataset
dataset = load_dataset("tydiqa",'primary_task')

  0%|          | 0/2 [00:00<?, ?it/s]

In [5]:
dataset

DatasetDict({
    train: Dataset({
        features: ['passage_answer_candidates', 'question_text', 'document_title', 'language', 'annotations', 'document_plaintext', 'document_url'],
        num_rows: 166916
    })
    validation: Dataset({
        features: ['passage_answer_candidates', 'question_text', 'document_title', 'language', 'annotations', 'document_plaintext', 'document_url'],
        num_rows: 18670
    })
})

In [6]:
# large_ds = dataset['train']
small_ds = dataset['validation']

# large_ds.add_column('id', list(range(1, len(large_ds)+1)))
small_ds.add_column('id', list(range(1, len(small_ds)+1)))

Dataset({
    features: ['passage_answer_candidates', 'question_text', 'document_title', 'language', 'annotations', 'document_plaintext', 'document_url', 'id'],
    num_rows: 18670
})

In [7]:
small_ds[0].keys()  # 'passage_answer_candidates', 'question_text', 'document_title', 'language', 'annotations', 'document_plaintext', 'document_url'
small_ds[0]['question_text']  # question
small_ds[0]['passage_answer_candidates']  # list of start + list of end
small_ds[0]['language']  # language
small_ds[0]['document_plaintext']  # context
;

''

## Preprocess

In [8]:
from transformers import CanineTokenizer
tokenizer = CanineTokenizer.from_pretrained("google/canine-c")

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/657 [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/892 [00:00<?, ?B/s]

Using unk_token, but it is not set yet.
Using unk_token, but it is not set yet.
Using unk_token, but it is not set yet.
Using unk_token, but it is not set yet.
Using unk_token, but it is not set yet.
Using unk_token, but it is not set yet.
Using unk_token, but it is not set yet.
Using unk_token, but it is not set yet.


In [9]:
def preprocess_function(examples):
    max_length = 2048
    questions = [question.strip()[:max_length//4] for question in examples['question_text']]
    inputs = tokenizer(
        questions,
        examples['document_plaintext'],
        max_length=max_length, truncation='only_second', padding='max_length'
    )

    answers = examples["passage_answer_candidates"]
    start_positions = []
    end_positions = []
    
    for i, answer in enumerate(answers):
        start_pos, end_pos = 0, 0
        for start, end in zip(answer['plaintext_start_byte'], answer['plaintext_end_byte']):
            offset = len(questions[i])+2
            start += offset
            end += offset
            if end <= max_length:
                start_pos = start
                end_pos = end
                break
        start_positions.append(start_pos)
        end_positions.append(end_pos)

    inputs["start_positions"] = start_positions
    inputs["end_positions"] = end_positions
    return inputs

In [10]:
removed_columns = ['passage_answer_candidates', 'question_text', 'document_title', 'language', 'annotations', 'document_plaintext', 'document_url']
# tokenized_large_ds = large_ds.map(preprocess_function, batched=True, remove_columns=removed_columns)
tokenized_small_ds = small_ds.map(preprocess_function, batched=True, remove_columns=removed_columns)

  0%|          | 0/19 [00:00<?, ?ba/s]

In [11]:
from torch.utils.data import DataLoader

split_ds = tokenized_small_ds.train_test_split(0.5)
train_ds = split_ds['train']
validtest_ds = split_ds['test']
splitsplit_ds = validtest_ds.train_test_split(0.25)
valid_ds = splitsplit_ds['train']
test_ds = splitsplit_ds['test']

train_ds.set_format("torch")
valid_ds.set_format("torch")
test_ds.set_format("torch")

train_dataloader = DataLoader(train_ds, shuffle=True, batch_size=4)
valid_dataloader = DataLoader(valid_ds, shuffle=False, batch_size=4)
test_dataloader = DataLoader(test_ds, shuffle=False, batch_size=4)

del split_ds
del splitsplit_ds

## Model

In [12]:
from transformers import CanineForQuestionAnswering

model = CanineForQuestionAnswering.from_pretrained("google/canine-c")
model.to(device);



Downloading (…)lve/main/config.json:   0%|          | 0.00/698 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/529M [00:00<?, ?B/s]

Some weights of CanineForQuestionAnswering were not initialized from the model checkpoint at google/canine-c and are newly initialized: ['qa_outputs.weight', 'qa_outputs.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


## Training

In [13]:
import os
os.environ["WANDB_DISABLED"] = "true"

In [14]:
# from tqdm.auto import tqdm
# from torch.optim import AdamW
# from transformers import get_scheduler

# num_epochs = 3

# optimizer = AdamW(model.parameters(), lr=5e-5)

# num_training_steps = num_epochs * len(test_dataloader)
# lr_scheduler = get_scheduler(
#     name="linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_training_steps
# )

# progress_bar = tqdm(range(num_training_steps))

# model.train()
# for epoch in range(num_epochs):
#     for batch in test_dataloader:
#         batch = {k: v.to(device) for k, v in batch.items()}
#         outputs = model(**batch)
#         loss = outputs.loss
#         loss.backward()

#         optimizer.step()
#         lr_scheduler.step()
#         optimizer.zero_grad()
#         progress_bar.update(1)

In [15]:
from transformers import TrainingArguments, Trainer

training_args = TrainingArguments(
    output_dir="/kaggle/working/",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=4,
    weight_decay=0.01
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=valid_ds,
    eval_dataset=test_ds,
    tokenizer=tokenizer,
)

trainer.train()

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Epoch,Training Loss,Validation Loss
1,3.9359,3.064935
2,2.8358,2.651223
3,2.4874,2.49968
4,2.321,2.431093




TrainOutput(global_step=3504, training_loss=2.8081854848556866, metrics={'train_runtime': 7103.4279, 'train_samples_per_second': 3.942, 'train_steps_per_second': 0.493, 'total_flos': 3.678788794957824e+16, 'train_loss': 2.8081854848556866, 'epoch': 4.0})

## Evaluate

In [16]:
import evaluate

metric = evaluate.load("squad")

Downloading builder script:   0%|          | 0.00/4.53k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/3.32k [00:00<?, ?B/s]

In [17]:
predictions = trainer.predict(test_ds)
start_logits, end_logits = predictions.predictions



In [19]:
import torch.nn.functional as F
predicted_starts = F.softmax(torch.Tensor(start_logits), dim=1)
predicted_starts = torch.argmax(predicted_starts, dim=1)

predicted_ends = F.softmax(torch.Tensor(end_logits), dim=1)
predicted_ends = torch.argmax(predicted_ends, dim=1)

predicted_answers = []
theoritical_answers = []
for i in range(len(test_ds)):
    predicted_start = predicted_starts[i]
    predicted_end = predicted_ends[i]
    
    theoritical_start = test_ds['start_positions'][i]
    theoritical_end = test_ds['end_positions'][i]
    
    beg_context = torch.argmin(1-test_ds['token_type_ids'][i])+1
    
    predicted_text = tokenizer.decode(test_ds['input_ids'][i][beg_context+predicted_start:beg_context+predicted_end])
    theoritical_text = tokenizer.decode(test_ds['input_ids'][i][beg_context+theoritical_start:beg_context+theoritical_end])
    
    pred = {'id': str(i), 'prediction_text': predicted_text}
    theory = {'id': str(i), 'answers': {'text': [theoritical_text], 'answer_start': [theoritical_start]}}
    
    predicted_answers.append(pred)
    theoritical_answers.append(theory)

In [24]:
metric.compute(predictions=predicted_answers, references=theoritical_answers)

{'exact_match': 19.70865467009426, 'f1': 72.90260511394887}

__________________________