# **Setup**

### Imports and Installation

In [1]:
!pip install transformers

import tensorflow as tf
import torch
import os
import timeit
from transformers.data.processors.squad import SquadResult, SquadV2Processor, squad_convert_examples_to_features
from transformers import BertConfig, BertTokenizer, BertForQuestionAnswering, get_linear_schedule_with_warmup, AdamW
from transformers.data.metrics.squad_metrics import compute_predictions_logits, squad_evaluate
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler
from tqdm import tqdm, trange

Collecting transformers
[?25l  Downloading https://files.pythonhosted.org/packages/ee/fc/bd726a15ab2c66dc09306689d04da07a3770dad724f0883f0a4bfb745087/transformers-2.4.1-py3-none-any.whl (475kB)
[K     |████████████████████████████████| 481kB 3.4MB/s 
Collecting tokenizers==0.0.11
[?25l  Downloading https://files.pythonhosted.org/packages/5e/36/7af38d572c935f8e0462ec7b4f7a46d73a2b3b1a938f50a5e8132d5b2dc5/tokenizers-0.0.11-cp36-cp36m-manylinux1_x86_64.whl (3.1MB)
[K     |████████████████████████████████| 3.1MB 55.4MB/s 
[?25hCollecting sacremoses
[?25l  Downloading https://files.pythonhosted.org/packages/a6/b4/7a41d630547a4afd58143597d5a49e07bfd4c42914d8335b2a5657efc14b/sacremoses-0.0.38.tar.gz (860kB)
[K     |████████████████████████████████| 870kB 45.6MB/s 
Collecting sentencepiece
[?25l  Downloading https://files.pythonhosted.org/packages/74/f4/2d5214cbf13d06e7cb2c20d84115ca25b53ea76fa1f0ade0e3c9749de214/sentencepiece-0.1.85-cp36-cp36m-manylinux1_x86_64.whl (1.0MB)
[K     |██

### GPU configuration

In [2]:
# Ask TensorFlow for the name of the GPU device it can see.
device_name = tf.test.gpu_device_name()

# Stop right away unless the name is exactly the expected first-GPU name.
if device_name != '/device:GPU:0':
    raise SystemError('GPU device not found')

print('Found GPU at: {}'.format(device_name))

Found GPU at: /device:GPU:0


In [3]:
# Choose the PyTorch device: CPU when CUDA is unavailable, otherwise the GPU.
if not torch.cuda.is_available():
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")
else:
    device = torch.device("cuda")

    # Report how many GPUs are visible and the name of device 0.
    print('There are %d GPU(s) available.' % torch.cuda.device_count())
    print('We will use the GPU:', torch.cuda.get_device_name(0))

There are 1 GPU(s) available.
We will use the GPU: Tesla T4


# **Loading & Processing the SQuAD 2.0 Dataset**


## Downloading the SQuAD files

In [4]:
#Download the SQUAD train and dev dataset
!mkdir squaddir
!wget https://rajpurkar.github.io/SQuAD-explorer/dataset/train-v2.0.json -O squaddir/train-v2.0.json
!wget https://rajpurkar.github.io/SQuAD-explorer/dataset/dev-v2.0.json -O squaddir/dev-v2.0.json
!wget https://worksheets.codalab.org/rest/bundles/0x6b567e1cf2e041ec80d7098f031c5c9e/contents/blob/ -O squaddir/evaluate-v2.0.py

--2020-02-06 16:26:39--  https://rajpurkar.github.io/SQuAD-explorer/dataset/train-v2.0.json
Resolving rajpurkar.github.io (rajpurkar.github.io)... 185.199.108.153, 185.199.110.153, 185.199.111.153, ...
Connecting to rajpurkar.github.io (rajpurkar.github.io)|185.199.108.153|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 42123633 (40M) [application/json]
Saving to: ‘squaddir/train-v2.0.json’


2020-02-06 16:26:40 (70.6 MB/s) - ‘squaddir/train-v2.0.json’ saved [42123633/42123633]

--2020-02-06 16:26:41--  https://rajpurkar.github.io/SQuAD-explorer/dataset/dev-v2.0.json
Resolving rajpurkar.github.io (rajpurkar.github.io)... 185.199.108.153, 185.199.110.153, 185.199.111.153, ...
Connecting to rajpurkar.github.io (rajpurkar.github.io)|185.199.108.153|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 4370528 (4.2M) [application/json]
Saving to: ‘squaddir/dev-v2.0.json’


2020-02-06 16:26:41 (15.0 MB/s) - ‘squaddir/dev-v2.0.json’ saved [437052

## BERT Model Loading

In [0]:
# PARAMETERS

# --- Pretrained checkpoint and output location ---
pretrained_weights = 'bert-base-uncased'   # name given to every from_pretrained() call below
output_dir = "./finetuned_squad/"          # where predictions and the saved model go

# --- Feature conversion (arguments of squad_convert_examples_to_features) ---
max_seq_length = 384
max_query_length = 64
doc_stride = 128
is_training = True
return_dataset = "pt"

# --- Optimisation (read by train()) ---
num_train_epochs = 4
train_batch_size = 10
gradient_accumulation_steps = 1
learning_rate = 3e-5
weight_decay = 0.0
adam_epsilon = 1e-8
warmup_steps = 0
max_grad_norm = 1.0
logging_steps = 500                        # not referenced by the cells visible in this notebook

# --- Evaluation (read by evaluate() / compute_predictions_logits) ---
eval_batch_size = 10
n_best_size = 20
max_answer_length = 30
do_lower_case = True
verbose_logging = True
version_2_with_negative = True
null_score_diff_threshold = 0.0

In [6]:
# Loading the model config
print("Bert config loading ...")
config = BertConfig.from_pretrained(pretrained_weights)

# Loading the model
print("BertForQuestionAnswering model loading ...")
model = BertForQuestionAnswering.from_pretrained(pretrained_weights)
# Put the model on the device selected in the GPU-configuration cell, i.e. the
# same device train()/evaluate() move their batches to. Unlike `model.cuda()`,
# this also works when that cell fell back to the CPU.
model.to(device)

# Loading the tokenizer
print("BertTokenizer loading ...")
tokenizer = BertTokenizer.from_pretrained(pretrained_weights)

Bert config loading ...


HBox(children=(IntProgress(value=0, description='Downloading', max=361, style=ProgressStyle(description_width=…


BertForQuestionAnswering model loading ...


HBox(children=(IntProgress(value=0, description='Downloading', max=440473133, style=ProgressStyle(description_…


BertTokenizer loading ...


HBox(children=(IntProgress(value=0, description='Downloading', max=231508, style=ProgressStyle(description_wid…




In [7]:
# Materialise the (name, parameter) pairs so they can be counted and sliced.
params = list(model.named_parameters())

print('The BERT model has {:} different named parameters.\n'.format(len(params)))

# Each entry pairs a heading with the slice of parameters listed under it.
sections = (
    ('==== Embedding Layer ====\n', params[0:5]),
    ('\n==== First Transformer ====\n', params[5:21]),
    ('\n==== Output Layer ====\n', params[-4:]),
)

for heading, group in sections:
    print(heading)
    for param_name, param in group:
        # Name left-aligned in 55 columns, shape right-aligned in 12.
        print("{:<55} {:>12}".format(param_name, str(tuple(param.size()))))

The BERT model has 201 different named parameters.

==== Embedding Layer ====

bert.embeddings.word_embeddings.weight                  (30522, 768)
bert.embeddings.position_embeddings.weight                (512, 768)
bert.embeddings.token_type_embeddings.weight                (2, 768)
bert.embeddings.LayerNorm.weight                              (768,)
bert.embeddings.LayerNorm.bias                                (768,)

==== First Transformer ====

bert.encoder.layer.0.attention.self.query.weight          (768, 768)
bert.encoder.layer.0.attention.self.query.bias                (768,)
bert.encoder.layer.0.attention.self.key.weight            (768, 768)
bert.encoder.layer.0.attention.self.key.bias                  (768,)
bert.encoder.layer.0.attention.self.value.weight          (768, 768)
bert.encoder.layer.0.attention.self.value.bias                (768,)
bert.encoder.layer.0.attention.output.dense.weight        (768, 768)
bert.encoder.layer.0.attention.output.dense.bias              (

## Data processing

In [8]:
##############################
## TRAIN DATASET PROCESSING ##
##############################

# Cap on the number of training examples converted to features.
# Set to None to use every example (slicing with None keeps the whole list).
max_train_examples = 10000

# Squad V2 processor loading
print("\nSquad V2 processor loading : ")
processor = SquadV2Processor()

# Extract examples from the train dataset
print("\nExtract examples from the train dataset : ")
examples = processor.get_train_examples("squaddir")

print("\nNumber of train examples total: ", len(examples))

# Keep only the first `max_train_examples` examples.
examples = examples[:max_train_examples]
print("\nNumber of train examples taken: ", len(examples))

# Convert the kept examples into model features and a dataset for train().
features, dataset = squad_convert_examples_to_features(
    examples=examples,
    tokenizer=tokenizer,
    max_seq_length=max_seq_length,
    doc_stride=doc_stride,
    max_query_length=max_query_length,
    is_training=is_training,
    return_dataset=return_dataset
)


Squad V2 processor loading : 

Extract examples from the train dataset : 


100%|██████████| 442/442 [00:40<00:00, 10.87it/s]



Number of train examples total:  130319

Number of train examples taken:  10000


convert squad examples to features: 100%|██████████| 10000/10000 [01:04<00:00, 156.02it/s]
add example index and unique id: 100%|██████████| 10000/10000 [00:00<00:00, 831807.08it/s]


In [9]:
#############################
## TEST DATASET PROCESSING ##
#############################

# Extract examples from the test dataset
print("\nExtract examples from the test dataset : ")
dev_examples = processor.get_dev_examples("squaddir")

print("\nNumber of test examples total: ", len(dev_examples))

# dev_examples = dev_examples[:1000]
# print("Number of test examples taken: ", len(dev_examples))

# The dev features are always built in evaluation mode: evaluate() reads
# batch[3] as example indices, so this must not follow the `is_training` flag
# used for the training set.
dev_features, dev_dataset = squad_convert_examples_to_features(
    examples=dev_examples,
    tokenizer=tokenizer,
    max_seq_length=max_seq_length,
    doc_stride=doc_stride,
    max_query_length=max_query_length,
    is_training=False,
    return_dataset=return_dataset
)


Extract examples from the test dataset : 


100%|██████████| 35/35 [00:03<00:00,  7.65it/s]


Number of test examples total:  11873



convert squad examples to features: 100%|██████████| 11873/11873 [01:31<00:00, 129.63it/s]
add example index and unique id: 100%|██████████| 11873/11873 [00:00<00:00, 763120.76it/s]


In [0]:
# print("dataset 0 : ", dataset[0])

# feat = features[0]

# print("features 0 : ", feat)
# print("input_ids : ", feat.input_ids)
# print("attention_mask : ", feat.attention_mask)
# print("token_type_ids : ", feat.token_type_ids)
# print("cls_index : ", feat.cls_index)
# print("p_mask : ", feat.p_mask)
# print("example_index : ", feat.example_index)
# print("unique_id : ", feat.unique_id)
# print("paragraph_len : ", feat.paragraph_len)
# print("token_is_max_context : ", feat.token_is_max_context)
# print("tokens : ", feat.tokens)
# print("token_to_orig_map : ", feat.token_to_orig_map)
# print("start_position : ", feat.start_position)
# print("end_position : ", feat.end_position)
# print("is_impossible : ", feat.is_impossible)

# **Training the model**

In [0]:
def to_list(tensor):
    """Return the values of ``tensor`` as (nested) Python lists.

    The tensor is detached from the autograd graph and copied to the CPU
    before the conversion.
    """
    detached = tensor.detach()
    on_cpu = detached.cpu()
    return on_cpu.tolist()

In [0]:
loss_tab = []  # one entry per finished epoch: that epoch's mean training loss


def train(train_dataset, model, tokenizer):
    """Fine-tune ``model`` on ``train_dataset`` with AdamW and a linear LR schedule.

    Hyper-parameters come from the notebook-level settings (``train_batch_size``,
    ``num_train_epochs``, ``gradient_accumulation_steps``, ``learning_rate``,
    ``weight_decay``, ``adam_epsilon``, ``warmup_steps``, ``max_grad_norm``) and
    every batch is moved to the notebook-level ``device``.

    Args:
        train_dataset: dataset whose batches provide, in this order, input ids,
            attention mask, token type ids, start positions and end positions.
        model: called with those tensors as keyword arguments; element 0 of
            its output is used as the loss.
        tokenizer: unused; kept so existing calls keep working.

    Returns:
        ``(global_step, average_loss)`` where ``global_step`` is the number of
        optimizer updates performed and ``average_loss`` is the accumulated
        loss divided by that number (0.0 if no update was performed).

    Side effects:
        Appends the mean loss of each epoch to the module-level ``loss_tab``.
    """

    train_sampler = RandomSampler(train_dataset)
    train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=train_batch_size)

    # Number of optimizer updates over the whole run; the scheduler decays to 0 over it.
    t_total = len(train_dataloader) // gradient_accumulation_steps * num_train_epochs

    # Prepare optimizer and schedule (linear warmup and decay).
    # Parameters whose name contains one of these substrings get no weight decay.
    no_decay = ["bias", "LayerNorm.weight"]

    optimizer_grouped_parameters = [
        {
            "params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
            "weight_decay": weight_decay,
        },
        {"params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], "weight_decay": 0.0},
    ]

    optimizer = AdamW(optimizer_grouped_parameters, lr=learning_rate, eps=adam_epsilon)

    scheduler = get_linear_schedule_with_warmup(
        optimizer, num_warmup_steps=warmup_steps, num_training_steps=t_total
    )

    # Train!
    print("***** Running training *****")
    print("  Num examples = ", len(train_dataset))
    print("  Num Epochs = ", num_train_epochs)
    print("  Gradient Accumulation steps = ", gradient_accumulation_steps)
    print("  Total optimization steps = ", t_total)

    # Counts optimizer updates. It starts at 0 so that the returned step count
    # and the average loss are not off by one.
    global_step = 0
    tr_loss = 0.0
    model.zero_grad()

    for _ in trange(int(num_train_epochs), desc="Epoch"):
        epoch_loss, epoch_steps = 0.0, 0
        model.train()

        for step, batch in enumerate(tqdm(train_dataloader, desc="Iteration")):
            batch = tuple(t.to(device) for t in batch)

            inputs = {
                "input_ids": batch[0],
                "attention_mask": batch[1],
                "token_type_ids": batch[2],
                "start_positions": batch[3],
                "end_positions": batch[4],
            }

            # model outputs are always tuple in transformers (see doc)
            loss = model(**inputs)[0]

            # Scale so that the gradients summed over an accumulation window
            # correspond to the mean loss of that window.
            if gradient_accumulation_steps > 1:
                loss = loss / gradient_accumulation_steps

            loss.backward()

            step_loss = loss.item()
            tr_loss += step_loss
            epoch_loss += step_loss

            # Only update once a full accumulation window has been processed,
            # which keeps the number of updates in line with `t_total`.
            if (step + 1) % gradient_accumulation_steps != 0:
                continue

            torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)

            optimizer.step()
            scheduler.step()  # Update learning rate schedule
            model.zero_grad()
            global_step += 1
            epoch_steps += 1

        # Record this epoch's own mean loss (not a running mean over all
        # epochs) so the learning curve shows the per-epoch trend.
        loss_tab.append(epoch_loss / max(epoch_steps, 1))

    return global_step, tr_loss / max(global_step, 1)

In [0]:
# Fine-tune on the processed training set; train() returns its step counter
# and the average loss it computed.
global_step, tr_loss = train(dataset, model, tokenizer)
print(" global_step = ", global_step, ", average loss = ",  tr_loss)

Epoch:   0%|          | 0/4 [00:00<?, ?it/s]
Iteration:   0%|          | 0/1015 [00:00<?, ?it/s][A

***** Running training *****
  Num examples =  10150
  Num Epochs =  4
  Gradient Accumulation steps =  1
  Total optimization steps =  4060



Iteration:   0%|          | 1/1015 [00:01<17:53,  1.06s/it][A
Iteration:   0%|          | 2/1015 [00:01<16:40,  1.01it/s][A
Iteration:   0%|          | 3/1015 [00:02<15:46,  1.07it/s][A
Iteration:   0%|          | 4/1015 [00:03<15:14,  1.11it/s][A
Iteration:   0%|          | 5/1015 [00:04<14:50,  1.13it/s][A
Iteration:   1%|          | 6/1015 [00:05<14:34,  1.15it/s][A
Iteration:   1%|          | 7/1015 [00:06<14:22,  1.17it/s][A
Iteration:   1%|          | 8/1015 [00:06<14:14,  1.18it/s][A
Iteration:   1%|          | 9/1015 [00:07<14:08,  1.19it/s][A
Iteration:   1%|          | 10/1015 [00:08<14:04,  1.19it/s][A
Iteration:   1%|          | 11/1015 [00:09<14:05,  1.19it/s][A
Iteration:   1%|          | 12/1015 [00:10<14:04,  1.19it/s][A
Iteration:   1%|▏         | 13/1015 [00:11<14:02,  1.19it/s][A
Iteration:   1%|▏         | 14/1015 [00:11<13:58,  1.19it/s][A
Iteration:   1%|▏         | 15/1015 [00:12<13:57,  1.19it/s][A
Iteration:   2%|▏         | 16/1015 [00:13<13:53

In [0]:
import matplotlib.pyplot as plt
% matplotlib inline

import seaborn as sns

# Use plot styling from seaborn.
sns.set(style='darkgrid')

# Increase the plot size and font size.
sns.set(font_scale=1.5)
plt.rcParams["figure.figsize"] = (12,6)

# Plot the learning curve.
plt.plot(loss_tab, 'b-o')

# Label the plot.
plt.title("Training loss")
plt.xlabel("Epoch")
plt.ylabel("Loss")

plt.show()

# **Evaluating the Model**

In [0]:
def evaluate(model, tokenizer, prefix=""):
    """Run ``model`` over the dev set and score its answers.

    Reads the notebook-level ``dev_dataset``, ``dev_features``, ``dev_examples``,
    ``device`` and the evaluation settings (``eval_batch_size``, ``n_best_size``,
    ``max_answer_length``, ...).

    Args:
        model: called with input ids, attention mask and token type ids; its
            output is unpacked into start and end logits.
        tokenizer: forwarded to ``compute_predictions_logits``.
        prefix: inserted into the names of the three JSON files under
            ``output_dir`` whose paths are passed to ``compute_predictions_logits``.

    Returns:
        Whatever ``squad_evaluate`` returns for the computed predictions.
    """
    # The prediction files live in output_dir, so make sure it exists.
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    # Walk the dev set in order, eval_batch_size features at a time.
    eval_dataloader = DataLoader(
        dev_dataset,
        sampler=SequentialSampler(dev_dataset),
        batch_size=eval_batch_size,
    )

    # Eval!
    print("***** Running evaluation *****")
    print("  Num examples = ", len(dev_dataset))
    print("  Batch size = ", eval_batch_size)

    all_results = []
    start_time = timeit.default_timer()

    for batch in tqdm(eval_dataloader, desc="Evaluating"):
        model.eval()
        batch = tuple(t.to(device) for t in batch)

        with torch.no_grad():
            example_indices = batch[3]
            outputs = model(
                input_ids=batch[0],
                attention_mask=batch[1],
                token_type_ids=batch[2],
            )

        for row, example_index in enumerate(example_indices):
            # Map the batch row back to its feature to recover the unique id.
            unique_id = int(dev_features[example_index.item()].unique_id)

            # Take this row from every model output and convert it to lists.
            start_logits, end_logits = [to_list(logits[row]) for logits in outputs]

            all_results.append(SquadResult(unique_id, start_logits, end_logits))

    elapsed = timeit.default_timer() - start_time
    print("  Evaluation done in total" , elapsed," secs (", elapsed / len(dev_dataset)," sec per example)")

    # Compute predictions
    output_prediction_file = os.path.join(output_dir, "predictions_{}.json".format(prefix))
    output_nbest_file = os.path.join(output_dir, "nbest_predictions_{}.json".format(prefix))
    output_null_log_odds_file = os.path.join(output_dir, "null_odds_{}.json".format(prefix))

    predictions = compute_predictions_logits(
        dev_examples,
        dev_features,
        all_results,
        n_best_size,
        max_answer_length,
        do_lower_case,
        output_prediction_file,
        output_nbest_file,
        output_null_log_odds_file,
        verbose_logging,
        version_2_with_negative,
        null_score_diff_threshold,
        tokenizer,
    )

    # Compute the F1 and exact scores.
    return squad_evaluate(dev_examples, predictions)

In [0]:
# Score the current model on the dev set; evaluate() returns the output of
# squad_evaluate, which is printed as-is.
results = evaluate(model, tokenizer)
print(results)

In [0]:
# Cross-check with the evaluation script downloaded into squaddir/.
# predictions_.json is the file name evaluate() builds when it is called with
# its default empty prefix.
%run squaddir/evaluate-v2.0.py squaddir/dev-v2.0.json ./finetuned_squad/predictions_.json

In [0]:
# Make sure the target directory is there before writing into it.
if not os.path.exists(output_dir):
    os.makedirs(output_dir)

print("Saving model to %s" % output_dir)

# If the model carries a `.module` attribute (distributed/parallel wrappers),
# save that inner model; otherwise save the model itself.
model_to_save = getattr(model, 'module', model)

# Both objects are written with `save_pretrained()` so that they can later be
# reloaded with `from_pretrained()`.
for artifact in (model_to_save, tokenizer):
    artifact.save_pretrained(output_dir)

# **Other Stuff and little tests** (not necessarily working)

In [0]:
# import random

# # Note: AdamW is a class from the huggingface library (as opposed to pytorch) 
# # I believe the 'W' stands for 'Weight Decay fix'
# optimizer = AdamW(model.parameters(),
#                   lr = 2e-5, # args.learning_rate - default is 5e-5, our notebook had 2e-5
#                   eps = 1e-8 # args.adam_epsilon  - default is 1e-8.
#                 )

# from transformers import get_linear_schedule_with_warmup

# # Number of training epochs (authors recommend between 2 and 4)
# epochs = 4

# # Total number of training steps is number of batches * number of epochs.
# total_steps = len(train_dataloader) * epochs

# # Create the learning rate scheduler.
# scheduler = get_linear_schedule_with_warmup(optimizer, 
#                                             num_warmup_steps = 0, # Default value in run_glue.py
         

# # This training code is based on the `run_glue.py` script here:
# # https://github.com/huggingface/transformers/blob/5bfcd0485ece086ebcbed2d008813037968a9e58/examples/run_glue.py#L128


# # Set the seed value all over the place to make this reproducible.
# seed_val = 42

# random.seed(seed_val)
# np.random.seed(seed_val)
# torch.manual_seed(seed_val)
# torch.cuda.manual_seed_all(seed_val)

# # Store the average loss after each epoch so we can plot them.
# loss_values = []

# # For each epoch...
# for epoch_i in range(epochs):
    
#     # ========================================
#     #               Training
#     # ========================================
    
#     # Perform one full pass over the training set.

#     print("")
#     print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))
#     print('Training...')

#     # Measure how long the training epoch takes.
#     t0 = time.time()

#     # Reset the total loss for this epoch.
#     total_loss = 0

#     # Put the model into training mode. Don't be misled--the call to
#     # `train` just changes the *mode*, it doesn't *perform* the training.
#     # `dropout` and `batchnorm` layers behave differently during training
#     # vs. test (source: https://stackoverflow.com/questions/51433378/what-does-model-train-do-in-pytorch)
#     model.train()

#     # For each batch of training data...
#     for step, batch in enumerate(train_dataloader):

#         # Progress update every 40 batches.
#         if step % 40 == 0 and not step == 0:
#             # Calculate elapsed time in minutes.
#             elapsed = format_time(time.time() - t0)
            
#             # Report progress.
#             print('  Batch {:>5,}  of  {:>5,}.    Elapsed: {:}.'.format(step, len(train_dataloader), elapsed))

#         # Unpack this training batch from our dataloader. 
#         #
#         # As we unpack the batch, we'll also copy each tensor to the GPU using the 
#         # `to` method.
#         #
#         # `batch` contains three pytorch tensors:
#         #   [0]: input ids 
#         #   [1]: attention masks
#         #   [2]: labels 
#         b_input_ids = batch[0].to(device)
#         b_input_mask = batch[1].to(device)
#         b_labels = batch[2].to(device)

#         # Always clear any previously calculated gradients before performing a
#         # backward pass. PyTorch doesn't do this automatically because 
#         # accumulating the gradients is "convenient while training RNNs". 
#         # (source: https://stackoverflow.com/questions/48001598/why-do-we-need-to-call-zero-grad-in-pytorch)
#         model.zero_grad()        

#         # Perform a forward pass (evaluate the model on this training batch).
#         # This will return the loss (rather than the model output) because we
#         # have provided the `labels`.
#         # The documentation for this `model` function is here: 
#         # https://huggingface.co/transformers/v2.2.0/model_doc/bert.html#transformers.BertForSequenceClassification
#         outputs = model(b_input_ids, 
#                     token_type_ids=None, 
#                     attention_mask=b_input_mask, 
#                     labels=b_labels)
        
#         # The call to `model` always returns a tuple, so we need to pull the 
#         # loss value out of the tuple.
#         loss = outputs[0]

#         # Accumulate the training loss over all of the batches so that we can
#         # calculate the average loss at the end. `loss` is a Tensor containing a
#         # single value; the `.item()` function just returns the Python value 
#         # from the tensor.
#         total_loss += loss.item()

#         # Perform a backward pass to calculate the gradients.
#         loss.backward()

#         # Clip the norm of the gradients to 1.0.
#         # This is to help prevent the "exploding gradients" problem.
#         torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

#         # Update parameters and take a step using the computed gradient.
#         # The optimizer dictates the "update rule"--how the parameters are
#         # modified based on their gradients, the learning rate, etc.
#         optimizer.step()

#         # Update the learning rate.
#         scheduler.step()

#     # Calculate the average loss over the training data.
#     avg_train_loss = total_loss / len(train_dataloader)            
    
#     # Store the loss value for plotting the learning curve.
#     loss_values.append(avg_train_loss)

#     print("")
#     print("  Average training loss: {0:.2f}".format(avg_train_loss))
#     print("  Training epcoh took: {:}".format(format_time(time.time() - t0)))
        
#     # ========================================
#     #               Validation
#     # ========================================
#     # After the completion of each training epoch, measure our performance on
#     # our validation set.

#     print("")
#     print("Running Validation...")

#     t0 = time.time()

#     # Put the model in evaluation mode--the dropout layers behave differently
#     # during evaluation.
#     model.eval()

#     # Tracking variables 
#     eval_loss, eval_accuracy = 0, 0
#     nb_eval_steps, nb_eval_examples = 0, 0

#     # Evaluate data for one epoch
#     for batch in validation_dataloader:
        
#         # Add batch to GPU
#         batch = tuple(t.to(device) for t in batch)
        
#         # Unpack the inputs from our dataloader
#         b_input_ids, b_input_mask, b_labels = batch
        
#         # Telling the model not to compute or store gradients, saving memory and
#         # speeding up validation
#         with torch.no_grad():        

#             # Forward pass, calculate logit predictions.
#             # This will return the logits rather than the loss because we have
#             # not provided labels.
#             # token_type_ids is the same as the "segment ids", which 
#             # differentiates sentence 1 and 2 in 2-sentence tasks.
#             # The documentation for this `model` function is here: 
#             # https://huggingface.co/transformers/v2.2.0/model_doc/bert.html#transformers.BertForSequenceClassification
#             outputs = model(b_input_ids, 
#                             token_type_ids=None, 
#                             attention_mask=b_input_mask)
        
#         # Get the "logits" output by the model. The "logits" are the output
#         # values prior to applying an activation function like the softmax.
#         logits = outputs[0]

#         # Move logits and labels to CPU
#         logits = logits.detach().cpu().numpy()
#         label_ids = b_labels.to('cpu').numpy()
        
#         # Calculate the accuracy for this batch of test sentences.
#         tmp_eval_accuracy = flat_accuracy(logits, label_ids)
        
#         # Accumulate the total accuracy.
#         eval_accuracy += tmp_eval_accuracy

#         # Track the number of batches
#         nb_eval_steps += 1

#     # Report the final accuracy for this validation run.
#     print("  Accuracy: {0:.2f}".format(eval_accuracy/nb_eval_steps))
#     print("  Validation took: {:}".format(format_time(time.time() - t0)))

# print("")
# print("Training complete!")

In [0]:
# Show the question, context and answer of the first training example.
ex = examples[0]

for label, attribute in (
    ("question : ", "question_text"),
    ("text : ", "context_text"),
    ("answer : ", "answer_text"),
):
    print(label, getattr(ex, attribute))

In [0]:
#For single sequence input
sentence = 'I really enjoyed this movie a lot.'
tokens = tokenizer.tokenize(sentence)
print(tokens)

In [0]:
# Rebuild the list from `sentence` instead of from `tokens`, so that re-running
# this cell does not wrap the sequence in a second pair of special tokens.
tokens = ['[CLS]'] + tokenizer.tokenize(sentence) + ['[SEP]']
print(tokens)

In [0]:
# Target length; sequences shorter than T are right-padded with '[PAD]'.
T = 12
padded_tokens = tokens + ['[PAD]'] * (T - len(tokens))
print(padded_tokens)
# Out: ['[CLS]', 'i', 'really', 'enjoyed', 'this', 'movie', 'a', 'lot', '.', '[SEP]', '[PAD]', '[PAD]']

# 1 for every real token, 0 for every padding position.
attn_mask = [int(token != '[PAD]') for token in padded_tokens]
print(attn_mask)

In [0]:
# NOTE: this cell rebinds the notebook-level `tokenizer` and `model`. Re-run the
# loading/training cells before evaluating or saving the fine-tuned model again.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForQuestionAnswering.from_pretrained('bert-large-uncased-whole-word-masking-finetuned-squad')
model.eval()  # inference only

question, text = "Who was Jim Henson?", "Jim Henson was a nice puppet"
input_ids = tokenizer.encode(question, text)
print(input_ids)

# Segment 0 covers everything up to and including the first [SEP]; the rest is
# segment 1. The separator id is taken from the tokenizer rather than
# hard-coded as 102.
first_sep = input_ids.index(tokenizer.sep_token_id)
token_type_ids = [0 if i <= first_sep else 1 for i in range(len(input_ids))]
print(token_type_ids)

# No gradients are needed for a single forward pass.
with torch.no_grad():
    start_scores, end_scores = model(torch.tensor([input_ids]), token_type_ids=torch.tensor([token_type_ids]))
print(start_scores, end_scores)

# The answer is the span between the highest-scoring start and end positions.
all_tokens = tokenizer.convert_ids_to_tokens(input_ids)
answer = ' '.join(all_tokens[torch.argmax(start_scores) : torch.argmax(end_scores)+1])
assert answer == "a nice puppet"
print(answer)