<a href="https://colab.research.google.com/github/JYL480/QnAWithContext/blob/main/QnAWithContext.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
try:
  from datasets import load_dataset
except:
  !pip install datasets
  from datasets import load_dataset

In [None]:
from datasets import load_dataset

# Load only the first 5,000 training examples of SQuAD: starting small keeps
# the fine-tuning runs below short.
squad = load_dataset("squad", split="train[:5000]")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [None]:
# Hold out 20% of the examples for evaluation. The fixed seed makes the shuffle,
# and therefore the train/test membership, the same on every run.
# NOTE: this rebinds `squad` to the split result (a dict of 'train'/'test'), so
# re-running this cell without re-running the load cell is expected to fail.
squad = squad.train_test_split(test_size=0.2, seed=42)

In [None]:
# After the split there is both a 'train' and a 'test' set; look at one
# held-out example.
squad['test'][1]

{'id': '56cdca7862d2951400fa6828',
 'title': '2008_Sichuan_earthquake',
 'context': 'The 2008 Sichuan earthquake or the Great Sichuan earthquake, measured at 8.0 Ms and 7.9 Mw, and occurred at 02:28:01 PM China Standard Time at epicenter (06:28:01 UTC) on May 12 in Sichuan province, killed 69,197 people and left 18,222 missing.',
 'question': 'How many people were killed as a result?',
 'answers': {'text': ['69,197'], 'answer_start': [206]}}

In [None]:
# Column names of each split before tokenization.
squad.column_names

{'train': ['id', 'title', 'context', 'question', 'answers'],
 'test': ['id', 'title', 'context', 'question', 'answers']}

In [None]:
# Load the tokenizer for the uncased DistilBERT checkpoint.

from transformers import AutoTokenizer
checkpoint = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

In [None]:
# Now we will preprocess the data!!
# This is a very important step!!

def preprocess_function(examples):
  """Tokenize a batch of question/context pairs and label each answer span.

  Every question is paired with its context, truncated/padded to 384 tokens,
  and the character-level answer span is converted into start/end token
  indices for extractive question answering.

  Args:
    examples: A batch (dict of lists) with "question", "context" and
      "answers" entries; each answer holds "text" and "answer_start" lists.

  Returns:
    The tokenizer output with the offset mapping removed and two extra
    lists, "start_positions" and "end_positions", added. An example whose
    answer is missing, or is not fully contained in the (possibly truncated)
    context, is labelled (0, 0).
  """
  # Strip surrounding whitespace so it does not consume question tokens.
  questions = [q.strip() for q in examples["question"]]

  inputs = tokenizer(
      text=questions,
      text_pair=examples["context"],
      # Shorter than the model's maximum to keep training cheap.
      max_length=384,
      # When the pair is too long, cut tokens from the context only, never
      # from the question.
      truncation="only_second",
      # Per-token (start_char, end_char) offsets, needed to map the answer's
      # character span onto token indices.
      return_offsets_mapping=True,
      # Pad every example to the same length so they can be batched directly.
      padding="max_length",
  )

  # The offsets are only needed for labelling; they are not a model input.
  offset_mapping = inputs.pop("offset_mapping")
  answers = examples["answers"]
  start_positions = []
  end_positions = []

  for i, offset in enumerate(offset_mapping):
    answer = answers[i]

    # An example without any answer text cannot be located in the context.
    if not answer["text"]:
        start_positions.append(0)
        end_positions.append(0)
        continue

    start_char = answer["answer_start"][0]
    end_char = start_char + len(answer["text"][0])
    sequence_ids = inputs.sequence_ids(i)

    # Find the first and last token that belong to the context (sequence id 1).
    idx = 0
    while sequence_ids[idx] != 1:
        idx += 1
    context_start = idx
    while idx < len(sequence_ids) and sequence_ids[idx] == 1:
        idx += 1
    context_end = idx - 1

    # If the answer is not fully inside the context, label it (0, 0).
    # Both ends must be checked: an answer that is only partly cut off by
    # truncation would otherwise get an end index one past the context,
    # i.e. on a special token.
    if offset[context_start][0] > start_char or offset[context_end][1] < end_char:
        start_positions.append(0)
        end_positions.append(0)
    else:
        # Last context token that starts at or before the answer's first char.
        idx = context_start
        while idx <= context_end and offset[idx][0] <= start_char:
            idx += 1
        start_positions.append(idx - 1)

        # First context token (scanning backwards) that ends at or after the
        # answer's last char.
        idx = context_end
        while idx >= context_start and offset[idx][1] >= end_char:
            idx -= 1
        end_positions.append(idx + 1)

  inputs["start_positions"] = start_positions
  inputs["end_positions"] = end_positions
  return inputs

In [None]:
# Same check as before: `squad` itself still has its raw columns at this point.
squad.column_names

{'train': ['id', 'title', 'context', 'question', 'answers'],
 'test': ['id', 'title', 'context', 'question', 'answers']}

In [None]:
# Run the preprocessing over both splits in batches. The raw columns are
# removed so that only what preprocess_function returns is kept.
tokenized_squad = squad.map(preprocess_function, batched=True, remove_columns=squad["train"].column_names)
tokenized_squad.column_names

Map:   0%|          | 0/4000 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

{'train': ['input_ids', 'attention_mask', 'start_positions', 'end_positions'],
 'test': ['input_ids', 'attention_mask', 'start_positions', 'end_positions']}

In [None]:
# Columns that remain after tokenization.
tokenized_squad.column_names

{'train': ['input_ids', 'attention_mask', 'start_positions', 'end_positions'],
 'test': ['input_ids', 'attention_mask', 'start_positions', 'end_positions']}

In [None]:
squad["train"].column_names
# These are the raw columns that were removed during tokenization. Removing
# them is important: the model only accepts certain parameters, so you need to
# know what it can take as input before training.

['id', 'title', 'context', 'question', 'answers']

In [None]:
import torch
# Use the GPU when one is available, otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
device

'cuda'

In [None]:
from transformers import AutoModelForQuestionAnswering

# Load DistilBERT with a question-answering head and move it onto `device`.
# NOTE(review): this hub id is spelled differently from the `checkpoint` string
# used for the tokenizer ("distilbert-base-uncased"); presumably both refer to
# the same weights — confirm.
model = AutoModelForQuestionAnswering.from_pretrained("distilbert/distilbert-base-uncased").to(device)

Some weights of DistilBertForQuestionAnswering were not initialized from the model checkpoint at distilbert/distilbert-base-uncased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Look at one tokenized training example.
# NOTE: preprocessing pads to max_length=384, so this displays very long lists.
tokenized_squad['train'][0]

{'input_ids': [101,
  2054,
  2095,
  2001,
  3422,
  2386,
  2949,
  1029,
  102,
  2019,
  3041,
  4433,
  1997,
  2000,
  3102,
  1037,
  19545,
  9001,
  1010,
  4159,
  2175,
  2275,
  1037,
  3422,
  2386,
  1010,
  2001,
  6801,
  2135,
  2207,
  2006,
  2251,
  2403,
  1010,
  2325,
  1012,
  2023,
  4433,
  1010,
  2029,
  2001,
  2949,
  1999,
  3890,
  1010,
  2003,
  2275,
  2322,
  2086,
  2044,
  1996,
  2051,
  2558,
  8212,
  1999,
  2000,
  3102,
  1037,
  19545,
  9001,
  2021,
  2003,
  2025,
  1037,
  13633,
  1997,
  1996,
  7984,
  1012,
  2023,
  3041,
  2544,
  1997,
  1996,
  2466,
  4076,
  2019,
  4639,
  7464,
  16133,
  2040,
  7930,
  2013,
  2047,
  2259,
  2000,
  3942,
  2014,
  2269,
  1010,
  14832,
  2271,
  16133,
  1010,
  1999,
  2089,
  18274,
  1010,
  6041,
  1010,
  2073,
  2016,
  2003,
  12892,
  2011,
  1996,
  2046,
  3917,
  6651,
  1999,
  2014,
  2451,
  1012,
  1996,
  3422,
  2386,
  8356,
  2001,
  3373,
  2000,
  2031,
  2042,
  243

In [None]:
from transformers import DefaultDataCollator

# Collator used to assemble individual examples into a batch. Preprocessing
# already pads every example to max_length, so the examples share one length.
data_collator = DefaultDataCollator()

In [None]:
from torch.utils.data import DataLoader
import os

# os.cpu_count() returns None when the CPU count cannot be determined, which
# DataLoader does not accept; fall back to 0 (load in the main process).
num_workers = os.cpu_count() or 0

# Define the DataLoaders with data_collator.
# The training loader reshuffles every epoch so the model does not see the
# batches in the same order each time; the evaluation loader keeps its order.
train_dataloader = DataLoader(tokenized_squad["train"], batch_size=16, shuffle=True, num_workers=num_workers, collate_fn=data_collator)
test_dataloader = DataLoader(tokenized_squad["test"], batch_size=16, num_workers=num_workers, collate_fn=data_collator)

# Sanity check: pull one batch and look at the shape of its input ids.
batch = next(iter(train_dataloader))
batch['input_ids'].shape

  self.pid = os.fork()


torch.Size([16, 384])

In [None]:
# NOTE(review): this walks the entire training DataLoader and does nothing with
# each batch (a bare expression inside a loop body is not displayed) — looks
# like a leftover debugging cell.
for batch in train_dataloader:
  batch

AttributeError: 'str' object has no attribute 'shape'

In [None]:

# Adam over all of the model's parameters with a small learning rate.
optimizer = torch.optim.Adam(model.parameters(), lr=1e-5)

In [None]:
from tqdm.auto import tqdm

num_epochs = 3
# Put the model into training mode before optimizing.
model.train()

for epoch in tqdm(range(num_epochs)):
    for batch in train_dataloader:
        # Move the four tensors the model consumes onto the training device.
        input_ids, attention_mask, start_positions, end_positions = (
            batch[field].to(device)
            for field in ("input_ids", "attention_mask", "start_positions", "end_positions")
        )

        # One optimization step: clear old gradients, forward pass (the model
        # computes the loss itself because the label positions are passed in),
        # backward pass, parameter update.
        optimizer.zero_grad()
        outputs = model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            start_positions=start_positions,
            end_positions=end_positions,
        )
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        print(f"Epoch: {epoch}, Loss: {loss}")

  0%|          | 0/3 [00:00<?, ?it/s]

Epoch: 0, Loss: 0.3889652192592621
Epoch: 0, Loss: 0.2807663083076477
Epoch: 0, Loss: 0.2565242052078247
Epoch: 0, Loss: 0.30833542346954346
Epoch: 0, Loss: 0.19909779727458954
Epoch: 0, Loss: 0.9272968173027039
Epoch: 0, Loss: 0.2883177399635315
Epoch: 0, Loss: 0.3408273458480835
Epoch: 0, Loss: 0.28644782304763794
Epoch: 0, Loss: 0.20304226875305176
Epoch: 0, Loss: 0.15915578603744507
Epoch: 0, Loss: 0.4435741901397705
Epoch: 0, Loss: 0.44891029596328735
Epoch: 0, Loss: 0.2674829363822937
Epoch: 0, Loss: 0.9196861982345581
Epoch: 0, Loss: 0.19166915118694305
Epoch: 0, Loss: 0.6424776315689087
Epoch: 0, Loss: 0.698432981967926
Epoch: 0, Loss: 0.3994954824447632
Epoch: 0, Loss: 0.3943638801574707
Epoch: 0, Loss: 0.4466205835342407
Epoch: 0, Loss: 0.39587295055389404
Epoch: 0, Loss: 0.26967787742614746
Epoch: 0, Loss: 0.5075386762619019
Epoch: 0, Loss: 0.4790160655975342
Epoch: 0, Loss: 0.2617400884628296
Epoch: 0, Loss: 0.40242433547973633
Epoch: 0, Loss: 0.8530246019363403
Epoch: 0, L

In [None]:
def prediction(question, context):
  """Extract the answer to `question` from `context` with the fine-tuned model.

  Uses the notebook-level `tokenizer`, `model` and `device`.

  Args:
    question: The question text.
    context: The passage that is expected to contain the answer.

  Returns:
    The decoded text of the tokens between the most likely start token and
    the most likely end token (inclusive). Returns an empty string when the
    predicted end comes before the predicted start.
  """
  # Truncate only the context so an over-long passage is cut to the tokenizer's
  # maximum length instead of failing inside the model.
  inputs = tokenizer(question, context, truncation="only_second", return_tensors="pt")
  input_ids = inputs["input_ids"].to(device)
  attention_mask = inputs["attention_mask"].to(device)

  # Predict in eval mode (the training cells leave the model in train mode, where
  # dropout makes the output random) and without tracking gradients. The
  # previous mode is restored afterwards so training code is not affected.
  was_training = model.training
  model.eval()
  try:
    with torch.no_grad():
      outputs = model(input_ids=input_ids, attention_mask=attention_mask)
  finally:
    model.train(was_training)

  start_index = int(torch.argmax(outputs.start_logits))
  end_index = int(torch.argmax(outputs.end_logits))
  if end_index < start_index:
    # The two argmaxes are independent, so they can describe an empty span.
    return ""
  answer = tokenizer.decode(input_ids[0][start_index:end_index + 1])
  return answer



In [None]:
# Example question/context pair for a quick qualitative check of the model.
question = "Who were the first humans to walk on the moon?"
# The context originally began with "n 1969" (first letter lost); restored to "In 1969".
context = "In 1969, Neil Armstrong and Buzz Aldrin became the first humans to walk on the moon during NASA's Apollo 11 mission"

In [None]:
# Ask the model the example question defined above.
print(prediction(question, context))

neil armstrong and buzz aldrin


In [None]:
import torch
from pathlib import Path

def save_model(model: torch.nn.Module,
               target_dir: str,
               model_name: str):
    """Write a model's state_dict to ``target_dir / model_name``.

    The target directory (including any missing parents) is created first,
    then the file name is validated, and finally the model's ``state_dict()``
    is serialized with ``torch.save``. The destination path and the size of
    the written file in MB are printed.

    Args:
        model: The PyTorch module whose state_dict should be saved.
        target_dir: Directory to save into; created if it does not exist.
        model_name: File name for the saved model; must end with ".pth"
            or ".pt".

    Raises:
        AssertionError: If ``model_name`` ends with neither extension.

    Example usage:
        save_model(model=model_0,
                   target_dir="models",
                   model_name="05_going_modular_tingvgg_model.pth")
    """
    # Make sure the destination directory exists.
    out_dir = Path(target_dir)
    out_dir.mkdir(parents=True, exist_ok=True)

    # Only accept the conventional PyTorch checkpoint extensions.
    has_valid_suffix = model_name.endswith((".pth", ".pt"))
    assert has_valid_suffix, "model_name should end with '.pt' or '.pth'"

    # Serialize the state_dict and report where it went and how big it is.
    checkpoint_path = out_dir / model_name
    print(f"[INFO] Saving model to: {checkpoint_path}")
    torch.save(obj=model.state_dict(), f=checkpoint_path)

    size_mb = checkpoint_path.stat().st_size / 1024**2
    print(f"\t... saved file size: {size_mb:.2f} MB")

In [None]:
# Save the fine-tuned weights to models/QnAModel.pth.
save_model(model=model, target_dir="models", model_name="QnAModel.pth")

[INFO] Saving model to: models/QnAModel.pth
	... saved file size: 253.20 MB


In [None]:
try:
  import torchinfo #If there is one aldy
except:
  !pip install torchinfo
  import torchinfo

Collecting torchinfo
  Downloading torchinfo-1.8.0-py3-none-any.whl (23 kB)
Installing collected packages: torchinfo
Successfully installed torchinfo-1.8.0


In [None]:
from torchinfo import summary

# Show the model's layer tree together with per-layer parameter counts.
summary(model)

Layer (type:depth-idx)                                  Param #
DistilBertForQuestionAnswering                          --
├─DistilBertModel: 1-1                                  --
│    └─Embeddings: 2-1                                  --
│    │    └─Embedding: 3-1                              23,440,896
│    │    └─Embedding: 3-2                              393,216
│    │    └─LayerNorm: 3-3                              1,536
│    │    └─Dropout: 3-4                                --
│    └─Transformer: 2-2                                 --
│    │    └─ModuleList: 3-5                             42,527,232
├─Linear: 1-2                                           1,538
├─Dropout: 1-3                                          --
Total params: 66,364,418
Trainable params: 66,364,418
Non-trainable params: 0

In [None]:
# Install transformers together with its `torch` extra.
!pip install transformers[torch]



In [None]:
# Install accelerate (presumably required by the Trainer cell further down — confirm).
!pip install accelerate

Collecting accelerate
  Downloading accelerate-0.32.1-py3-none-any.whl (314 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m314.1/314.1 kB[0m [31m4.1 MB/s[0m eta [36m0:00:00[0m
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch>=1.10.0->accelerate)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch>=1.10.0->accelerate)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch>=1.10.0->accelerate)
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (14.1 MB)
Collecting nvidia-cudnn-cu12==8.9.2.26 (from torch>=1.10.0->accelerate)
  Using cached nvidia_cudnn_cu12-8.9.2.26-py3-none-manylinux1_x86_64.whl (731.7 MB)
Collecting nvidia-cublas-cu12==12.1.3.1 (from torch>=1.10.0->accelerate)
  Using cached nvidia_cublas_cu12-12.1.3.1-py3-none-manylinux1_x86_64.w

In [None]:
# Upgrade accelerate to the newest available release (-U).
!pip install accelerate -U



In [None]:
import accelerate
# `is_accelerate_available` is a transformers helper; the `accelerate` package
# does not export it, so importing it from there raises ImportError.
from transformers.utils import is_accelerate_available

# Report the installed versions and whether transformers can see accelerate.
print("PyTorch version:", torch.__version__)
print("Accelerate version:", accelerate.__version__)
print("Accelerate available to transformers:", is_accelerate_available())

ImportError: cannot import name 'is_accelerate_available' from 'accelerate' (/usr/local/lib/python3.10/dist-packages/accelerate/__init__.py)

In [None]:
from huggingface_hub import notebook_login

# Show the Hugging Face login widget; the Trainer cell below sets
# push_to_hub=True, which presumably needs these credentials — confirm.
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [None]:
from transformers import AutoModelForQuestionAnswering, TrainingArguments, Trainer


# Settings for fine-tuning with the Trainer API: evaluate once per epoch,
# 3 epochs, batch size 16 for both training and evaluation.
training_args = TrainingArguments(
    output_dir="my_awesome_qa_model",
    eval_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    push_to_hub=True,
)

# NOTE(review): `model` is the same object that the manual training loop earlier
# in this notebook optimizes, so if that loop has been run this continues from
# its weights rather than from the freshly loaded checkpoint.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_squad["train"],
    eval_dataset=tokenized_squad["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
)

# Run the fine-tuning.
trainer.train()

Epoch,Training Loss,Validation Loss
1,No log,2.32003
2,2.702800,1.850728
3,2.702800,1.708521


TrainOutput(global_step=750, training_loss=2.2801517333984376, metrics={'train_runtime': 461.8823, 'train_samples_per_second': 25.981, 'train_steps_per_second': 1.624, 'total_flos': 1175877900288000.0, 'train_loss': 2.2801517333984376, 'epoch': 3.0})

In [None]:
# A second question/context pair to try on the model.
question = "What theory is Albert Einstein famous for?"
context = "Albert Einstein's theory of relativity revolutionized our understanding of space, time, and gravity in the early 20th century."

In [None]:
# Ask the model the second example question defined above.
print(prediction(question, context))

relativity
