In [1]:
!pip install datasets
!pip install accelerate -U transformers[torch]

Collecting datasets
  Downloading datasets-2.19.2-py3-none-any.whl (542 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m542.1/542.1 kB[0m [31m6.7 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m7.1 MB/s[0m eta [36m0:00:00[0m
Collecting requests>=2.32.1 (from datasets)
  Downloading requests-2.32.3-py3-none-any.whl (64 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m64.9/64.9 kB[0m [31m6.3 MB/s[0m eta [36m0:00:00[0m
Collecting xxhash (from datasets)
  Downloading xxhash-3.4.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (194 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.1/194.1 kB[0m [31m8.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting multiprocess (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl (134 kB)
[2K

In [2]:
from transformers import AdamW, get_linear_schedule_with_warmup
import torch
from transformers import DistilBertTokenizerFast, DistilBertForQuestionAnswering, TrainingArguments, Trainer, DataCollatorWithPadding
from datasets import load_dataset
from torch.utils.data import Dataset

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f'Using device: {device}')

tokenizer = DistilBertTokenizerFast.from_pretrained("distilbert-base-cased")
model = DistilBertForQuestionAnswering.from_pretrained("distilbert-base-cased").to(device)

#load SQuAD dataset
dataset = load_dataset("squad")

Using device: cuda


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]



config.json:   0%|          | 0.00/465 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/263M [00:00<?, ?B/s]

Some weights of DistilBertForQuestionAnswering were not initialized from the model checkpoint at distilbert-base-cased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Downloading readme:   0%|          | 0.00/7.62k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/14.5M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.82M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/87599 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/10570 [00:00<?, ? examples/s]

In [13]:

class SquadDataset(Dataset):
    def __init__(self, tokenizer, examples, max_seq_length=384):
        self.tokenizer = tokenizer
        self.examples = examples
        self.max_seq_length = max_seq_length

    def __len__(self):
        return len(self.examples)

    def __getitem__(self, idx):
        example = self.examples[idx]
        context = example['context']
        question = example['question']
        answer_start = example['answers']['answer_start'][0]
        answer_text = example['answers']['text'][0]

        tokenized = self.tokenizer(
            question,
            context,
            truncation=True,
            max_length=self.max_seq_length,
            padding="max_length",
            return_offsets_mapping=True
        )

        char_to_token = {}
        for i, (start, end) in enumerate(tokenized['offset_mapping']):
            if start != end:
                char_to_token[start] = i

        token_start_index = char_to_token.get(answer_start, 0)
        answer_end = answer_start + len(answer_text)
        token_end_index = char_to_token.get(answer_end - 1, token_start_index)

        tokenized["start_positions"] = token_start_index
        tokenized["end_positions"] = token_end_index

        #convert to tensors
        return {
            "input_ids": torch.tensor(tokenized["input_ids"]),
            "attention_mask": torch.tensor(tokenized["attention_mask"]),
            "start_positions": torch.tensor(token_start_index),
            "end_positions": torch.tensor(token_end_index)
        }


train_data = dataset["train"].select(range(7000))
train_dataset = SquadDataset(tokenizer, train_data)

In [16]:

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

training_args = TrainingArguments(
    per_device_train_batch_size=4,
    num_train_epochs=4,
    learning_rate=3e-05,
    output_dir='./results',
    logging_dir='./logs',
    logging_steps=500,
    save_total_limit=1,
    lr_scheduler_type="linear",
    save_strategy="epoch"
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    data_collator=data_collator,
)

optimizer = AdamW(
    model.parameters(),
    lr=training_args.learning_rate,
    betas=(0.9, 0.999),
    eps=1e-08
)

total_steps = len(train_dataset) // training_args.per_device_train_batch_size * training_args.num_train_epochs

scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=total_steps
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    data_collator=data_collator,
    optimizers=(optimizer, scheduler)
)

trainer.train()

#save the fine-tuned model
model.save_pretrained("./distilbert-squad")
tokenizer.save_pretrained("./distilbert-squad")

model.eval()



Step,Training Loss
500,3.0087
1000,2.3889
1500,2.2155
2000,1.6377
2500,1.2707
3000,1.2916
3500,1.2449
4000,0.6749
4500,0.6777
5000,0.7254


DistilBertForQuestionAnswering(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(28996, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
      

In [40]:

def generate_answer(context, question):
    inputs = tokenizer(question, context, return_tensors="pt", max_length=384, truncation=True, padding="max_length").to(device)
    outputs = model(**inputs)
    start_logits, end_logits = outputs.start_logits, outputs.end_logits

    start_index = torch.argmax(start_logits)
    end_index = torch.argmax(end_logits)

    if end_index <= start_index:
        end_index = start_index + 1  # extending answer span
    end_index = min(end_index, len(inputs["input_ids"][0]) - 1)

    tokens = tokenizer.convert_ids_to_tokens(inputs["input_ids"][0])
    answer = tokenizer.convert_tokens_to_string(tokens[start_index:end_index+1])

    return answer

validation_data = dataset["validation"]
for example in validation_data.select(range(7)): 
    context = example['context']
    question = example['question']
    generated_answer = generate_answer(context, question)
    ground_truth = example['answers']['text'][0]

    print(f"Context: {context}")
    print(f"Question: {question}")
    print(f"Generated Answer: {generated_answer}")
    print(f"Ground Truth: {ground_truth}\n")

Context: Super Bowl 50 was an American football game to determine the champion of the National Football League (NFL) for the 2015 season. The American Football Conference (AFC) champion Denver Broncos defeated the National Football Conference (NFC) champion Carolina Panthers 24–10 to earn their third Super Bowl title. The game was played on February 7, 2016, at Levi's Stadium in the San Francisco Bay Area at Santa Clara, California. As this was the 50th Super Bowl, the league emphasized the "golden anniversary" with various gold-themed initiatives, as well as temporarily suspending the tradition of naming each Super Bowl game with Roman numerals (under which the game would have been known as "Super Bowl L"), so that the logo could prominently feature the Arabic numerals 50.
Question: Which NFL team represented the AFC at Super Bowl 50?
Generated Answer: Denver Broncos
Ground Truth: Denver Broncos

Context: Super Bowl 50 was an American football game to determine the champion of the Nat

In [41]:
!pip install Levenshtein
import Levenshtein

#calculate similarity between two strings using Levenshtein distance
def calculate_similarity(string1, string2):
    return 1 - (Levenshtein.distance(string1.lower(), string2.lower()) / max(len(string1), len(string2)))

total_similarity = 0

with open('distilbert_SQUAD.txt', 'a') as file:
  for i, example in enumerate(validation_data.select(range(100))):
    context = example['context']
    question = example['question']
    generated_answer = generate_answer(context, question)
    true_answers = example['answers']['text']

    file.write(f"Question: {question}\n")
    file.write(f"Generated Answer: {generated_answer}\n")
    file.write(f"Correct Answer: {true_answers[0]}\n\n")

    max_similarity = 0
    for true_answer in true_answers:
        similarity = calculate_similarity(generated_answer, true_answer)
        if similarity > max_similarity:
            max_similarity = similarity
    total_similarity += max_similarity

with open('distilbert_SQUAD.txt', 'a') as file:
    average_similarity = total_similarity / 100
    file.write(f"Average Similarity: {average_similarity:.2%}%\n")
    print(f"Average Similarity: {average_similarity:.2%}")


Average Similarity: 54.73%
