In [3]:
import json

def read_squad(path):
    """Load a SQuAD-style JSON file into three index-aligned lists.

    The file is expected to have the nesting
    ``data -> paragraphs -> qas -> answers`` (or ``plausible_answers``).
    One entry is emitted per answer, so ``contexts[i]``, ``questions[i]`` and
    ``answers[i]`` always describe the same example. A question with several
    reference answers therefore appears several times.

    Args:
        path: Path of the JSON file to read.

    Returns:
        A ``(contexts, questions, answers)`` tuple of equally long lists.
        ``answers`` holds the answer dicts exactly as stored in the file.
    """
    # JSON is UTF-8 by specification; do not depend on the platform default encoding.
    with open(path, 'r', encoding='utf-8') as f:
        squad_dict = json.load(f)

    contexts = []
    questions = []
    answers = []

    for group in squad_dict['data']:
        for passage in group['paragraphs']:
            context = passage['context']
            for qa in passage['qas']:
                question = qa['question']
                # Prefer 'plausible_answers' when present; only touch 'answers' otherwise,
                # so a record that lacks 'answers' does not raise KeyError.
                if 'plausible_answers' in qa:
                    qa_answers = qa['plausible_answers']
                else:
                    qa_answers = qa['answers']
                # Repeat context and question per answer to keep the lists aligned.
                for answer in qa_answers:
                    contexts.append(context)
                    questions.append(question)
                    answers.append(answer)

    return contexts, questions, answers

# Locations of the two dataset files.
# NOTE(review): these are absolute, machine-specific paths; the notebook only runs where they exist.
TRAIN_JSON_PATH = 'C:/Users/Jayanth/Deeplearning/DeepLearningHW3/dataset/spoken_train-v1.1.json'
VAL_JSON_PATH = 'C:/Users/Jayanth/Deeplearning/DeepLearningHW3/dataset/spoken_test-v1.1.json'

# Parse both splits into (contexts, questions, answers) lists
train_contexts, train_questions, train_answers = read_squad(TRAIN_JSON_PATH)
val_contexts, val_questions, val_answers = read_squad(VAL_JSON_PATH)

In [5]:
def add_end_idx(answers, contexts):
    """Add an ``answer_end`` character index to every answer dict, in place.

    For each ``(answer, context)`` pair the end index is
    ``answer_start + len(text)``. When the context slice at that span does not
    equal the answer text, shifts of -2, -1, +1 and +2 characters are tried in
    that order; the first shift that matches also rewrites ``answer_start``.

    If no shift matches, the unshifted span is kept so that ``answer_end`` is
    always present afterwards.

    Args:
        answers: List of dicts with ``'text'`` and ``'answer_start'`` keys.
        contexts: List of context strings, paired with ``answers`` by position.

    Returns:
        None. The dicts in ``answers`` are modified in place.
    """
    for answer, context in zip(answers, contexts):
        gold_text = answer['text']
        start_idx = answer['answer_start']
        end_idx = start_idx + len(gold_text)

        # Default to the labelled span; it is overwritten below if a shifted match is found.
        answer['answer_end'] = end_idx
        if context[start_idx:end_idx] == gold_text:
            continue

        # Labels may be off by a character or two; probe nearby spans.
        for offset in (-2, -1, 1, 2):
            shifted_start = start_idx + offset
            if shifted_start < 0:
                # A negative index would wrap to the end of the string and could
                # produce a bogus match with a negative answer_start.
                continue
            shifted_end = end_idx + offset
            if context[shifted_start:shifted_end] == gold_text:
                answer['answer_start'] = shifted_start
                answer['answer_end'] = shifted_end
                break


In [6]:
from transformers import DistilBertTokenizerFast

# Initialize the fast tokenizer. No call-time options are passed here: the
# training loop reads batch['attention_mask'], so the mask must stay enabled.
tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')

# Tokenize (context, question) pairs, truncating and padding each split.
# The results are deliberately kept as the tokenizer's own encoding objects and
# are NOT moved to the GPU here:
#   * add_token_positions() calls encodings.char_to_token(...), which a plain
#     dict built from .items() does not provide;
#   * SquadDataset converts each item with torch.tensor(...), and the training /
#     evaluation loops move every batch to `device` themselves.
train_encodings = tokenizer(train_contexts, train_questions, truncation=True, padding=True)
val_encodings = tokenizer(val_contexts, val_questions, truncation=True, padding=True)


In [7]:
def add_token_positions(encodings, answers, max_length=None):
    """Attach token-level answer spans to ``encodings`` in place.

    For answer ``i`` the first and last answer characters are mapped to token
    indices with ``encodings.char_to_token(i, ...)``. The two resulting lists
    are stored under the keys ``'start_positions'`` and ``'end_positions'``.

    When ``char_to_token`` returns ``None`` (no token for that character, e.g.
    presumably because the answer was truncated away), an out-of-range index is
    stored instead. It is chosen as follows:

    1. ``max_length`` if given,
    2. else ``encodings.model_max_length`` if that attribute exists,
    3. else the length of sample ``i``'s ``input_ids``.

    NOTE(review): this assumes the QA model ignores positions at or beyond the
    sequence length when computing the loss; confirm against the model docs.

    Args:
        encodings: Object providing ``char_to_token(batch_index, char_index)``,
            ``update(dict)`` and, for the last fallback, ``['input_ids']``.
        answers: List of dicts with ``'answer_start'`` and ``'text'`` keys,
            index-aligned with the samples in ``encodings``.
        max_length: Optional explicit out-of-range index for unmapped positions.

    Returns:
        None. ``encodings`` is updated in place.
    """
    if max_length is None:
        # The original code read this attribute unconditionally; look it up
        # defensively because the encodings object may not carry it.
        max_length = getattr(encodings, 'model_max_length', None)

    start_positions = []
    end_positions = []

    for i, answer in enumerate(answers):
        # Character span of the answer inside its context
        start_char_pos = answer['answer_start']
        end_char_pos = start_char_pos + len(answer['text'])

        start_token_pos = encodings.char_to_token(i, start_char_pos)
        # end_char_pos is exclusive, so the last answer character is at end_char_pos - 1
        end_token_pos = encodings.char_to_token(i, end_char_pos - 1)

        # Compare against None explicitly: token index 0 is a valid result and
        # must not be mistaken for "not found".
        if start_token_pos is None or end_token_pos is None:
            if max_length is not None:
                out_of_range = max_length
            else:
                out_of_range = len(encodings['input_ids'][i])
            if start_token_pos is None:
                start_token_pos = out_of_range
            if end_token_pos is None:
                end_token_pos = out_of_range

        start_positions.append(start_token_pos)
        end_positions.append(end_token_pos)

    # Expose the token-based spans alongside the other encoding fields
    encodings.update({'start_positions': start_positions, 'end_positions': end_positions})

In [8]:
import torch
class SquadDataset(torch.utils.data.Dataset):
    """Map-style dataset over a mapping of per-sample encoding fields.

    ``encodings`` maps field names (e.g. ``'input_ids'``) to sequences with one
    entry per sample. Any object offering ``items()`` and key lookup works,
    which covers both plain dicts and the tokenizer's encoding objects.
    """

    def __init__(self, encodings):
        """Store the encodings; nothing is copied or converted here."""
        self.encodings = encodings

    def __getitem__(self, idx):
        """Return sample ``idx`` as a dict of tensors, one per encoding field."""
        return {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}

    def __len__(self):
        """Number of samples, taken from the ``'input_ids'`` field."""
        # Key lookup instead of attribute access so plain dicts work as well.
        return len(self.encodings['input_ids'])

# Wrap the training and validation encodings in SquadDataset objects
train_dataset, val_dataset = (
    SquadDataset(split_encodings)
    for split_encodings in (train_encodings, val_encodings)
)

In [9]:
from transformers import DistilBertForQuestionAnswering
# Load the pretrained checkpoint into the question-answering model class
checkpoint_name = "distilbert-base-uncased"
model = DistilBertForQuestionAnswering.from_pretrained(checkpoint_name)

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForQuestionAnswering: ['vocab_projector.weight', 'vocab_projector.bias', 'vocab_layer_norm.weight', 'vocab_transform.bias', 'vocab_layer_norm.bias', 'vocab_transform.weight']
- This IS expected if you are initializing DistilBertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForQuestionAnswering were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this mode

In [10]:
## Data Preprocessing

In [11]:
from transformers import BertForQuestionAnswering
from transformers import AdamW
from torch.utils.data import DataLoader
from tqdm import tqdm

# move model over to detected device
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
model.to(device)

# AdamW over all model parameters; weight decay is left at the optimizer's default
optim = AdamW(model.parameters(), lr=2e-6)

# shuffled mini-batches of 16 samples
# NOTE(review): each batch must contain 'start_positions' / 'end_positions';
# make sure the token positions were added to the encodings before the dataset was built.
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)

# number of mini-batches whose gradients are accumulated before each optimizer step
accumulation_steps = 4

for epoch in range(3):
    # training mode (enables dropout etc.)
    model.train()
    # tqdm progress bar over the batches; label it up front so the epoch is
    # visible from the first batch, not only after the first optimizer step
    loop = tqdm(train_loader, leave=True)
    loop.set_description(f'Epoch {epoch}')
    # start every epoch from clean gradients
    optim.zero_grad()
    # running sum / count of the (unscaled) losses in the current accumulation window
    accum_loss = 0
    accum_count = 0
    for i, batch in enumerate(loop):
        # pull all the tensor batches required for training
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        start_positions = batch['start_positions'].to(device)
        end_positions = batch['end_positions'].to(device)
        # forward pass; passing the positions makes the model return the loss first
        outputs = model(input_ids, attention_mask=attention_mask,
                        start_positions=start_positions,
                        end_positions=end_positions)
        loss = outputs[0]
        # Scale before backward so the accumulated gradient is the mean over a
        # full window rather than accumulation_steps times that mean.
        (loss / accumulation_steps).backward()
        # track the unscaled loss for reporting
        accum_loss += loss.item()
        accum_count += 1
        if (i+1) % accumulation_steps == 0:
            # apply the accumulated gradients, then clear them for the next window
            optim.step()
            optim.zero_grad()
            # report the mean loss of the window that was just applied
            avg_loss = accum_loss / accum_count
            loop.set_postfix(loss=avg_loss)
            accum_loss = 0
            accum_count = 0

    # The number of batches may not be a multiple of accumulation_steps:
    # apply whatever gradients are left from the last, partial window.
    if accum_count > 0:
        optim.step()
        optim.zero_grad()
        avg_loss = accum_loss / accum_count
        loop.set_postfix(loss=avg_loss)


Epoch 0: 100%|██████████| 2320/2320 [14:16<00:00,  2.71it/s, loss=3.21]
Epoch 1: 100%|██████████| 2320/2320 [14:14<00:00,  2.72it/s, loss=2.98]
Epoch 2: 100%|██████████| 2320/2320 [14:14<00:00,  2.72it/s, loss=2.54]


In [14]:
import numpy as np

# set the model to evaluation mode
model.eval()

# unshuffled validation batches of 16 samples
# NOTE(review): each batch must contain 'start_positions' / 'end_positions'.
val_loader = DataLoader(val_dataset, batch_size=16)

# total number of validation samples
n_samples = len(val_dataset)

# per-sample true and predicted start/end token positions
true_starts = np.zeros(n_samples, dtype=np.int32)
true_ends = np.zeros(n_samples, dtype=np.int32)
pred_starts = np.zeros(n_samples, dtype=np.int32)
pred_ends = np.zeros(n_samples, dtype=np.int32)

# progress bar over the validation batches
loop = tqdm(val_loader)

# Running write offset into the result arrays. It must advance by the size of
# each batch actually seen: deriving it from `batch_index * len(current_batch)`
# misplaces the final batch whenever that batch is shorter than the others.
idx = 0

for batch in loop:

    # no gradients are needed for evaluation
    with torch.no_grad():

        # inputs and targets of this batch
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        start_true = batch['start_positions'].to(device)
        end_true = batch['end_positions'].to(device)

        # forward pass without labels: only logits are needed
        outputs = model(input_ids, attention_mask=attention_mask)

        # most likely start / end token per sample
        start_pred = torch.argmax(outputs['start_logits'], dim=1)
        end_pred = torch.argmax(outputs['end_logits'], dim=1)

        # store this batch's values at the current offset
        batch_size = len(start_true)
        true_starts[idx:idx+batch_size] = start_true.cpu().numpy()
        true_ends[idx:idx+batch_size] = end_true.cpu().numpy()
        pred_starts[idx:idx+batch_size] = start_pred.cpu().numpy()
        pred_ends[idx:idx+batch_size] = end_pred.cpu().numpy()
        idx += batch_size

# calculate true positives, false positives, and false negatives
# NOTE(review): argmax never yields -1, and nothing visible in this notebook sets
# a true position to -1, so the `!= -1` masks appear to be always true. In that
# case false positives equal false negatives and the score below reduces to the
# fraction of exactly matching positions; confirm this is the intended metric.
true_pos_starts = np.sum((true_starts == pred_starts) & (true_starts != -1))
true_pos_ends = np.sum((true_ends == pred_ends) & (true_ends != -1))
false_pos_starts = np.sum((true_starts != pred_starts) & (pred_starts != -1))
false_pos_ends = np.sum((true_ends != pred_ends) & (pred_ends != -1))
false_neg_starts = np.sum((true_starts != pred_starts) & (true_starts != -1))
false_neg_ends = np.sum((true_ends != pred_ends) & (true_ends != -1))

# calculate precision and recall (1e-9 guards against division by zero)
precision_starts = true_pos_starts / (true_pos_starts + false_pos_starts + 1e-9)
recall_starts = true_pos_starts / (true_pos_starts + false_neg_starts + 1e-9)
precision_ends = true_pos_ends / (true_pos_ends + false_pos_ends + 1e-9)
recall_ends = true_pos_ends / (true_pos_ends + false_neg_ends + 1e-9)

# F1 for start and end positions separately, then their mean
f1_starts = 2 * (precision_starts * recall_starts) / (precision_starts + recall_starts + 1e-9)
f1_ends = 2 * (precision_ends * recall_ends) / (precision_ends + recall_ends + 1e-9)
f1 = (f1_starts + f1_ends) / 2

# print the F1 score
print("F1 - {:.4f}".format(f1))


100%|██████████| 993/993 [01:56<00:00,  8.51it/s]

F1 - 0.4293





In [None]:
# Preprocessing ends

In [None]:
# Model fine-tuning

In [20]:
import torch
from torch.utils.data import DataLoader
from transformers import AutoTokenizer, AutoModelForQuestionAnswering, AdamW, get_linear_schedule_with_warmup
from tqdm import tqdm
from accelerate import Accelerator

# check if GPU is available
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

# set up tokenizer
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

# set up a fresh model from the pretrained checkpoint
model = AutoModelForQuestionAnswering.from_pretrained("distilbert-base-uncased")

# move model to device
model.to(device)

# initialize optimizer
optim = AdamW(model.parameters(), lr=2e-6)

# number of passes over the training data; used for both the scheduler length
# and the epoch loop so the two cannot drift apart
num_epochs = 3

# Data loader for training data. This reuses the `train_dataset` built earlier
# in the notebook; the former `train_dataset = ...` placeholder replaced it with
# Ellipsis, which DataLoader cannot iterate.
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)

# total number of optimizer steps (one per batch)
num_training_steps = len(train_loader) * num_epochs

# linear decay of the learning rate to zero over all steps, without warmup
scheduler = get_linear_schedule_with_warmup(
    optim, num_warmup_steps=0, num_training_steps=num_training_steps
)

# initialize accelerator
accelerator = Accelerator()

# let the accelerator wrap model, optimizer, data loader and scheduler;
# from here on the wrapped `optimizer` is used instead of `optim`
model, optimizer, train_loader, scheduler = accelerator.prepare(
    model, optim, train_loader, scheduler
)

# loop over training data for multiple epochs
for epoch in range(num_epochs):
    # set model to training mode
    model.train()
    # set up progress bar
    loop = tqdm(train_loader, leave=True)
    for batch in loop:
        # clear gradients left over from the previous step
        optimizer.zero_grad()
        # pull all the tensor batches required for training
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        start_positions = batch['start_positions'].to(device)
        end_positions = batch['end_positions'].to(device)
        # forward pass; passing the positions makes the model compute the loss
        outputs = model(input_ids, attention_mask=attention_mask,
                        start_positions=start_positions,
                        end_positions=end_positions)
        # extract loss
        loss = outputs.loss
        # backpropagate through the accelerator
        accelerator.backward(loss)
        # update model parameters
        optimizer.step()
        # advance the learning-rate schedule by one step
        scheduler.step()
        # show epoch, current loss and learning rate on the progress bar
        loop.set_description(f'Epoch {epoch}')
        loop.set_postfix(loss=loss.item(), lr=optimizer.param_groups[0]['lr'])

        # additionally log a full line every 500 batches
        if (loop.n % 500 == 0):
            print(f"Batch {loop.n}/{len(train_loader)} - Loss: {loss.item()}, Learning Rate: {optimizer.param_groups[0]['lr']}")

            
#Evaluation
import re
import string
from collections import Counter

def normalize_answer(s):
    """Lower text and remove punctuation, articles and extra whitespace.

    Steps, in order: lowercase and strip; replace the standalone words
    ``a``/``an``/``the`` with a space; replace every character of
    ``string.punctuation`` with a space; collapse runs of whitespace.

    Args:
        s: The answer string to normalize.

    Returns:
        The normalized string, with tokens separated by single spaces.
    """
    s = s.lower().strip()
    s = re.sub(r'\b(a|an|the)\b', ' ', s)
    # Escape the punctuation before placing it in a character class: unescaped,
    # the backslash in string.punctuation escapes the following ']' instead of
    # being matched itself, so backslashes were never removed.
    s = re.sub('[{}]'.format(re.escape(string.punctuation)), ' ', s)
    s = ' '.join(s.split())
    return s

def exact_match_score(prediction, ground_truth):
    """Return True when both strings are equal after ``normalize_answer``."""
    normalized_prediction = normalize_answer(prediction)
    normalized_truth = normalize_answer(ground_truth)
    return normalized_prediction == normalized_truth


def metric_max_over_ground_truths(metric_fn, prediction, ground_truths):
    """Score ``prediction`` against each reference and return the best score.

    ``metric_fn(prediction, ground_truth)`` is evaluated for every element of
    ``ground_truths``; the maximum is returned, or 0 when there are none.
    """
    return max(
        (metric_fn(prediction, reference) for reference in ground_truths),
        default=0,
    )

def f1_score(prediction, ground_truth):
    """Token-overlap F1 between a prediction and one reference answer.

    Both strings are normalized with ``normalize_answer`` and split on
    whitespace. Overlap is counted as a multiset intersection of the tokens.
    Returns 0 when no token is shared.
    """
    pred_tokens = normalize_answer(prediction).split()
    truth_tokens = normalize_answer(ground_truth).split()

    # Multiset intersection: each token counts as often as it occurs in both
    overlap = sum((Counter(pred_tokens) & Counter(truth_tokens)).values())
    if overlap == 0:
        return 0

    precision = 1.0 * overlap / len(pred_tokens)
    recall = 1.0 * overlap / len(truth_tokens)
    return (2 * precision * recall) / (precision + recall)

def evaluate(gold_answers, predictions):
    """Compute exact-match and F1 percentages over paired answers.

    Each element of ``gold_answers`` may be a single reference string or an
    iterable of reference strings. For every prediction the best score over
    its references is used, for both metrics.

    Args:
        gold_answers: Reference answer(s) per example.
        predictions: Predicted answer strings, paired with ``gold_answers`` by
            position. Pairing stops at the shorter of the two.

    Returns:
        Dict with keys ``'f1'`` and ``'exact_match'``, each in the range 0-100.
        Both are 0.0 when there are no pairs.
    """
    f1 = exact_match = total = 0

    for ground_truths, prediction in zip(gold_answers, predictions):
        # Treat a bare string as one reference. Iterating it directly would
        # compare the prediction against single characters.
        if isinstance(ground_truths, str):
            ground_truths = [ground_truths]

        total += 1
        # Both metrics score against the same list of references.
        exact_match += metric_max_over_ground_truths(
            exact_match_score, prediction, ground_truths)
        f1 += metric_max_over_ground_truths(
            f1_score, prediction, ground_truths)

    if total == 0:
        # Nothing to score; avoid dividing by zero.
        return {'f1': 0.0, 'exact_match': 0.0}

    exact_match = 100.0 * exact_match / total
    f1 = 100.0 * f1 / total

    return {'f1': f1, 'exact_match': exact_match}


Epoch 0:   0%|          | 1/2320 [00:00<14:30,  2.67it/s, loss=1.51, lr=2e-6]

Batch 0/2320 - Loss: 1.5059566497802734, Learning Rate: 1.999971264367816e-06


Epoch 0: 100%|██████████| 2320/2320 [14:11<00:00,  2.72it/s, loss=1.48, lr=1.93e-6] 
Epoch 1:   0%|          | 1/2320 [00:00<14:20,  2.70it/s, loss=2.4, lr=1.93e-6]

Batch 0/2320 - Loss: 2.3994789123535156, Learning Rate: 1.933304597701149e-06


Epoch 1: 100%|██████████| 2320/2320 [14:09<00:00,  2.73it/s, loss=1.25, lr=1.87e-6] 
Epoch 2:   0%|          | 1/2320 [00:00<14:48,  2.61it/s, loss=1.27, lr=1.87e-6]

Batch 0/2320 - Loss: 1.2688286304473877, Learning Rate: 1.8666379310344826e-06


Epoch 2: 100%|██████████| 2320/2320 [14:09<00:00,  2.73it/s, loss=1.01, lr=1.8e-6]  

f1 = 0.4378



