In [1]:
import json
import torch
from tqdm import tqdm
from transformers import AdamW
from typing import Tuple, List, Dict
from torch.utils.data import DataLoader
from transformers import DistilBertTokenizerFast
from transformers import DistilBertForQuestionAnswering

device = ('cuda')

In [2]:
def extract_squad_dataset(json_path: str) -> Tuple[List[str], List[str], List[Dict]]:
    # Read SQuAD formatted dataset from JSON
    json_content = None
    with open(json_path, 'rb') as squad_file:
        json_content = json.load(squad_file)
    
    # Initialize collection containers
    extracted_contexts = []
    extracted_questions = []
    extracted_answers = []
    
    # Traverse the nested JSON structure
    for data_entry in json_content['data']:
        for paragraph_data in data_entry['paragraphs']:
            paragraph_text = paragraph_data['context']
            
            # Process each Q&A pair
            for qa_pair in paragraph_data['qas']:
                current_question = qa_pair['question']
                
                # Handle different answer formats
                answer_field = 'plausible_answers' \
                    if 'plausible_answers' in qa_pair \
                    else 'answers'
                
                # Store each answer variation
                for ans in qa_pair[answer_field]:
                    extracted_contexts.append(paragraph_text)
                    extracted_questions.append(current_question)
                    extracted_answers.append(ans)
    
    return extracted_contexts, extracted_questions, extracted_answers

TRAIN_PATH = 'Spoken-SQuAD-master/spoken_train-v1.1.json'
VAL_PATH = 'Spoken-SQuAD-master/spoken_test-v1.1.json'

# Load training and validation data
train_contexts, train_questions, train_answers = extract_squad_dataset(TRAIN_PATH)
val_contexts, val_questions, val_answers = extract_squad_dataset(VAL_PATH)

In [3]:
def calculate_span_boundaries(answer_set: List[Dict], context_set: List[str]) -> None:
   """
   Enhance answer dictionaries with end positions and handle alignment corrections.
   Updates the answer dictionaries in-place with validated span indices.
   """
   for answer_dict, context_text in zip(answer_set, context_set):
       # Extract answer metadata
       target_text = answer_dict['text']
       initial_pos = answer_dict['answer_start']
       predicted_end = initial_pos + len(target_text)
       
       # Verify span alignment
       extracted_span = context_text[initial_pos:predicted_end]
       
       if target_text == extracted_span:
           # Direct match found
           answer_dict['answer_end'] = predicted_end
           continue
           
       # Handle misalignment with small offsets
       MAX_OFFSET = 2
       for shift in range(1, MAX_OFFSET + 1):
           adjusted_start = initial_pos - shift
           adjusted_end = predicted_end - shift
           
           candidate_span = context_text[adjusted_start:adjusted_end]
           if candidate_span == target_text:
               # Update with corrected boundaries
               answer_dict['answer_start'] = adjusted_start
               answer_dict['answer_end'] = adjusted_end
               break

# Process answer spans for both datasets
calculate_span_boundaries(train_answers, train_contexts)
calculate_span_boundaries(val_answers, val_contexts)

In [4]:
# Initialize tokenizer
tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')

# Create encodings for training and validation sets
train_encodings = tokenizer(train_contexts, train_questions, truncation=True, padding=True)
val_encodings = tokenizer(val_contexts, val_questions, truncation=True, padding=True)

In [5]:
def map_char_to_token_positions(encoded_data, answer_data: List[Dict]) -> None:
   """
   Convert character-level answer spans to token indices in the encoded sequences.
   Updates the encoded_data object with token-level answer boundaries.
   """
   token_starts = []
   token_ends = []
   
   # Process each answer span
   for idx, answer_info in enumerate(answer_data):
       # Map character positions to tokens
       start_token = encoded_data.char_to_token(idx, answer_info['answer_start'])
       end_token = encoded_data.char_to_token(idx, answer_info['answer_end'])
       
       # Handle edge case for start position
       if start_token is None:
           start_token = tokenizer.model_max_length
           
       token_starts.append(start_token)
       
       # Handle edge case for end position with backoff strategy
       offset = 1
       while end_token is None:
           adjusted_pos = answer_info['answer_end'] - offset
           end_token = encoded_data.char_to_token(idx, adjusted_pos)
           offset += 1
           
       token_ends.append(end_token)
   
   # Update encoding object with token positions
   position_data = {
       'start_positions': token_starts,
       'end_positions': token_ends
   }
   encoded_data.update(position_data)

# Process token positions for both datasets
map_char_to_token_positions(train_encodings, train_answers)
map_char_to_token_positions(val_encodings, val_answers)

In [6]:
class QuestionAnsweringDataset(torch.utils.data.Dataset):
   """
   Dataset wrapper for question answering task that converts encodings to tensors.
   Handles batched access to encoded QA examples.
   """
   
   def __init__(self, encoded_features):
       """
       Initialize dataset with encoded features.
       
       Args:
           encoded_features: Tokenized and encoded input features
       """
       self.features = encoded_features
       
   def __getitem__(self, index: int) -> Dict[str, torch.Tensor]:
       """
       Retrieve a single encoded example.
       
       Args:
           index: Position of the example to fetch
           
       Returns:
           Dict mapping feature names to tensor values
       """
       return {
           feature_name: torch.tensor(feature_values[index]) 
           for feature_name, feature_values in self.features.items()
       }
   
   def __len__(self) -> int:
       """
       Get total number of examples in dataset.
       
       Returns:
           Dataset size based on number of input sequences
       """
       return len(self.features.input_ids)

# Create dataset objects
train_dataset = QuestionAnsweringDataset(train_encodings)
val_dataset = QuestionAnsweringDataset(val_encodings)

# Initialize the model
model = DistilBertForQuestionAnswering.from_pretrained("distilbert-base-uncased")

Some weights of DistilBertForQuestionAnswering were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
import torch
from torch.utils.data import DataLoader
from transformers import AdamW
from tqdm import tqdm

def configure_training_environment(model):
  """Set up computing device and model state"""
  computing_device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
  model.to(computing_device)
  model.train()
  return computing_device

def initialize_training_components(model, dataset):
  """Configure optimizer and data loader"""
  LEARNING_RATE = 2e-6
  BATCH_SIZE = 16
  
  optimizer = AdamW(model.parameters(), lr=LEARNING_RATE)
  training_loader = DataLoader(
      dataset, 
      batch_size=BATCH_SIZE, 
      shuffle=True
  )
  return optimizer, training_loader

def train_model(model, train_dataset, num_epochs=10):
  """Execute training loop for question answering model"""
  # Setup training environment
  device = configure_training_environment(model)
  optimizer, data_loader = initialize_training_components(model, train_dataset)
  
  # Training iterations
  for epoch_num in range(num_epochs):
      model.train()
      batch_progress = tqdm(
          data_loader, 
          desc=f'Epoch {epoch_num}', 
          leave=True
      )
      
      for batch_data in batch_progress:
          # Prepare batch
          optimizer.zero_grad()
          
          # Transfer batch to device
          batch_tensors = {
              'input_ids': batch_data['input_ids'].to(device),
              'attention_mask': batch_data['attention_mask'].to(device),
              'start_positions': batch_data['start_positions'].to(device),
              'end_positions': batch_data['end_positions'].to(device)
          }
          
          # Model forward pass
          model_output = model(
              input_ids=batch_tensors['input_ids'],
              attention_mask=batch_tensors['attention_mask'],
              start_positions=batch_tensors['start_positions'],
              end_positions=batch_tensors['end_positions']
          )
          
          # Optimization step
          training_loss = model_output[0]
          training_loss.backward()
          optimizer.step()
          
          # Progress tracking
          batch_progress.set_postfix(loss=training_loss.item())

# Execute training
train_model(model, train_dataset)

Epoch 0: 100%|██████████| 2320/2320 [1:20:17<00:00,  2.08s/it, loss=2.52]
Epoch 1: 100%|██████████| 2320/2320 [1:20:32<00:00,  2.08s/it, loss=2.83]
Epoch 2: 100%|██████████| 2320/2320 [1:20:35<00:00,  2.08s/it, loss=2.56]
Epoch 3: 100%|██████████| 2320/2320 [1:20:38<00:00,  2.09s/it, loss=2.11]
Epoch 4: 100%|██████████| 2320/2320 [1:20:33<00:00,  2.08s/it, loss=2.3]  
Epoch 5: 100%|██████████| 2320/2320 [1:20:17<00:00,  2.08s/it, loss=1.37] 
Epoch 6: 100%|██████████| 2320/2320 [1:20:21<00:00,  2.08s/it, loss=0.583]
Epoch 7: 100%|██████████| 2320/2320 [1:20:18<00:00,  2.08s/it, loss=1.14] 
Epoch 8: 100%|██████████| 2320/2320 [1:20:10<00:00,  2.07s/it, loss=1.89] 
Epoch 9: 100%|██████████| 2320/2320 [1:20:03<00:00,  2.07s/it, loss=0.803]


In [8]:
import os
if not os.path.exists('../models'):
   os.makedirs('../models')
model_path = 'models/distilbert-custom'
model.save_pretrained(model_path)
tokenizer.save_pretrained(model_path)

('models/distilbert-custom\\tokenizer_config.json',
 'models/distilbert-custom\\special_tokens_map.json',
 'models/distilbert-custom\\vocab.txt',
 'models/distilbert-custom\\added_tokens.json',
 'models/distilbert-custom\\tokenizer.json')

In [None]:
model = DistilBertForQuestionAnswering.from_pretrained("models/distilbert-custom")
model.to('cuda')

DistilBertForQuestionAnswering(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): DistilBertSdpaAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
     

In [None]:
# Set model to evaluation mode
model.eval()

# Initialize data pipeline
evaluation_dataloader = DataLoader(val_dataset, batch_size=16)
accuracy_scores = []
predicted_spans = []
ground_truth_spans = []

# Iterate through batches
for sample in tqdm(evaluation_dataloader):
    with torch.no_grad():
        # Extract batch data
        x = sample['input_ids'].to(device)
        mask = sample['attention_mask'].to(device)
        true_start = sample['start_positions'].to(device)
        true_end = sample['end_positions'].to(device)
        
        # Forward pass
        model_output = model(x, attention_mask=mask)
        
        # Extract predictions
        predicted_start = torch.argmax(model_output['start_logits'], dim=1)
        predicted_end = torch.argmax(model_output['end_logits'], dim=1)
        
        # Compute batch accuracy
        start_acc = ((predicted_start == true_start).sum() / len(predicted_start)).item()
        end_acc = ((predicted_end == true_end).sum() / len(predicted_end)).item()
        accuracy_scores.extend([start_acc, end_acc])
        
        # Process each sequence in batch
        for idx in range(predicted_start.shape[0]):
            tokens = tokenizer.convert_ids_to_tokens(sample['input_ids'][idx])
            
            # Extract predicted and true spans
            pred_span = ' '.join(tokens[predicted_start[idx]:predicted_end[idx] + 1])
            true_span = ' '.join(tokens[true_start[idx]:true_end[idx] + 1])
            
            # Convert predicted span back to clean text
            pred_token_ids = tokenizer.convert_tokens_to_ids(pred_span.split())
            cleaned_pred = tokenizer.decode(pred_token_ids)
            
            # Store results
            predicted_spans.append(cleaned_pred)
            ground_truth_spans.append(true_span)

100%|██████████| 993/993 [13:13<00:00,  1.25it/s]


In [8]:
from __future__ import print_function
import re
import string
from collections import Counter

def clean_text(text):
   """Standardizes input text by removing articles, extra whitespace, punctuation and converting to lowercase"""
   
   def strip_articles(txt):
       return re.sub(r'\b(a|an|the)\b', ' ', txt)
   
   def standardize_spacing(txt): 
       return ' '.join(txt.split())
   
   def strip_punctuation(txt):
       punct_set = set(string.punctuation)
       return ''.join(c for c in txt if c not in punct_set)
   
   text = text.lower()
   text = strip_punctuation(text)
   text = strip_articles(text)
   text = standardize_spacing(text)
   return text

def check_exact_match(pred, target):
   """Checks if prediction exactly matches target after normalization"""
   return clean_text(pred) == clean_text(target)

def get_best_score(scoring_func, pred, targets):
   """Returns highest score when comparing prediction against multiple ground truth targets"""
   scores = [scoring_func(pred, target) for target in targets]
   return max(scores) if scores else 0

def calculate_f1(pred, target):
   """Calculates F1 score between prediction and target"""
   pred_toks = clean_text(pred).split()
   target_toks = clean_text(target).split()
   
   overlap = Counter(pred_toks) & Counter(target_toks)
   matches = sum(overlap.values())
   
   if matches == 0:
       return 0
       
   prec = matches / len(pred_toks)
   rec = matches / len(target_toks)
   f1 = 2 * (prec * rec) / (prec + rec)
   return f1

def compute_metrics(targets, preds):
   """Computes evaluation metrics (exact match and F1) for predictions"""
   total_examples = 0
   total_f1 = 0
   total_exact = 0
   
   for target, pred in zip(targets, preds):
       total_examples += 1
       total_exact += get_best_score(check_exact_match, pred, [target]) 
       total_f1 += get_best_score(calculate_f1, pred, [target])

   f1_score = 100.0 * total_f1 / total_examples
   
   return {'f1': f1_score}

In [None]:
compute_metrics(ground_truth_spans,predicted_spans)

{'f1': 52.53881732602788}