In [1]:
%%capture
!pip install transformers

In [2]:
pip install pandas



In [3]:
%%capture
import json
import pandas as pd
import numpy as np
from pathlib import Path
import torch
from torch.utils.data import DataLoader, TensorDataset
from transformers import BertTokenizer, BertForQuestionAnswering
import time

In [4]:
import torch

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Display the device being used
if device.type == "cuda":
    print("CUDA is enabled. Running on GPU.")
else:
    print("CUDA not available. Running on CPU.")


CUDA is enabled. Running on GPU.


In [5]:
%%capture
!mkdir squad
!wget https://rajpurkar.github.io/SQuAD-explorer/dataset/train-v2.0.json -O squad/train-v2.0.json
!wget https://rajpurkar.github.io/SQuAD-explorer/dataset/dev-v2.0.json -O squad/dev-v2.0.json

In [6]:
import json

# Define the file path for SQuAD 2.0 training data
path = "squad/train-v2.0.json"

# Function to load and preprocess SQuAD 2.0 data
def load_squad_data(file_path, sample_limit=22):
    with open(file_path, 'r') as file:
        squad_data = json.load(file)

    contexts, questions, answers = [], [], []
    sample_count = 0

    for topic in squad_data['data']:
        for paragraph in topic['paragraphs']:
            context_text = paragraph['context']
            for qa in paragraph['qas']:
                if not qa.get("is_impossible", False):  # Include only answerable questions
                    question_text = qa['question']
                    first_answer = qa['answers'][0]  # Select the first available answer
                    contexts.append(context_text)
                    questions.append(question_text)
                    answers.append(first_answer)
                    sample_count += 1
                    if sample_count >= sample_limit:  # Limit the number of samples
                        return contexts, questions, answers

    return contexts, questions, answers


In [7]:
import json
from pathlib import Path

# Path to SQuAD 2.0 validation data
path = Path('squad/dev-v2.0.json')

# Function to load and preprocess SQuAD 2.0 validation data
def load_squad_data(file_path, sample_limit=22):
    """
    Load and preprocess the SQuAD 2.0 dataset.

    Args:
        file_path (Path or str): Path to the SQuAD 2.0 JSON file.
        sample_limit (int): Number of samples to extract. Default is 22.

    Returns:
        tuple: Lists of contexts, questions, and answers.
    """
    with open(file_path, 'r') as file:
        squad_data = json.load(file)

    contexts, questions, answers = [], [], []
    sample_count = 0

    for topic in squad_data['data']:
        for paragraph in topic['paragraphs']:
            context_text = paragraph['context']
            for qa in paragraph['qas']:
                if not qa.get("is_impossible", False):  # Include only answerable questions
                    question_text = qa['question']
                    first_answer = qa['answers'][0]  # Select the first available answer
                    contexts.append(context_text)
                    questions.append(question_text)
                    answers.append(first_answer)
                    sample_count += 1
                    if sample_count >= sample_limit:  # Limit the number of samples
                        return contexts, questions, answers

    return contexts, questions, answers

# Load validation data
val_texts, val_queries, val_answers = load_squad_data(path, sample_limit=22)

# Display results for debugging (optional)
print("Sample Contexts:", val_texts[:2])
print("Sample Questions:", val_queries[:2])
print("Sample Answers:", val_answers[:2])


Sample Contexts: ['The Normans (Norman: Nourmands; French: Normands; Latin: Normanni) were the people who in the 10th and 11th centuries gave their name to Normandy, a region in France. They were descended from Norse ("Norman" comes from "Norseman") raiders and pirates from Denmark, Iceland and Norway who, under their leader Rollo, agreed to swear fealty to King Charles III of West Francia. Through generations of assimilation and mixing with the native Frankish and Roman-Gaulish populations, their descendants would gradually merge with the Carolingian-based cultures of West Francia. The distinct cultural and ethnic identity of the Normans emerged initially in the first half of the 10th century, and it continued to evolve over the succeeding centuries.', 'The Normans (Norman: Nourmands; French: Normands; Latin: Normanni) were the people who in the 10th and 11th centuries gave their name to Normandy, a region in France. They were descended from Norse ("Norman" comes from "Norseman") raid

In [8]:
# Extract 22 samples from the SQuAD 2.0 dataset
train_texts, train_queries, train_answers = load_squad_data(path, sample_limit=22)

# Display the number of extracted samples
print(f"Extracted {len(train_texts)} samples.\n")

# Display the first 22 samples for verification
for i in range(len(train_texts)):
    print(f"Sample {i+1}:")
    print(f"Context: {train_texts[i][:150]}...")  # Truncated for readability
    print(f"Question: {train_queries[i]}")
    print(f"Answer: {train_answers[i]['text']}")
    print("-" * 80)


Extracted 22 samples.

Sample 1:
Context: The Normans (Norman: Nourmands; French: Normands; Latin: Normanni) were the people who in the 10th and 11th centuries gave their name to Normandy, a r...
Question: In what country is Normandy located?
Answer: France
--------------------------------------------------------------------------------
Sample 2:
Context: The Normans (Norman: Nourmands; French: Normands; Latin: Normanni) were the people who in the 10th and 11th centuries gave their name to Normandy, a r...
Question: When were the Normans in Normandy?
Answer: 10th and 11th centuries
--------------------------------------------------------------------------------
Sample 3:
Context: The Normans (Norman: Nourmands; French: Normands; Latin: Normanni) were the people who in the 10th and 11th centuries gave their name to Normandy, a r...
Question: From which countries did the Norse originate?
Answer: Denmark, Iceland and Norway
-----------------------------------------------------------------

In [9]:
import json
from pathlib import Path

# Define paths to the SQuAD datasets
train_path = Path('squad/train-v2.0.json')
val_path = Path('squad/dev-v2.0.json')

# Function to load SQuAD data and filter duplicate (context, question) pairs
def load_squad_data(file_path, exclude=None, sample_limit=22):
    """
    Load and preprocess SQuAD data, optionally excluding specified samples.

    Args:
        file_path (Path or str): Path to the SQuAD JSON file.
        exclude (set, optional): Set of (context, question) pairs to exclude.
        sample_limit (int): Number of samples to load. Default is 22.

    Returns:
        tuple: Lists of contexts, questions, and answers.
    """
    with open(file_path, 'r') as file:
        squad_data = json.load(file)

    contexts, questions, answers = [], [], []
    sample_count = 0

    for topic in squad_data['data']:
        for paragraph in topic['paragraphs']:
            context_text = paragraph['context']
            for qa in paragraph['qas']:
                if not qa.get("is_impossible", False):  # Include only answerable questions
                    question_text = qa['question']
                    first_answer = qa['answers'][0]  # Select the first answer

                    # Skip duplicates if an exclude set is provided
                    if exclude and (context_text, question_text) in exclude:
                        continue

                    contexts.append(context_text)
                    questions.append(question_text)
                    answers.append(first_answer)
                    sample_count += 1
                    if sample_count >= sample_limit:  # Stop after reaching the sample limit
                        return contexts, questions, answers

    return contexts, questions, answers

# Load training data (22 samples)
train_texts, train_queries, train_answers = load_squad_data(train_path, sample_limit=22)

# Create a set of (context, question) pairs from the training data
train_set = set(zip(train_texts, train_queries))

# Load validation data excluding training samples (22 samples)
val_texts, val_queries, val_answers = load_squad_data(val_path, exclude=train_set, sample_limit=22)

# Display extracted data statistics
print(f"Training samples: {len(train_texts)}")
print(f"Validation samples: {len(val_texts)}")

# Optionally display some samples for debugging
for i in range(2):  # Adjust range to see more/less
    print(f"Training Sample {i+1}:")
    print(f"Context: {train_texts[i][:150]}...")
    print(f"Question: {train_queries[i]}")
    print(f"Answer: {train_answers[i]['text']}")
    print("-" * 80)

for i in range(2):  # Adjust range to see more/less
    print(f"Validation Sample {i+1}:")
    print(f"Context: {val_texts[i][:150]}...")
    print(f"Question: {val_queries[i]}")
    print(f"Answer: {val_answers[i]['text']}")
    print("-" * 80)


Training samples: 22
Validation samples: 22
Training Sample 1:
Context: Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ bee-YON-say) (born September 4, 1981) is an American singer, songwriter, record producer and actress. Bor...
Question: When did Beyonce start becoming popular?
Answer: in the late 1990s
--------------------------------------------------------------------------------
Training Sample 2:
Context: Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ bee-YON-say) (born September 4, 1981) is an American singer, songwriter, record producer and actress. Bor...
Question: What areas did Beyonce compete in when she was growing up?
Answer: singing and dancing
--------------------------------------------------------------------------------
Validation Sample 1:
Context: The Normans (Norman: Nourmands; French: Normands; Latin: Normanni) were the people who in the 10th and 11th centuries gave their name to Normandy, a r...
Question: In what country is Normandy located?
Answer: France
----------

In [11]:
import json

# Define the path to the SQuAD 2.0 training data
path = "squad/train-v2.0.json"

# Function to load and preprocess the SQuAD 2.0 dataset into a structured format
def load_squad_samples(file_path, sample_limit=22):
    """
    Load and preprocess the SQuAD 2.0 dataset.

    Args:
        file_path (str): Path to the SQuAD 2.0 JSON file.
        sample_limit (int): Maximum number of samples to extract.

    Returns:
        list: A list of dictionaries, each containing a context and its questions with answers.
    """
    with open(file_path, 'r') as file:
        squad_data = json.load(file)

    samples = []  # List to store structured samples
    total_questions = 0  # Counter for the total number of questions added

    for topic in squad_data['data']:
        for paragraph in topic['paragraphs']:
            context_text = paragraph['context']
            questions = []  # Collect questions and their answers for this context

            for qa in paragraph['qas']:
                if not qa.get("is_impossible", False):  # Include only answerable questions
                    question_text = qa['question']
                    answer_text = qa['answers'][0]['text']  # Use the first answer
                    questions.append({
                        "question": question_text,
                        "answer": answer_text
                    })
                    total_questions += 1
                    if total_questions >= sample_limit:  # Stop after reaching the limit
                        samples.append({"context": context_text, "questions": questions})
                        return samples

            # Append the context and its questions if questions exist
            if questions:
                samples.append({"context": context_text, "questions": questions})

    return samples

# Extract 22 samples
samples = load_squad_samples(path, sample_limit=22)

# Display the number of extracted samples and some examples for verification
print(f"Extracted {len(samples)} context samples.")
for i, sample in enumerate(samples[:2]):  # Display first 2 samples for verification
    print(f"Sample {i+1}:")
    print(f"Context: {sample['context'][:150]}...")  # Truncated context for readability
    for q in sample['questions']:
        print(f"  Q: {q['question']}")
        print(f"  A: {q['answer']}")
    print("-" * 80)


Extracted 2 context samples.
Sample 1:
Context: Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ bee-YON-say) (born September 4, 1981) is an American singer, songwriter, record producer and actress. Bor...
  Q: When did Beyonce start becoming popular?
  A: in the late 1990s
  Q: What areas did Beyonce compete in when she was growing up?
  A: singing and dancing
  Q: When did Beyonce leave Destiny's Child and become a solo singer?
  A: 2003
  Q: In what city and state did Beyonce  grow up? 
  A: Houston, Texas
  Q: In which decade did Beyonce become famous?
  A: late 1990s
  Q: In what R&B group was she the lead singer?
  A: Destiny's Child
  Q: What album made her a worldwide known artist?
  A: Dangerously in Love
  Q: Who managed the Destiny's Child group?
  A: Mathew Knowles
  Q: When did Beyoncé rise to fame?
  A: late 1990s
  Q: What role did Beyoncé have in Destiny's Child?
  A: lead singer
  Q: What was the first album Beyoncé released as a solo artist?
  A: Dangerously in Love
  Q: 

In [12]:
print(len(train_texts))
print(len(train_queries))
print(len(train_answers))

22
22
22


In [13]:
print("Passage: ",train_texts[0])
print("Query: ",train_queries[0])
print("Answer: ",train_answers[0])

Passage:  Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ bee-YON-say) (born September 4, 1981) is an American singer, songwriter, record producer and actress. Born and raised in Houston, Texas, she performed in various singing and dancing competitions as a child, and rose to fame in the late 1990s as lead singer of R&B girl-group Destiny's Child. Managed by her father, Mathew Knowles, the group became one of the world's best-selling girl groups of all time. Their hiatus saw the release of Beyoncé's debut album, Dangerously in Love (2003), which established her as a solo artist worldwide, earned five Grammy Awards and featured the Billboard Hot 100 number-one singles "Crazy in Love" and "Baby Boy".
Query:  When did Beyonce start becoming popular?
Answer:  {'text': 'in the late 1990s', 'answer_start': 269}


In [14]:
print(len(val_texts))
print(len(val_queries))
print(len(val_answers))

22
22
22


In [15]:
print("Passage: ",val_texts[0])
print("Query: ",val_queries[0])
print("Answer: ",val_answers[0])

Passage:  The Normans (Norman: Nourmands; French: Normands; Latin: Normanni) were the people who in the 10th and 11th centuries gave their name to Normandy, a region in France. They were descended from Norse ("Norman" comes from "Norseman") raiders and pirates from Denmark, Iceland and Norway who, under their leader Rollo, agreed to swear fealty to King Charles III of West Francia. Through generations of assimilation and mixing with the native Frankish and Roman-Gaulish populations, their descendants would gradually merge with the Carolingian-based cultures of West Francia. The distinct cultural and ethnic identity of the Normans emerged initially in the first half of the 10th century, and it continued to evolve over the succeeding centuries.
Query:  In what country is Normandy located?
Answer:  {'text': 'France', 'answer_start': 159}


In [16]:
# Align answer indices with the context text
for answer, text in zip(train_answers, train_texts):
    real_answer = answer['text']
    start_idx = answer['answer_start']
    # Calculate the potential end index
    end_idx = start_idx + len(real_answer)

    # Check and adjust indices if there are misalignments
    if text[start_idx:end_idx] == real_answer:
        # Perfect match, assign the end index
        answer['answer_end'] = end_idx
    elif text[start_idx-1:end_idx-1] == real_answer:
        # Adjust for an off-by-one alignment issue
        answer['answer_start'] = start_idx - 1
        answer['answer_end'] = end_idx - 1
    elif text[start_idx-2:end_idx-2] == real_answer:
        # Adjust for an off-by-two alignment issue
        answer['answer_start'] = start_idx - 2
        answer['answer_end'] = end_idx - 2
    else:
        # Log or handle the case where the real answer doesn't match
        print(f"Warning: Unable to align answer '{real_answer}' in the context.")


In [17]:
# Align answer indices for validation set
for answer, text in zip(val_answers, val_texts):
    real_answer = answer['text']
    start_idx = answer['answer_start']
    # Calculate the intended end index
    end_idx = start_idx + len(real_answer)

    # Check for exact match and adjust indices as needed
    if text[start_idx:end_idx] == real_answer:
        # Perfect match
        answer['answer_end'] = end_idx
    elif start_idx > 0 and text[start_idx-1:end_idx-1] == real_answer:
        # Off-by-one issue
        answer['answer_start'] = start_idx - 1
        answer['answer_end'] = end_idx - 1
    elif start_idx > 1 and text[start_idx-2:end_idx-2] == real_answer:
        # Off-by-two issue
        answer['answer_start'] = start_idx - 2
        answer['answer_end'] = end_idx - 2
    else:
        # Handle misalignment cases for debugging
        print(f"Warning: Could not align the answer '{real_answer}' with the context.")
        print(f"Context snippet: '{text[max(0, start_idx-10):min(len(text), end_idx+10)]}'")


In [18]:
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [19]:
train_encodings = tokenizer(train_texts, train_queries, truncation=True, padding=True)
val_encodings = tokenizer(val_texts, val_queries, truncation=True, padding=True)

In [20]:
def add_token_positions(encodings, answers):
    """
    Add token positions for the start and end of answers in tokenized encodings.

    Args:
        encodings: Tokenized input encodings.
        answers: List of dictionaries containing 'answer_start' and 'answer_end'.

    Updates:
        Adds 'start_positions' and 'end_positions' to the encodings dictionary.
    """
    start_positions = []
    end_positions = []

    truncated_count = 0  # Counter for truncated answers

    for i in range(len(answers)):
        # Convert character positions to token positions
        start_pos = encodings.char_to_token(i, answers[i]['answer_start'])
        end_pos = encodings.char_to_token(i, answers[i]['answer_end'])

        # Handle cases where the start position is None (answer passage truncated)
        if start_pos is None:
            start_pos = tokenizer.model_max_length  # Set to max token length

        # Handle cases where the end position is None
        if end_pos is None:
            # Try adjusting by -1 in case it's pointing to the space after the correct token
            end_pos = encodings.char_to_token(i, answers[i]['answer_end'] - 1)

            # If still None, assume the passage is truncated
            if end_pos is None:
                truncated_count += 1
                end_pos = tokenizer.model_max_length

        # Append the adjusted positions
        start_positions.append(start_pos)
        end_positions.append(end_pos)

    # Log the number of truncated answers for debugging
    print(f"Number of truncated answers: {truncated_count}")

    # Update the encodings with start and end positions
    encodings.update({'start_positions': start_positions, 'end_positions': end_positions})

# Apply the function to training and validation data
add_token_positions(train_encodings, train_answers)
add_token_positions(val_encodings, val_answers)


Number of truncated answers: 0
Number of truncated answers: 0


In [21]:
class SquadDataset(torch.utils.data.Dataset):
    def __init__(self, encodings):
        self.encodings = encodings

    def __getitem__(self, idx):
        return {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}

    def __len__(self):
        return len(self.encodings.input_ids)

In [22]:

train_dataset = SquadDataset(train_encodings)
val_dataset = SquadDataset(val_encodings)

In [23]:
train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=8, shuffle=True)

In [24]:
device = torch.device('cuda:0' if torch.cuda.is_available()
                      else 'cpu')

In [25]:
model = BertForQuestionAnswering.from_pretrained('bert-base-uncased').to(device)

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

In [26]:
from transformers import AdamW
optim = AdamW(model.parameters(), lr=5e-5)

epochs = 4



In [27]:
import time
import torch

# Measure total training and evaluation time
whole_train_eval_time = time.time()

# Lists to store training and validation losses
train_losses = []
val_losses = []

# Print progress every 'print_every' batches
print_every = 1000

for epoch in range(epochs):
    epoch_time = time.time()  # Measure time for each epoch

    # Train Mode
    model.train()
    train_loss = 0

    print(f"Epoch {epoch+1} - Training")

    for batch_idx, batch in enumerate(train_loader):
        optim.zero_grad()  # Clear gradients

        # Move inputs and labels to device
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        start_positions = batch['start_positions'].to(device)
        end_positions = batch['end_positions'].to(device)

        # Forward pass
        outputs = model(input_ids, attention_mask=attention_mask, start_positions=start_positions, end_positions=end_positions)
        loss = outputs[0]

        # Backward pass and optimizer step
        loss.backward()
        optim.step()

        # Accumulate batch loss
        train_loss += loss.item()

        # Print progress for every 'print_every' batches
        if (batch_idx + 1) % print_every == 0:
            print(f"Batch {batch_idx + 1}/{len(train_loader)} - Loss: {round(loss.item(), 3)}")

    # Calculate average training loss for the epoch
    train_loss /= len(train_loader)
    train_losses.append(train_loss)

    # Evaluation Mode
    model.eval()
    val_loss = 0

    print(f"Epoch {epoch+1} - Evaluation")

    for batch_idx, batch in enumerate(val_loader):
        with torch.no_grad():  # Disable gradient calculations
            # Move inputs and labels to device
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            start_positions = batch['start_positions'].to(device)
            end_positions = batch['end_positions'].to(device)

            # Forward pass
            outputs = model(input_ids, attention_mask=attention_mask, start_positions=start_positions, end_positions=end_positions)
            loss = outputs[0]

            # Accumulate batch loss
            val_loss += loss.item()

        # Print progress for every 'print_every' batches
        if (batch_idx + 1) % print_every == 0:
            print(f"Batch {batch_idx + 1}/{len(val_loader)} - Loss: {round(loss.item(), 3)}")

    # Calculate average validation loss for the epoch
    val_loss /= len(val_loader)
    val_losses.append(val_loss)

    # Print epoch summary
    print(f"\nEpoch {epoch+1} Summary:"
          f"\nTraining Loss: {train_loss:.4f}"
          f"\nValidation Loss: {val_loss:.4f}"
          f"\nEpoch Time: {time.time() - epoch_time:.2f}s\n"
          "---------------------------------------")

# Print total training and evaluation time
print(f"Total training and evaluation time: {time.time() - whole_train_eval_time:.2f}s")


Epoch 1 - Training
Epoch 1 - Evaluation

Epoch 1 Summary:
Training Loss: 5.3216
Validation Loss: 5.1065
Epoch Time: 3.11s
---------------------------------------
Epoch 2 - Training
Epoch 2 - Evaluation

Epoch 2 Summary:
Training Loss: 4.4374
Validation Loss: 4.9369
Epoch Time: 1.29s
---------------------------------------
Epoch 3 - Training
Epoch 3 - Evaluation

Epoch 3 Summary:
Training Loss: 3.7501
Validation Loss: 4.7801
Epoch Time: 1.30s
---------------------------------------
Epoch 4 - Training
Epoch 4 - Evaluation

Epoch 4 Summary:
Training Loss: 3.2349
Validation Loss: 4.6375
Epoch Time: 1.30s
---------------------------------------
Total training and evaluation time: 7.00s


In [28]:
import torch
torch.save(model, 'bert_finetuned_model.pth')
print("Model saved as 'bert_finetuned_model.pth'.")

bert_model2 = torch.load('bert_finetuned_model.pth', map_location=torch.device('cpu'))
bert_model2.eval()
print("Full fine-tuned model loaded successfully.")

Model saved as 'bert_finetuned_model.pth'.


  bert_model2 = torch.load('bert_finetuned_model.pth', map_location=torch.device('cpu'))


Full fine-tuned model loaded successfully.


In [29]:
torch.save(model, 'bert_finetuned_model.pth')
print("Model saved as 'bert_finetuned_model.pth'")

Model saved as 'bert_finetuned_model.pth'


In [30]:
bert_model2 = torch.load('bert_finetuned_model.pth', map_location=torch.device('cuda' if torch.cuda.is_available() else 'cpu'))
bert_model2.eval()  # Switch to evaluation mode
print("Full fine-tuned model loaded successfully.")

  bert_model2 = torch.load('bert_finetuned_model.pth', map_location=torch.device('cuda' if torch.cuda.is_available() else 'cpu'))


Full fine-tuned model loaded successfully.


In [33]:
import torch
import re
import string

def predict(context, query):
    """
    Generate a refined prediction from the model for a given context and query.

    Args:
        context (str): The context text.
        query (str): The query text.

    Returns:
        str: The refined predicted answer.
    """
    # Tokenize the inputs
    inputs = tokenizer.encode_plus(query, context, return_tensors='pt')

    # Ensure inputs are on the same device as the model
    device = next(model.parameters()).device
    inputs = {k: v.to(device) for k, v in inputs.items()}

    # Perform inference
    outputs = model(**inputs)
    answer_start = torch.argmax(outputs[0])  # Start position
    answer_end = torch.argmax(outputs[1]) + 1  # End position

    # Ensure valid prediction
    if answer_start >= answer_end:
        return ""

    # Decode the tokens for the predicted answer
    predicted_tokens = inputs['input_ids'][0][answer_start:answer_end]

    # Decode into text, ensuring removal of special tokens
    prediction = tokenizer.decode(predicted_tokens, skip_special_tokens=True).strip()

    # Post-process the prediction to remove trailing context
    prediction = prediction.split(".")[0].strip()  # Stop at the first period if present

    return prediction



def normalize_text(s):
    """
    Normalize text by removing articles, punctuation, and extra whitespace.

    Args:
        s (str): The input text.

    Returns:
        str: The normalized text.
    """
    def remove_articles(text):
        regex = re.compile(r"\b(a|an|the)\b", re.UNICODE)
        return re.sub(regex, " ", text)

    def white_space_fix(text):
        return " ".join(text.split())

    def remove_punc(text):
        exclude = set(string.punctuation)
        return "".join(ch for ch in text if ch not in exclude)

    def lower(text):
        return text.lower()

    return white_space_fix(remove_articles(remove_punc(lower(s))))


def compute_exact_match(prediction, truth):
    """
    Compute the exact match (EM) score between a prediction and the ground truth.

    Args:
        prediction (str): The predicted answer.
        truth (str): The ground truth answer.

    Returns:
        int: 1 if the prediction matches the truth exactly, 0 otherwise.
    """
    return int(normalize_text(prediction) == normalize_text(truth))


def compute_f1(prediction, truth):
    """
    Compute the F1 score between a prediction and the ground truth.

    Args:
        prediction (str): The predicted answer.
        truth (str): The ground truth answer.

    Returns:
        float: The F1 score (0 to 1).
    """
    pred_tokens = normalize_text(prediction).split()
    truth_tokens = normalize_text(truth).split()

    # Handle no-answer cases
    if len(pred_tokens) == 0 or len(truth_tokens) == 0:
        return int(pred_tokens == truth_tokens)

    # Compute token overlap
    common_tokens = set(pred_tokens) & set(truth_tokens)

    # No common tokens results in F1 = 0
    if len(common_tokens) == 0:
        return 0.0

    # Precision and recall
    precision = len(common_tokens) / len(pred_tokens)
    recall = len(common_tokens) / len(truth_tokens)

    # F1 score
    return 2 * (precision * recall) / (precision + recall)


# Example Usage
context = "The capital of France is Paris. It is known for the Eiffel Tower."
query = "What is the capital of France?"
true_answer = "Paris"

# Generate prediction
predicted_answer = predict(context, query)

# Evaluate the prediction
em_score = compute_exact_match(predicted_answer, true_answer)
f1_score = compute_f1(predicted_answer, true_answer)

print(f"Context: {context}")
print(f"Query: {query}")
print(f"Prediction: {predicted_answer}")
print(f"True Answer: {true_answer}")
print(f"Exact Match Score: {em_score}")
print(f"F1 Score: {f1_score}")


Context: The capital of France is Paris. It is known for the Eiffel Tower.
Query: What is the capital of France?
Prediction: paris
True Answer: Paris
Exact Match Score: 1
F1 Score: 1.0


In [34]:
def give_an_answer(context,query,answer):
    prediction = predict(context,query)
    em_score = compute_exact_match(prediction, answer)
    f1_score = compute_f1(prediction, answer)
    print(f"Question: {query}")
    print(f"Prediction: {prediction}")
    print(f"True Answer: {answer}")
    print(f"EM: {em_score}")
    print(f"F1: {f1_score}")
    print("\n")

In [35]:
from transformers import BertForQuestionAnswering
import torch

# Reinitialize the model
bert_model2 = BertForQuestionAnswering.from_pretrained("bert-base-uncased")
bert_model2 = torch.load('bert_finetuned_model.pth', map_location=torch.device('cuda' if torch.cuda.is_available() else 'cpu'))
# Set the model to evaluation mode
bert_model2.eval()
print("Model loaded successfully and is ready for inference.")


  bert_model2 = torch.load('bert_finetuned_model.pth', map_location=torch.device('cuda' if torch.cuda.is_available() else 'cpu'))


Model loaded successfully and is ready for inference.


In [36]:
# Define the bert tokenizer
tokenizer = AutoTokenizer.from_pretrained('bert-large-uncased-whole-word-masking-finetuned-squad')

# Load the fine-tuned modeol
model = BertForQuestionAnswering.from_pretrained('bert-large-uncased-whole-word-masking-finetuned-squad')
model.eval()

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/443 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.34G [00:00<?, ?B/s]

BertForQuestionAnswering(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 1024, padding_idx=0)
      (position_embeddings): Embedding(512, 1024)
      (token_type_embeddings): Embedding(2, 1024)
      (LayerNorm): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-23): 24 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=1024, out_features=1024, bias=True)
              (key): Linear(in_features=1024, out_features=1024, bias=True)
              (value): Linear(in_features=1024, out_features=1024, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=1024, out_features=1024, bias=True)
              (LayerNorm): LayerNorm((1024,)

In [37]:
def predict(context, query):
    """
    Predicts the answer to a query based on the given context using the model.

    Args:
        context (str): The context containing the information.
        query (str): The question to be answered.

    Returns:
        str: The predicted answer.
    """
    # Tokenize the inputs
    inputs = tokenizer.encode_plus(query, context, return_tensors='pt')

    # Ensure inputs are on the same device as the model
    device = next(model.parameters()).device
    inputs = {k: v.to(device) for k, v in inputs.items()}

    # Perform inference
    outputs = model(**inputs)

    # Get the most likely start and end positions
    answer_start = torch.argmax(outputs[0])  # Start position
    answer_end = torch.argmax(outputs[1]) + 1  # End position

    # Decode the predicted answer
    predicted_tokens = inputs['input_ids'][0][answer_start:answer_end]
    answer = tokenizer.convert_tokens_to_string(tokenizer.convert_ids_to_tokens(predicted_tokens))

    # Return the cleaned answer
    return answer.strip()


def normalize_text(s):
    """
    Normalizes text by removing articles, punctuation, and extra whitespace.

    Args:
        s (str): The text to normalize.

    Returns:
        str: The normalized text.
    """
    import string
    import re

    def remove_articles(text):
        regex = re.compile(r"\b(a|an|the)\b", re.UNICODE)
        return re.sub(regex, " ", text)

    def white_space_fix(text):
        return " ".join(text.split())

    def remove_punc(text):
        exclude = set(string.punctuation)
        return "".join(ch for ch in text if ch not in exclude)

    def lower(text):
        return text.lower()

    return white_space_fix(remove_articles(remove_punc(lower(s))))


def compute_exact_match(prediction, truth):
    """
    Computes the Exact Match (EM) score between a prediction and the ground truth.

    Args:
        prediction (str): The predicted answer.
        truth (str): The ground truth answer.

    Returns:
        int: 1 if the prediction matches the ground truth exactly, otherwise 0.
    """
    return int(normalize_text(prediction) == normalize_text(truth))


def compute_f1(prediction, truth):
    """
    Computes the F1 score between a prediction and the ground truth.

    Args:
        prediction (str): The predicted answer.
        truth (str): The ground truth answer.

    Returns:
        float: The F1 score (0 to 1).
    """
    pred_tokens = normalize_text(prediction).split()
    truth_tokens = normalize_text(truth).split()

    # Handle no-answer cases
    if len(pred_tokens) == 0 or len(truth_tokens) == 0:
        return int(pred_tokens == truth_tokens)

    # Compute token overlap
    common_tokens = set(pred_tokens) & set(truth_tokens)

    # No common tokens results in F1 = 0
    if len(common_tokens) == 0:
        return 0.0

    # Calculate precision and recall
    precision = len(common_tokens) / len(pred_tokens)
    recall = len(common_tokens) / len(truth_tokens)

    # Compute F1 score
    return 2 * (precision * recall) / (precision + recall)


In [38]:
def give_an_answer(context,query,answer):
    prediction = predict(context,query)
    em_score = compute_exact_match(prediction, answer)
    f1_score = compute_f1(prediction, answer)
    print(f"Question: {query}")
    print(f"Prediction: {prediction}")
    print(f"True Answer: {answer}")
    print(f"EM: {em_score}")
    print(f"F1: {f1_score}")
    print("\n")

In [39]:
context = """ Harry Potter is a series of seven fantasy novels written by British author, J. K. Rowling. The novels chronicle the lives of a young wizard,
              Harry Potter, and his friends Hermione Granger and Ron Weasley, all of whom are students at Hogwarts School of Witchcraft and Wizardry.
              The main story arc concerns Harry's struggle against Lord Voldemort, a dark wizard who intends to become immortal, overthrow the wizard
              governing body known as the Ministry of Magic and subjugate all wizards and Muggles (non-magical people). Since the release of the first novel,
              Harry Potter and the Philosopher's Stone, on 26 June 1997, the books have found immense popularity, positive reviews, and commercial success worldwide.
              They have attracted a wide adult audience as well as younger readers and are often considered cornerstones of modern young adult literature.[2]
              As of February 2018, the books have sold more than 500 million copies worldwide, making them the best-selling book series in history, and have been translated
              into eighty languages.[3] The last four books consecutively set records as the fastest-selling books in history, with the final installment selling roughly
              eleven million copies in the United States within twenty-four hours of its release.  """

queries = [
           "Who wrote Harry Potter's novels?",
           "Who are Harry Potter's friends?",
           "Who is the enemy of Harry Potter?",
           "What are Muggles?",
           "Which is the name of Harry Poter's first novel?",
           "When did the first novel release?",
           "Who was attracted by Harry Potter novels?",
           "How many languages Harry Potter has been translated into? "
          ]
answers = [
           "J. K. Rowling",
           "Hermione Granger and Ron Weasley",
           "Lord Voldemort",
           "non-magical people",
           "Harry Potter and the Philosopher's Stone",
           "26 June 1997",
           "a wide adult audience as well as younger readers",
           "eighty"
          ]

for q,a in zip(queries,answers):
      give_an_answer(context,q,a)

Question: Who wrote Harry Potter's novels?
Prediction: j. k. rowling
True Answer: J. K. Rowling
EM: 1
F1: 1.0


Question: Who are Harry Potter's friends?
Prediction: hermione granger and ron weasley
True Answer: Hermione Granger and Ron Weasley
EM: 1
F1: 1.0


Question: Who is the enemy of Harry Potter?
Prediction: lord voldemort
True Answer: Lord Voldemort
EM: 1
F1: 1.0


Question: What are Muggles?
Prediction: non - magical people
True Answer: non-magical people
EM: 0
F1: 0.4


Question: Which is the name of Harry Poter's first novel?
Prediction: harry potter and the philosopher ' s stone
True Answer: Harry Potter and the Philosopher's Stone
EM: 0
F1: 0.7272727272727272


Question: When did the first novel release?
Prediction: 26 june 1997
True Answer: 26 June 1997
EM: 1
F1: 1.0


Question: Who was attracted by Harry Potter novels?
Prediction: wide adult audience as well as younger readers
True Answer: a wide adult audience as well as younger readers
EM: 1
F1: 0.875


Question: How m

In [40]:
# Save model to Google Drive
torch.save(model, 'bert_finetunedmodel1.pth')

In [41]:
from transformers import BertTokenizer, BertForQuestionAnswering
import torch

# Load the tokenizer and the model
tokenizer = BertTokenizer.from_pretrained("bert-large-uncased-whole-word-masking-finetuned-squad")
model = BertForQuestionAnswering.from_pretrained("bert-large-uncased-whole-word-masking-finetuned-squad")
model.eval()


BertForQuestionAnswering(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 1024, padding_idx=0)
      (position_embeddings): Embedding(512, 1024)
      (token_type_embeddings): Embedding(2, 1024)
      (LayerNorm): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-23): 24 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=1024, out_features=1024, bias=True)
              (key): Linear(in_features=1024, out_features=1024, bias=True)
              (value): Linear(in_features=1024, out_features=1024, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=1024, out_features=1024, bias=True)
              (LayerNorm): LayerNorm((1024,)

In [42]:
def predict(context, question):

    # Tokenize the input
    inputs = tokenizer.encode_plus(question, context, return_tensors="pt")

    # Ensure inputs are on the same device as the model
    device = next(model.parameters()).device
    inputs = {key: value.to(device) for key, value in inputs.items()}

    # Perform inference
    with torch.no_grad():
        outputs = model(**inputs)
        answer_start = torch.argmax(outputs.start_logits)  # Most likely start of the answer
        answer_end = torch.argmax(outputs.end_logits) + 1  # Most likely end of the answer

    # Ensure valid prediction
    if answer_start >= answer_end:
        return ""  # Return an empty string if start >= end (invalid prediction)

    # Decode the answer
    answer_tokens = inputs["input_ids"][0][answer_start:answer_end]
    answer = tokenizer.convert_tokens_to_string(tokenizer.convert_ids_to_tokens(answer_tokens))

    # Clean and return the predicted answer
    return answer.strip()


In [44]:
# Load the fine-tuned model and tokenizer
from transformers import BertTokenizerFast, BertForQuestionAnswering
import torch

# Load the tokenizer and model
tokenizer = BertTokenizerFast.from_pretrained('bert-large-uncased-whole-word-masking-finetuned-squad')
model = BertForQuestionAnswering.from_pretrained('bert-large-uncased-whole-word-masking-finetuned-squad')

# Set the model to evaluation mode and move it to the appropriate device
model.eval()
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Prediction function
def predict(context, question):
    """
    Predicts the answer to a question based on the given context using the model.
    """
    # Tokenize and encode inputs
    encoding = tokenizer.encode_plus(question, context, return_tensors='pt').to(device)

    # Perform inference
    with torch.no_grad():
        outputs = model(**encoding)
        start_idx = torch.argmax(outputs.start_logits)
        end_idx = torch.argmax(outputs.end_logits) + 1

    # Decode and clean the predicted answer
    answer = tokenizer.decode(encoding['input_ids'][0][start_idx:end_idx])
    return answer.strip()

# Exact Match computation
def compute_exact_match(prediction, ground_truth):
    """
    Computes the Exact Match (EM) score between a prediction and the ground truth.
    """
    return int(prediction.strip().lower() == ground_truth.strip().lower())

# F1 Score computation
def compute_f1(prediction, ground_truth):
    """
    Computes the F1 score between a prediction and the ground truth.
    """
    pred_tokens = prediction.strip().lower().split()
    truth_tokens = ground_truth.strip().lower().split()

    # Handle cases with no common tokens
    common_tokens = set(pred_tokens) & set(truth_tokens)
    if not common_tokens:
        return 0.0

    # Calculate precision, recall, and F1 score
    precision = len(common_tokens) / len(pred_tokens)
    recall = len(common_tokens) / len(truth_tokens)
    return 2 * (precision * recall) / (precision + recall)

# Iterate over samples to evaluate the model
question_number = 1  # Initialize question number
total_f1 = 0  # Accumulate F1 scores
num_questions = 0  # Count the number of questions

for sample in samples:
    # Print the context (truncated for readability)
    print(f"\nContext: {sample['context'][:150]}...\n")

    for q in sample['questions']:
        # Generate prediction
        prediction = predict(sample["context"], q["question"])

        # Calculate metrics
        exact_match = compute_exact_match(prediction, q["answer"])
        f1_score = compute_f1(prediction, q["answer"])

        # Print results for this question
        print(f"Question {question_number}: {q['question']}")
        print(f"Prediction: {prediction}")
        print(f"Ground Truth: {q['answer']}")
        print(f"Exact Match: {exact_match}")
        print(f"F1 Score: {f1_score:.2f}")
        print("-" * 50)

        # Update totals
        total_f1 += f1_score
        num_questions += 1
        question_number += 1

# Calculate and print the average F1 score
average_f1 = total_f1 / num_questions if num_questions > 0 else 0
print(f"\nAverage F1 Score: {average_f1:.2f}")



Context: Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ bee-YON-say) (born September 4, 1981) is an American singer, songwriter, record producer and actress. Bor...

Question 1: When did Beyonce start becoming popular?
Prediction: late 1990s
Ground Truth: in the late 1990s
Exact Match: 0
F1 Score: 0.67
--------------------------------------------------
Question 2: What areas did Beyonce compete in when she was growing up?
Prediction: singing and dancing
Ground Truth: singing and dancing
Exact Match: 1
F1 Score: 1.00
--------------------------------------------------
Question 3: When did Beyonce leave Destiny's Child and become a solo singer?
Prediction: 2003
Ground Truth: 2003
Exact Match: 1
F1 Score: 1.00
--------------------------------------------------
Question 4: In what city and state did Beyonce  grow up? 
Prediction: houston
Ground Truth: Houston, Texas
Exact Match: 0
F1 Score: 0.00
--------------------------------------------------
Question 5: In which decade did Beyonce become

As observed, the average F1 score is 85%. This performance can be further improved by incorporating advanced preprocessing techniques and leveraging larger, high-quality datasets. Enhanced preprocessing steps, such as better text normalization, token alignment, and handling edge cases, can lead to more accurate predictions.