In [6]:
# # Install necessary libraries
!pip install datasets transformers torch evaluate numpy scikit-learn sentence-transformers accelerate
!pip install nltk



In [7]:
# Import necessary libraries
import json
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertForQuestionAnswering, AdamW
from sklearn.model_selection import train_test_split
from evaluate import load
import nltk
from nltk.tokenize import sent_tokenize
from sentence_transformers import SentenceTransformer
import matplotlib.pyplot as plt
from tqdm import tqdm
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [8]:
# Load the SQuAD dataset
def load_squad_data(url, sample_size=100):
    """Loads SQuAD data from a URL, processes it, and samples it."""
    try:
        # Use the requests library to get the content from the URL
        import requests
        response = requests.get(url)
        response.raise_for_status()  # Raise an exception for HTTP errors
        squad_data = response.json()
    except requests.exceptions.RequestException as e:
        print(f"Error fetching data from URL: {e}")
        return None

    contexts = []
    questions = []
    answers = []

    for article in squad_data['data']:
        for paragraph in article['paragraphs']:
            for qa in paragraph['qas']:
                if qa['answers']:
                    contexts.append(paragraph['context'])
                    questions.append(qa['question'])
                    answers.append(qa['answers'][0]['text'])
    # Sample the data
    if sample_size and len(contexts) > sample_size:
        sampled_indices = np.random.choice(len(contexts), sample_size, replace=False)
        contexts = [contexts[i] for i in sampled_indices]
        questions = [questions[i] for i in sampled_indices]
        answers = [answers[i] for i in sampled_indices]
    return contexts, questions, answers

# URL for the SQuAD dataset
url = "https://rajpurkar.github.io/SQuAD-explorer/dataset/train-v2.0.json"
sample_size = 100  # Reduced sample size for faster initial runs
contexts, questions, answers = load_squad_data(url, sample_size)
print(f"Loaded {len(contexts)} samples of SQuAD data.")

Loaded 100 samples of SQuAD data.


In [9]:
# prompt: print sample 5 contexts, questions, answers here that loaded from dataset ?
# Print sample contexts, questions, and answers
if contexts and questions and answers:
    num_samples_to_print = 5  # Print 5 samples
    for i in range(min(num_samples_to_print, len(contexts))):
      print(f"Sample {i+1}:")
      print(f"  Context: {contexts[i]}")
      print(f"  Question: {questions[i]}")
      print(f"  Answer: {answers[i]}")
      print("-" * 20)
else:
    print("No data available to print.")

Sample 1:
  Context: The status of the town was changed by a later charter of Charles I by at once the formal separation from Portsmouth and the recognition of Southampton as a county, In the charter dated 27 June 1640 the formal title of the town became 'The Town and County of the Town of Southampton'. These charters and Royal Grants, of which there were many, also set out the governance and regulation of the town and port which remained the 'constitution' of the town until the local government organisation of the later Victorian period which from about 1888 saw the setting up of County Councils across England and Wales and including Hampshire County Council who now took on some of the function of Government in Southampton Town. In this regime, The Town and County of the Town of Southampton also became a county borough with shared responsibility for aspects of local government. On 24 February 1964 the status changed again by a Charter of Elizabeth II, creating the City and County of t

In [10]:
# Sentence Embedding with Sentence Transformer
def get_sentence_embeddings(sentences, model_name='all-mpnet-base-v2'):
    """Generates sentence embeddings using Sentence Transformers."""
    model = SentenceTransformer(model_name)
    embeddings = model.encode(sentences, convert_to_tensor=True)
    return embeddings

# Generate embeddings for contexts and questions
context_embeddings = get_sentence_embeddings(contexts)
question_embeddings = get_sentence_embeddings(questions)
print("Generated embeddings for contexts and questions.")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Generated embeddings for contexts and questions.


In [11]:
from transformers import BertTokenizerFast, BertForQuestionAnswering
from torch.utils.data import Dataset, DataLoader
import torch

# Prepare data for BERT model
class SquadDataset(Dataset):
    def __init__(self, contexts, questions, answers, tokenizer, max_length=512):
        self.contexts = contexts
        self.questions = questions
        self.answers = answers
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.contexts)

    def __getitem__(self, idx):
        context = self.contexts[idx]
        question = self.questions[idx]
        answer = self.answers[idx]

        encoding = self.tokenizer(question, context,
                                  add_special_tokens=True,
                                  max_length=self.max_length,
                                  padding='max_length',
                                  truncation=True,
                                  return_offsets_mapping=True,
                                  return_tensors='pt')

        start_positions = 0
        end_positions = 0

        offset_mapping = encoding['offset_mapping'][0]
        start_positions = -1
        end_positions = -1

        for i in range(len(offset_mapping)):
          offset_start, offset_end = offset_mapping[i]
          if offset_start <= len(context) and offset_end <=len(context) and context[offset_start:offset_end] == answer:
              start_positions = i
              end_positions = i
              break

        if start_positions == -1 and end_positions == -1 :
          for i in range(len(offset_mapping)):
            offset_start, offset_end = offset_mapping[i]
            if offset_start <= len(context) and offset_end <=len(context) and answer in context[offset_start:offset_end]:
                start_positions = i
                end_positions = i
                break

        encoding['start_positions'] = torch.tensor(start_positions, dtype=torch.long)
        encoding['end_positions'] = torch.tensor(end_positions, dtype=torch.long)

        return {key: val.squeeze() for key, val in encoding.items()}

# Initialize BERT tokenizer and model
tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased') # Changed to BertTokenizerFast
model = BertForQuestionAnswering.from_pretrained('bert-base-uncased')

# Create dataset
dataset = SquadDataset(contexts, questions, answers, tokenizer)

# Split dataset into training and validation sets
train_size = int(0.8 * len(dataset))
val_size = len(dataset) - train_size
train_dataset, val_dataset = torch.utils.data.random_split(dataset, [train_size, val_size])

# Create dataloaders
train_dataloader = DataLoader(train_dataset, batch_size=8, shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size=8, shuffle=False)

print("Data preparation for BERT model complete.")

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Data preparation for BERT model complete.


In [12]:
# Fine-tuning the BERT model
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model.to(device)

optimizer = AdamW(model.parameters(), lr=5e-5)
num_epochs = 3

for epoch in range(num_epochs):
    model.train()
    total_loss = 0
    for batch in tqdm(train_dataloader, desc=f"Epoch {epoch + 1}/{num_epochs} Training"):
        optimizer.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        start_positions = batch['start_positions'].to(device)
        end_positions = batch['end_positions'].to(device)

        outputs = model(input_ids, attention_mask=attention_mask, start_positions=start_positions, end_positions=end_positions)
        loss = outputs.loss
        total_loss += loss.item()
        loss.backward()
        optimizer.step()

    avg_train_loss = total_loss / len(train_dataloader)
    print(f"Epoch {epoch+1}/{num_epochs} Training Loss: {avg_train_loss:.4f}")

    model.eval()
    total_val_loss = 0
    with torch.no_grad():
      for batch in tqdm(val_dataloader, desc=f"Epoch {epoch + 1}/{num_epochs} Validation"):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            start_positions = batch['start_positions'].to(device)
            end_positions = batch['end_positions'].to(device)
            outputs = model(input_ids, attention_mask=attention_mask, start_positions=start_positions, end_positions=end_positions)
            loss = outputs.loss
            total_val_loss += loss.item()
    avg_val_loss = total_val_loss / len(val_dataloader)
    print(f"Epoch {epoch+1}/{num_epochs} Validation Loss: {avg_val_loss:.4f}")

print("Fine-tuning of BERT model complete.")

Epoch 1/3 Training: 100%|██████████| 10/10 [00:08<00:00,  1.18it/s]


Epoch 1/3 Training Loss: 4.5292


Epoch 1/3 Validation: 100%|██████████| 3/3 [00:01<00:00,  3.00it/s]


Epoch 1/3 Validation Loss: 2.3621


Epoch 2/3 Training: 100%|██████████| 10/10 [00:08<00:00,  1.25it/s]


Epoch 2/3 Training Loss: 2.2923


Epoch 2/3 Validation: 100%|██████████| 3/3 [00:00<00:00,  3.72it/s]


Epoch 2/3 Validation Loss: 1.7125


Epoch 3/3 Training: 100%|██████████| 10/10 [00:08<00:00,  1.20it/s]


Epoch 3/3 Training Loss: 1.4503


Epoch 3/3 Validation: 100%|██████████| 3/3 [00:00<00:00,  3.63it/s]

Epoch 3/3 Validation Loss: 1.4041
Fine-tuning of BERT model complete.





In [13]:
# Prediction function
def predict_answer(context, question, model, tokenizer):
    model.to(device) # Move the model to the correct device before inference
    model.eval()
    inputs = tokenizer(question, context, add_special_tokens=True, return_tensors="pt",  max_length=512,
                        padding='max_length',
                        truncation=True).to(device)

    with torch.no_grad():
        outputs = model(**inputs)
    answer_start = torch.argmax(outputs.start_logits)
    answer_end = torch.argmax(outputs.end_logits)
    tokens = tokenizer.convert_ids_to_tokens(inputs["input_ids"].squeeze())
    answer = tokens[answer_start : answer_end+1]
    answer = tokenizer.convert_tokens_to_string(answer)
    return answer

# Print answers for 5 context and questions
print("\nPredictions:")
for i in range(min(5, len(contexts))):
    context = contexts[i]
    question = questions[i]
    predicted_answer = predict_answer(context, question, model, tokenizer)
    print(f"Context: {context[:100]}...")
    print(f"Question: {question}")
    print(f"Predicted Answer: {predicted_answer}")
    print("-" * 50)


Predictions:
Context: The status of the town was changed by a later charter of Charles I by at once the formal separation ...
Question: What king's charter recognized Southampton as its own county?
Predicted Answer: [CLS]
--------------------------------------------------
Context: Two distinct viewpoints on time divide many prominent philosophers. One view is that time is part of...
Question: How many main viewpoints divide many philosophers?
Predicted Answer: [CLS]
--------------------------------------------------
Context: Music is cherished in Boston. The Boston Symphony Orchestra is one of the "Big Five," a group of the...
Question: What classical music magazine called the Boston Symphony orchestra one of the worlds best orchestras?
Predicted Answer: [CLS]
--------------------------------------------------
Context: Spirometry is recommended to aid in diagnosis and management. It is the single best test for asthma....
Question: What is recommended to help in the diagnosis of asthma

In [14]:
def evaluate_model(model, tokenizer, val_dataloader, device):
    metric = load("squad_v2")
    model.eval()
    for batch in tqdm(val_dataloader, desc="Evaluating"):
      with torch.no_grad():
          input_ids = batch['input_ids'].to(device)
          attention_mask = batch['attention_mask'].to(device)
          start_positions = batch['start_positions'].to(device)
          end_positions = batch['end_positions'].to(device)
          outputs = model(input_ids, attention_mask=attention_mask, start_positions=start_positions, end_positions=end_positions)

          start_logits = outputs.start_logits.detach().cpu().numpy()
          end_logits = outputs.end_logits.detach().cpu().numpy()

      for i in range(len(input_ids)):
        answer_start = torch.argmax(outputs.start_logits[i]).item()
        answer_end = torch.argmax(outputs.end_logits[i]).item()
        tokens = tokenizer.convert_ids_to_tokens(input_ids[i].cpu().numpy())
        predicted_answer_tokens = tokens[answer_start:answer_end+1]
        predicted_answer = tokenizer.convert_tokens_to_string(predicted_answer_tokens)

        true_answer = ""
        offset_mapping = batch['offset_mapping'][i].cpu().numpy()

        # find the answer start and end
        true_start_index = batch['start_positions'][i].item()
        true_end_index = batch['end_positions'][i].item()

        if true_start_index != -1 and true_end_index != -1:
           start_char_pos = offset_mapping[true_start_index][0]
           end_char_pos = offset_mapping[true_end_index][1]
           true_answer = contexts[i][start_char_pos:end_char_pos]

        metric.add(predictions=[{'prediction_text': predicted_answer, 'id': str(i), 'no_answer_probability': 0.0}],
                   references=[{'answers': [{'text': true_answer, 'answer_start': [0]}], 'id': str(i)}])

    eval_results = metric.compute()
    return eval_results

In [15]:
# Load the evaluation script
try:
    import requests
    url = "https://worksheets.codalab.org/rest/bundles/0x6b567e1cf2e041ec80d7098f031c5c9e/contents/blob/"
    response = requests.get(url)
    response.raise_for_status()
    evaluation_script = response.text
    print("Evaluation script loaded.")
except requests.exceptions.RequestException as e:
    print(f"Error fetching evaluation script: {e}")
    evaluation_script = None

Evaluation script loaded.


In [16]:
# Introduction from the wikipedia page
context_wiki = """Statistics is the discipline that concerns the collection, organization, analysis, interpretation, and presentation of data. In applying statistics to a scientific, industrial, or societal problem, it is conventional to begin with a statistical population or a statistical model to be studied. Populations can be diverse topics such as "all people living in a country" or "every atom composing a crystal". Statistics deals with all aspects of data, including the planning of data collection in terms of the design of surveys and experiments. When census data cannot be collected, statisticians collect data by developing specific experiment designs and survey samples. Representative sampling assures that inferences and conclusions can reasonably extend from the sample to the population as a whole. Statistics is a mathematical body of science that pertains to the collection, analysis, interpretation or explanation, and presentation of data, or as a branch of mathematics. Some consider statistics to be a distinct mathematical science rather than a branch of mathematics.
Descriptive statistics can be used to summarize the data, for example, the mean or the standard deviation. Inferential statistics can be used when drawing conclusions from the data, that may be subject to random variation.
"""

questions_wiki = [
    "What is statistics?",
    "Where can descriptive statistics be used?",
    "How to draw meaningful conclusions?"
]

# Get answers from the model
print("\nAnswers from Wikipedia Context:")
for question in questions_wiki:
    answer = predict_answer(context_wiki, question, model, tokenizer)
    print(f"Question: {question}")
    print(f"Answer: {answer}")
    print("-" * 50)


Answers from Wikipedia Context:
Question: What is statistics?
Answer: [CLS]
--------------------------------------------------
Question: Where can descriptive statistics be used?
Answer: [CLS]
--------------------------------------------------
Question: How to draw meaningful conclusions?
Answer: [CLS]
--------------------------------------------------


In [17]:
import os
# Load GloVe embeddings
def load_glove_embeddings(file_path):
    """Loads GloVe embeddings from a file and returns a dictionary."""
    embeddings_index = {}
    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            for line in f:
                values = line.split()
                word = values[0]
                coefs = np.asarray(values[1:], dtype='float32')
                embeddings_index[word] = coefs
    except FileNotFoundError:
        print(f"Error: The file {file_path} was not found.")
        return None
    return embeddings_index

# Download GloVe embeddings if not present
glove_zip_file = 'glove.6B.zip'
glove_file = '/content/glove.6B.50d.txt'

if not os.path.exists(glove_file):
    if not os.path.exists(glove_zip_file):
        print("Downloading GloVe embeddings...")
        !wget http://nlp.stanford.edu/data/glove.6B.zip
    print("Unzipping GloVe embeddings...")
    !unzip glove.6B.zip
else:
    print("GloVe embeddings file already exists.")


# Load GloVe 50d embeddings
glove_embeddings = load_glove_embeddings(glove_file)
if glove_embeddings:
    print(f"Loaded GloVe 50d embeddings from {glove_file}.")
else:
    print("GloVe embeddings could not be loaded.")

# Function to convert a sentence to GloVe embeddings
def sentence_to_glove_embedding(sentence, glove_embeddings, embedding_dim=50):
    words = sentence.lower().split()
    embeddings = []
    for word in words:
        if word in glove_embeddings:
            embeddings.append(glove_embeddings[word])
    if embeddings:
        return np.mean(embeddings, axis=0)
    else:
        return np.zeros(embedding_dim)  # Return zero vector if no word is found

# Convert the contexts and questions to GloVe embeddings
if glove_embeddings is not None:
    glove_context_embeddings = np.array([sentence_to_glove_embedding(context, glove_embeddings) for context in contexts])
    glove_question_embeddings = np.array([sentence_to_glove_embedding(question, glove_embeddings) for question in questions])
    print("Converted contexts and questions to GloVe embeddings.")

    # Combine GloVe embeddings with BERT embeddings

    class CombinedDataset(Dataset):
        def __init__(self, contexts, questions, answers, tokenizer, glove_context_embeddings, glove_question_embeddings, max_length=512):
          self.contexts = contexts
          self.questions = questions
          self.answers = answers
          self.tokenizer = tokenizer
          self.max_length = max_length
          self.glove_context_embeddings = glove_context_embeddings
          self.glove_question_embeddings = glove_question_embeddings

        def __len__(self):
            return len(self.contexts)

        def __getitem__(self, idx):
          context = self.contexts[idx]
          question = self.questions[idx]
          answer = self.answers[idx]

          encoding = self.tokenizer(question, context,
                                      add_special_tokens=True,
                                      max_length=self.max_length,
                                      padding='max_length',
                                      truncation=True,
                                      return_offsets_mapping=True,
                                      return_tensors='pt')

          start_positions = 0
          end_positions = 0
          offset_mapping = encoding['offset_mapping'][0]
          start_positions = -1
          end_positions = -1

          # Use the same logic as in SquadDataset to find the start and end positions based on token offsets
          for i in range(len(offset_mapping)):
            offset_start, offset_end = offset_mapping[i]
            if offset_start <= len(context) and offset_end <=len(context) and context[offset_start:offset_end] == answer:
                start_positions = i
                end_positions = i
                break

          if start_positions == -1 and end_positions == -1:
              for i in range(len(offset_mapping)):
                offset_start, offset_end = offset_mapping[i]
                if offset_start <= len(context) and offset_end <=len(context) and answer in context[offset_start:offset_end]:
                    start_positions = i
                    end_positions = i
                    break


          glove_context_embedding = torch.tensor(self.glove_context_embeddings[idx], dtype=torch.float32)
          glove_question_embedding = torch.tensor(self.glove_question_embeddings[idx], dtype=torch.float32)

          encoding['glove_context_embeddings'] = glove_context_embedding
          encoding['glove_question_embeddings'] = glove_question_embedding
          encoding['start_positions'] = torch.tensor(start_positions, dtype=torch.long)
          encoding['end_positions'] = torch.tensor(end_positions, dtype=torch.long)

          return {key: val.squeeze() for key, val in encoding.items()}

    # Create dataset
    combined_dataset = CombinedDataset(contexts, questions, answers, tokenizer, glove_context_embeddings, glove_question_embeddings)

    # Split the dataset into training and validation sets
    train_size = int(0.8 * len(combined_dataset))
    val_size = len(combined_dataset) - train_size
    train_dataset, val_dataset = torch.utils.data.random_split(combined_dataset, [train_size, val_size])


    # Create DataLoaders
    train_dataloader = DataLoader(train_dataset, batch_size=8, shuffle=True)
    val_dataloader = DataLoader(val_dataset, batch_size=8, shuffle=False)
    print("Data preparation with GloVe embeddings complete.")

Downloading GloVe embeddings...
--2025-01-09 18:49:30--  http://nlp.stanford.edu/data/glove.6B.zip
Resolving nlp.stanford.edu (nlp.stanford.edu)... 171.64.67.140
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:80... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://nlp.stanford.edu/data/glove.6B.zip [following]
--2025-01-09 18:49:30--  https://nlp.stanford.edu/data/glove.6B.zip
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: https://downloads.cs.stanford.edu/nlp/data/glove.6B.zip [following]
--2025-01-09 18:49:31--  https://downloads.cs.stanford.edu/nlp/data/glove.6B.zip
Resolving downloads.cs.stanford.edu (downloads.cs.stanford.edu)... 171.64.64.22
Connecting to downloads.cs.stanford.edu (downloads.cs.stanford.edu)|171.64.64.22|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 862182613 (822M) [application/zip]

In [23]:
class CombinedBERT(torch.nn.Module):
    def __init__(self, bert_model, glove_embedding_dim=50):
        super(CombinedBERT, self).__init__()
        self.bert = bert_model
        self.glove_embedding_dim = glove_embedding_dim
        self.fc_start = torch.nn.Linear(768 + self.glove_embedding_dim, 768)
        self.fc_end = torch.nn.Linear(768 + self.glove_embedding_dim, 768)
        self.start_classifier = torch.nn.Linear(768,1)
        self.end_classifier = torch.nn.Linear(768,1)

    def forward(self, input_ids, attention_mask, glove_context_embeddings, glove_question_embeddings, start_positions=None, end_positions=None):
          outputs = self.bert(input_ids, attention_mask=attention_mask, output_hidden_states=True)

          hidden_state = outputs.hidden_states[-1]

          # Reshape glove embeddings to have the same sequence length as hidden state
          batch_size, seq_length, hidden_dim = hidden_state.shape
          glove_context_embeddings = glove_context_embeddings.unsqueeze(1).expand(-1, seq_length, -1).contiguous()
          glove_question_embeddings = glove_question_embeddings.unsqueeze(1).expand(-1, seq_length, -1).contiguous()

          combined_context_embeddings = torch.cat((hidden_state, glove_context_embeddings),dim=2)

          # Reshape combined embeddings to apply linear layer per token
          batch_size, seq_length, combined_dim = combined_context_embeddings.shape
          reshaped_combined_context_embeddings = combined_context_embeddings.view(batch_size * seq_length, combined_dim).contiguous()

          start_logits = self.fc_start(reshaped_combined_context_embeddings)
          start_logits = self.start_classifier(start_logits).squeeze(-1)
          start_logits = start_logits.view(batch_size, seq_length).contiguous() # reshape back


          end_logits = self.fc_end(reshaped_combined_context_embeddings)
          end_logits = self.end_classifier(end_logits).squeeze(-1)
          end_logits = end_logits.view(batch_size, seq_length).contiguous() # reshape back

          if start_positions is not None and end_positions is not None:
            loss_fct = torch.nn.CrossEntropyLoss()
            start_loss = loss_fct(start_logits, start_positions)
            end_loss = loss_fct(end_logits, end_positions)
            total_loss = (start_loss + end_loss) / 2
            return {'loss':total_loss, 'start_logits':start_logits, 'end_logits':end_logits}

          return {'start_logits':start_logits, 'end_logits':end_logits}

# Instantiate the model
model_glove = CombinedBERT(model)
model_glove.to(device)
print("Combined BERT model created.")

# Fine tune with the combined embeddings
optimizer_glove = AdamW(model_glove.parameters(), lr=5e-5)
num_epochs = 3

for epoch in range(num_epochs):
    model_glove.train()
    total_loss = 0
    for batch in tqdm(train_dataloader, desc=f"Epoch {epoch + 1}/{num_epochs} Training with Glove"):
      optimizer_glove.zero_grad()
      input_ids = batch['input_ids'].to(device)
      attention_mask = batch['attention_mask'].to(device)
      start_positions = batch['start_positions'].to(device)
      end_positions = batch['end_positions'].to(device)
      glove_context_embeddings = batch['glove_context_embeddings'].to(device)
      glove_question_embeddings = batch['glove_question_embeddings'].to(device)

      outputs = model_glove(input_ids=input_ids, attention_mask=attention_mask, glove_context_embeddings=glove_context_embeddings, glove_question_embeddings=glove_question_embeddings, start_positions=start_positions, end_positions=end_positions)
      loss = outputs.loss
      total_loss += loss.item()
      loss.backward()
      optimizer_glove.step()

    avg_train_loss = total_loss / len(train_dataloader)
    print(f"Epoch {epoch+1}/{num_epochs} Training with Glove Loss: {avg_train_loss:.4f}")

    model_glove.eval()
    total_val_loss = 0
    with torch.no_grad():
        for batch in tqdm(val_dataloader, desc=f"Epoch {epoch + 1}/{num_epochs} Validation with Glove"):
          input_ids = batch['input_ids'].to(device)
          attention_mask = batch['attention_mask'].to(device)
          start_positions = batch['start_positions'].to(device)
          end_positions = batch['end_positions'].to(device)
          glove_context_embeddings = batch['glove_context_embeddings'].to(device)
          glove_question_embeddings = batch['glove_question_embeddings'].to(device)
          outputs = model_glove(input_ids=input_ids, attention_mask=attention_mask, glove_context_embeddings=glove_context_embeddings, glove_question_embeddings=glove_question_embeddings, start_positions=start_positions, end_positions=end_positions)
          loss = outputs.loss
          total_val_loss += loss.item()
    avg_val_loss = total_val_loss / len(val_dataloader)
    print(f"Epoch {epoch+1}/{num_epochs} Validation with Glove Loss: {avg_val_loss:.4f}")

print("Fine-tuning of combined BERT model with GloVe embeddings complete.")

RuntimeError: CUDA error: device-side assert triggered
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.


In [None]:
# Prediction function with combined model
def predict_answer_glove(context, question, model, tokenizer, glove_embeddings):
    model.eval()
    glove_context_embedding = sentence_to_glove_embedding(context, glove_embeddings)
    glove_question_embedding = sentence_to_glove_embedding(question, glove_embeddings)
    inputs = tokenizer(question, context, add_special_tokens=True, return_tensors="pt", max_length=512, padding='max_length', truncation=True).to(device)

    glove_context_embedding = torch.tensor(glove_context_embedding, dtype=torch.float32).unsqueeze(0).to(device)
    glove_question_embedding = torch.tensor(glove_question_embedding, dtype=torch.float32).unsqueeze(0).to(device)

    with torch.no_grad():
      outputs = model(input_ids=inputs['input_ids'], attention_mask=inputs['attention_mask'], glove_context_embeddings=glove_context_embedding, glove_question_embeddings=glove_question_embedding)

    answer_start = torch.argmax(outputs.start_logits)
    answer_end = torch.argmax(outputs.end_logits)
    tokens = tokenizer.convert_ids_to_tokens(inputs["input_ids"].squeeze())
    answer = tokens[answer_start : answer_end+1]
    answer = tokenizer.convert_tokens_to_string(answer)
    return answer

# Print answers for 5 context and questions
print("\nPredictions with Glove embeddings:")
for i in range(min(5, len(contexts))):
    context = contexts[i]
    question = questions[i]
    predicted_answer = predict_answer_glove(context, question, model_glove, tokenizer, glove_embeddings)
    print(f"Context: {context[:100]}...")
    print(f"Question: {question}")
    print(f"Predicted Answer: {predicted_answer}")
    print("-" * 50)

In [None]:
# Evaluation function with combined embeddings
def evaluate_model_glove(model, tokenizer, val_dataloader, device, glove_embeddings):
    metric = load("squad_v2")
    model.eval()
    for batch in tqdm(val_dataloader, desc="Evaluating with Glove"):
      with torch.no_grad():
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        start_positions = batch['start_positions'].to(device)
        end_positions = batch['end_positions'].to(device)
        glove_context_embeddings = batch['glove_context_embeddings'].to(device)
        glove_question_embeddings = batch['glove_question_embeddings'].to(device)

        outputs = model(input_ids=input_ids, attention_mask=attention_mask, glove_context_embeddings=glove_context_embeddings, glove_question_embeddings=glove_question_embeddings, start_positions=start_positions, end_positions=end_positions)

        start_logits = outputs.start_logits.detach().cpu().numpy()
        end_logits = outputs.end_logits.detach().cpu().numpy()

      for i in range(len(input_ids)):
          answer_start = torch.argmax(outputs.start_logits[i]).item()
          answer_end = torch.argmax(outputs.end_logits[i]).item()
          tokens = tokenizer.convert_ids_to_tokens(input_ids[i].cpu().numpy())
          predicted_answer_tokens = tokens[answer_start:answer_end+1]
          predicted_answer = tokenizer.convert_tokens_to_string(predicted_answer_tokens)

          true_answer = ""
          offset_mapping = batch['offset_mapping'][i].cpu().numpy()
          for j in range(len(offset_mapping)):
                offset_start, offset_end = offset_mapping[j]
                if offset_start <= len(contexts[i]) and offset_end <=len(contexts[i]) and batch['start_positions'][i] == j and batch['end_positions'][i] ==j:
                  true_answer = contexts[i][offset_start:offset_end]
                  break

          metric.add(prediction_text=predicted_answer, reference_text=true_answer)

    eval_results = metric.compute()
    return eval_results

# Evaluate the model
eval_results_glove = evaluate_model_glove(model_glove, tokenizer, val_dataloader, device, glove_embeddings)
print("\nEvaluation Results with Glove:")
print(eval_results_glove)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Loaded 100 samples of SQuAD data.
Sample 1:
  Context: Sheriff Tate arrives and discovers that Bob Ewell has died during the fight. The sheriff argues with Atticus about the prudence and ethics of charging Jem (whom Atticus believes to be responsible) or Boo (whom Tate believes to be responsible). Atticus eventually accepts the sheriff's story that Ewell simply fell on his own knife. Boo asks Scout to walk him home, and after she says goodbye to him at his front door he disappears again. While standing on the Radley porch, Scout imagines life from Boo's perspective, and regrets that they had never repaid him for the gifts he had given them.
  Question: What was the name of the police officer who discovered Bob Ewell's body?
  Answer: Sheriff Tate
--------------------
Sample 2:
  Context: Beginning in 1689, the colonies became involved in a series of wars between Great Britain and France for control of North America, the most important of which were Queen Anne's War, in which the Britis

RuntimeError: CUDA error: device-side assert triggered
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.
