# SENTENCE BERT

### 0. Import libraries

In [1]:
import os
import math
import re
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

from random import *

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device

device(type='cuda')

### 1. ETL

- Load dataset

In [2]:
import datasets

snli_dataset = datasets.load_dataset('snli')
mnli_dataset = datasets.load_dataset('glue','mnli')
# mnli_dataset['train'].features, snli_dataset['train'].features

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
mnli_dataset

DatasetDict({
    train: Dataset({
        features: ['premise', 'hypothesis', 'label', 'idx'],
        num_rows: 392702
    })
    validation_matched: Dataset({
        features: ['premise', 'hypothesis', 'label', 'idx'],
        num_rows: 9815
    })
    validation_mismatched: Dataset({
        features: ['premise', 'hypothesis', 'label', 'idx'],
        num_rows: 9832
    })
    test_matched: Dataset({
        features: ['premise', 'hypothesis', 'label', 'idx'],
        num_rows: 9796
    })
    test_mismatched: Dataset({
        features: ['premise', 'hypothesis', 'label', 'idx'],
        num_rows: 9847
    })
})

In [4]:
snli_dataset

DatasetDict({
    test: Dataset({
        features: ['premise', 'hypothesis', 'label'],
        num_rows: 10000
    })
    train: Dataset({
        features: ['premise', 'hypothesis', 'label'],
        num_rows: 550152
    })
    validation: Dataset({
        features: ['premise', 'hypothesis', 'label'],
        num_rows: 10000
    })
})

In [5]:
mnli_dataset.column_names.keys(), mnli_dataset.column_names.values()

(dict_keys(['train', 'validation_matched', 'validation_mismatched', 'test_matched', 'test_mismatched']),
 dict_values([['premise', 'hypothesis', 'label', 'idx'], ['premise', 'hypothesis', 'label', 'idx'], ['premise', 'hypothesis', 'label', 'idx'], ['premise', 'hypothesis', 'label', 'idx'], ['premise', 'hypothesis', 'label', 'idx']]))

In [6]:
snli_dataset.column_names.keys(), snli_dataset.column_names.values()

(dict_keys(['test', 'train', 'validation']),
 dict_values([['premise', 'hypothesis', 'label'], ['premise', 'hypothesis', 'label'], ['premise', 'hypothesis', 'label']]))

In [7]:
for column_names in mnli_dataset.column_names.keys():
    mnli_dataset[column_names] = mnli_dataset[column_names].remove_columns('idx')

In [8]:
mnli_dataset

DatasetDict({
    train: Dataset({
        features: ['premise', 'hypothesis', 'label'],
        num_rows: 392702
    })
    validation_matched: Dataset({
        features: ['premise', 'hypothesis', 'label'],
        num_rows: 9815
    })
    validation_mismatched: Dataset({
        features: ['premise', 'hypothesis', 'label'],
        num_rows: 9832
    })
    test_matched: Dataset({
        features: ['premise', 'hypothesis', 'label'],
        num_rows: 9796
    })
    test_mismatched: Dataset({
        features: ['premise', 'hypothesis', 'label'],
        num_rows: 9847
    })
})

In [9]:
import numpy as np
np.unique(mnli_dataset['train']['label']), np.unique(snli_dataset['train']['label']) #snli also have -1 (no label)

(array([0, 1, 2]), array([-1,  0,  1,  2]))

In [10]:
# there are -1 values in the label feature, these are where no class could be decided so we remove

def filter_labels(x):
    if x['label'] == -1:
        return 0
    else:
        return 1

snli_dataset = snli_dataset.filter(filter_labels)
snli_dataset

DatasetDict({
    test: Dataset({
        features: ['premise', 'hypothesis', 'label'],
        num_rows: 9824
    })
    train: Dataset({
        features: ['premise', 'hypothesis', 'label'],
        num_rows: 549367
    })
    validation: Dataset({
        features: ['premise', 'hypothesis', 'label'],
        num_rows: 9842
    })
})

In [11]:
snli_dataset

DatasetDict({
    test: Dataset({
        features: ['premise', 'hypothesis', 'label'],
        num_rows: 9824
    })
    train: Dataset({
        features: ['premise', 'hypothesis', 'label'],
        num_rows: 549367
    })
    validation: Dataset({
        features: ['premise', 'hypothesis', 'label'],
        num_rows: 9842
    })
})

- Merge two datasets

In [12]:
from datasets import DatasetDict

raw_dataset = DatasetDict({
    'train': datasets.concatenate_datasets([snli_dataset['train'],mnli_dataset['train']]).shuffle(seed=123).select(list(range(1000))),
    'test' : datasets.concatenate_datasets([snli_dataset['test'],mnli_dataset['test_mismatched']]).shuffle(seed=123).select(list(range(100))),
    'valid': datasets.concatenate_datasets([snli_dataset['validation'],mnli_dataset['validation_mismatched']]).shuffle(seed=123).select(list(range(1000)))
})

In [13]:
raw_dataset

DatasetDict({
    train: Dataset({
        features: ['premise', 'hypothesis', 'label'],
        num_rows: 1000
    })
    test: Dataset({
        features: ['premise', 'hypothesis', 'label'],
        num_rows: 100
    })
    valid: Dataset({
        features: ['premise', 'hypothesis', 'label'],
        num_rows: 1000
    })
})

### 2. Preprocessing

In [14]:
# from transformers import BertTokenizer

# tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [15]:
from utils import *

In [16]:
batch_size = 2

In [17]:
max_len

1000

In [18]:
import torch

class customTokenizer:
    def __init__(self, word2id):
        self.word2id = word2id
        self.id2word = {id_: word for word, id_ in word2id.items()}
        self.vocab_size = len(word2id)
        self.max_len = max_len
        self.unk_token_id = word2id.get('[UNK]', None)  # Assuming '[UNK]' is a defined unknown token

    def encode(self, sentences):
        encoded_output = {'input_ids': [], 'attention_mask': []}

        for sentence in sentences:
            # Encode sentence to IDs, with fallback to '[UNK]' token ID
            sentence_ids = [self.word2id.get(word, self.unk_token_id) for word in sentence.split()]
            sentence_len = len(sentence_ids)
            
            # Padding if necessary
            if sentence_len < self.max_len:
                sentence_ids += [0] * (self.max_len - sentence_len)  # Assuming '0' is the pad token ID
                attention_mask = [1] * sentence_len + [0] * (self.max_len - sentence_len)
            else:
                sentence_ids = sentence_ids[:self.max_len]
                attention_mask = [1] * self.max_len

            # Convert lists to tensors before appending
            encoded_output['input_ids'].append(torch.tensor(sentence_ids))
            encoded_output['attention_mask'].append(torch.tensor(attention_mask))

        return encoded_output

    def decode(self, ids):
        # Ensure ids is a list of tensors before attempting to decode
        if isinstance(ids, torch.Tensor):
            ids = ids.tolist()
        return ' '.join(self.id2word.get(id_, '[UNK]') for id_ in ids)

In [19]:
tokenizer = customTokenizer(word2id)

In [20]:
def preprocess_function(examples):

    max_seq_length = 128
    padding        = 'max_length'

    # Tokenize the premise
    premise_result = tokenizer.encode(examples['premise'])
    #num_rows, max_seq_length

    # Tokenize the hypothesis
    hypothesis_result = tokenizer.encode(examples['hypothesis'])
    #num_rows, max_seq_length

    # Extract labels
    labels = examples["label"]

    #num_rows
    return {
        "premise_input_ids": premise_result["input_ids"],
        "premise_attention_mask": premise_result["attention_mask"],
        "hypothesis_input_ids": hypothesis_result["input_ids"],
        "hypothesis_attention_mask": hypothesis_result["attention_mask"],
        "labels" : labels
    }

tokenized_datasets = raw_dataset.map(preprocess_function,batched=True)

tokenized_datasets = tokenized_datasets.remove_columns(['premise','hypothesis','label'])
tokenized_datasets.set_format("torch")
tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['premise_input_ids', 'premise_attention_mask', 'hypothesis_input_ids', 'hypothesis_attention_mask', 'labels'],
        num_rows: 1000
    })
    test: Dataset({
        features: ['premise_input_ids', 'premise_attention_mask', 'hypothesis_input_ids', 'hypothesis_attention_mask', 'labels'],
        num_rows: 100
    })
    valid: Dataset({
        features: ['premise_input_ids', 'premise_attention_mask', 'hypothesis_input_ids', 'hypothesis_attention_mask', 'labels'],
        num_rows: 1000
    })
})

In [21]:
len(tokenized_datasets['train']['premise_input_ids'][999])

1000

In [22]:
tokenized_datasets['train']['premise_attention_mask'][1]

tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

### 3. DataLoader

In [23]:
from torch.utils.data import DataLoader

# initialize the dataloader
# batch_size = 6

train_dataloader = DataLoader(tokenized_datasets['train'], batch_size=batch_size, shuffle=True)
eval_dataloader  = DataLoader(tokenized_datasets['valid'], batch_size=batch_size)
test_dataloader  = DataLoader(tokenized_datasets['test'], batch_size=batch_size)

#batch_size, max_seq_length

In [24]:
for batch in train_dataloader:
    print(batch['premise_input_ids'].shape)
    print(batch['premise_attention_mask'].shape)
    print(batch['hypothesis_input_ids'].shape)
    print(batch['hypothesis_attention_mask'].shape)
    print(batch['labels'].shape)
    break

torch.Size([2, 1000])
torch.Size([2, 1000])
torch.Size([2, 1000])
torch.Size([2, 1000])
torch.Size([2])


### 4. Model

In [25]:
model = BERT()
model.load_state_dict(torch.load('./model/bert_from_scratch.pth'))
model.to(device)

BERT(
  (embedding): Embedding(
    (tok_embed): Embedding(7485, 768)
    (pos_embed): Embedding(1000, 768)
    (seg_embed): Embedding(2, 768)
    (norm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (layers): ModuleList(
    (0-5): 6 x EncoderLayer(
      (enc_self_attn): MultiHeadAttention(
        (W_Q): Linear(in_features=768, out_features=512, bias=True)
        (W_K): Linear(in_features=768, out_features=512, bias=True)
        (W_V): Linear(in_features=768, out_features=512, bias=True)
      )
      (pos_ffn): PoswiseFeedForwardNet(
        (fc1): Linear(in_features=768, out_features=3072, bias=True)
        (fc2): Linear(in_features=3072, out_features=768, bias=True)
      )
    )
  )
  (fc): Linear(in_features=768, out_features=768, bias=True)
  (activ): Tanh()
  (linear): Linear(in_features=768, out_features=768, bias=True)
  (norm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  (classifier): Linear(in_features=768, out_features=2, bias=True)
  (decod

In [26]:
# # start from a pretrained bert-base-uncased model
# from transformers import BertTokenizer, BertModel
# model = BertModel.from_pretrained('bert-base-uncased')
# model

#### Pooling

In [27]:
# define mean pooling function

def mean_pool(token_embeds, attention_mask):

    # reshape attention_mask to cover 768-dimension embeddings
    in_mask = attention_mask.unsqueeze(-1).expand(token_embeds.size()).float()

    # perform mean-pooling but exclude padding tokens (specified by in_mask)
    pool = torch.sum(token_embeds * in_mask, 1) / torch.clamp(in_mask.sum(1), min=1e-9)
    
    return pool

### 5. Loss Function

In [28]:
def configurations(u,v):

    # build the |u-v| tensor
    uv     = torch.sub(u, v)   # batch_size,hidden_dim
    uv_abs = torch.abs(uv) # batch_size,hidden_dim
    
    # concatenate u, v, |u-v|
    x = torch.cat([u, v, uv_abs], dim=-1) # batch_size, 3*hidden_dim
    return x

def cosine_similarity(u, v):

    dot_product = np.dot(u, v)
    norm_u      = np.linalg.norm(u)
    norm_v      = np.linalg.norm(v)
    similarity  = dot_product / (norm_u * norm_v)
    
    return similarity

In [29]:
classifier_head      = torch.nn.Linear(768*3, 3).to(device)

optimizer            = torch.optim.Adam(model.parameters(), lr=2e-5)
optimizer_classifier = torch.optim.Adam(classifier_head.parameters(), lr=2e-5)

criterion = nn.CrossEntropyLoss()

In [30]:
from transformers import get_linear_schedule_with_warmup

# and setup a warmup for the first ~10% steps
total_steps  = int(len(raw_dataset) / batch_size)
warmup_steps = int(0.1 * total_steps)
scheduler    = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=warmup_steps, num_training_steps=total_steps - warmup_steps)

# then during the training loop we update the scheduler per step
scheduler.step()

scheduler_classifier = get_linear_schedule_with_warmup(optimizer_classifier, num_warmup_steps=warmup_steps, num_training_steps=total_steps - warmup_steps)

# then during the training loop we update the scheduler per step
scheduler_classifier.step()



### 6. Training

In [31]:
from tqdm.auto import tqdm

num_epoch = 5
# 1 epoch should be enough, increase if wanted

segment_ids = torch.tensor([0] * max_len).unsqueeze(0).repeat(batch_size, 1).to(device)

masked_pos = torch.tensor([0] * max_mask).unsqueeze(0).repeat(batch_size, 1).to(device)


for epoch in range(num_epoch):
    
    model.train()  
    classifier_head.train()

    # initialize the dataloader loop with tqdm (tqdm == progress bar)
    for step, batch in enumerate(tqdm(train_dataloader, leave=True)):
        
        # zero all gradients on each new step
        optimizer.zero_grad()
        optimizer_classifier.zero_grad()
        
        # prepare batches and more all to the active device
        inputs_ids_a = batch['premise_input_ids'].to(device)
        inputs_ids_b = batch['hypothesis_input_ids'].to(device)
        attention_a  = batch['premise_attention_mask'].to(device)
        attention_b  = batch['hypothesis_attention_mask'].to(device)
        label        = batch['labels'].to(device)
        
        # extract token embeddings from BERT at last_hidden_state
        u, _, _ = model(inputs_ids_a, segment_ids, masked_pos)
        v, _, _ = model(inputs_ids_b, segment_ids, masked_pos)
        

        u_last_hidden_state = u # all token embeddings A = batch_size, seq_len, hidden_dim
        v_last_hidden_state = v # all token embeddings B = batch_size, seq_len, hidden_dim

         # get the mean pooled vectors
        u_mean_pool = mean_pool(u_last_hidden_state, attention_a) # batch_size, hidden_dim
        v_mean_pool = mean_pool(v_last_hidden_state, attention_b) # batch_size, hidden_dim
        
        # build the |u-v| tensor
        uv = torch.sub(u_mean_pool, v_mean_pool)   # batch_size,hidden_dim
        uv_abs = torch.abs(uv) # batch_size,hidden_dim
        
        # concatenate u, v, |u-v|
        x = torch.cat([u_mean_pool, v_mean_pool, uv_abs], dim=-1) # batch_size, 3*hidden_dim
        
        # process concatenated tensor through classifier_head
        x = classifier_head(x) #batch_size, classifer
        
        # calculate the 'softmax-loss' between predicted and true label
        loss = criterion(x, label)
        
        # using loss, calculate gradients and then optimizerize
        loss.backward()
        optimizer.step()
        optimizer_classifier.step()

        scheduler.step() # update learning rate scheduler
        scheduler_classifier.step()
        
    print(f'Epoch: {epoch + 1} | loss = {loss.item():.6f}')

100%|██████████| 500/500 [02:54<00:00,  2.86it/s]


Epoch: 1 | loss = 3.693784


100%|██████████| 500/500 [03:02<00:00,  2.73it/s]


Epoch: 2 | loss = 2.335129


100%|██████████| 500/500 [02:56<00:00,  2.83it/s]


Epoch: 3 | loss = 4.647350


100%|██████████| 500/500 [02:46<00:00,  3.00it/s]


Epoch: 4 | loss = 2.525723


100%|██████████| 500/500 [03:17<00:00,  2.53it/s]


Epoch: 5 | loss = 1.167101


In [32]:
model.eval()
classifier_head.eval()
total_similarity = 0

with torch.no_grad():
    for step, batch in enumerate(eval_dataloader):
        # prepare batches and more all to the active device
        inputs_ids_a = batch['premise_input_ids'].to(device)
        inputs_ids_b = batch['hypothesis_input_ids'].to(device)
        attention_a = batch['premise_attention_mask'].to(device)
        attention_b = batch['hypothesis_attention_mask'].to(device)
        label = batch['labels'].to(device)
        
        # extract token embeddings from BERT at last_hidden_state
        u, _, _ = model(inputs_ids_a, segment_ids, masked_pos)  # all token embeddings A = batch_size, seq_len, hidden_dim
        v, _, _ = model(inputs_ids_b, segment_ids, masked_pos)  # all token embeddings B = batch_size, seq_len, hidden_dim

        # get the mean pooled vectors
        u_mean_pool = mean_pool(u, attention_a).detach().cpu().numpy().reshape(-1) # batch_size, hidden_dim
        v_mean_pool = mean_pool(v, attention_b).detach().cpu().numpy().reshape(-1) # batch_size, hidden_dim

        similarity_score = cosine_similarity(u_mean_pool, v_mean_pool)
        total_similarity += similarity_score
    
average_similarity = total_similarity / len(eval_dataloader)
print(f"Average Cosine Similarity: {average_similarity:.4f}")

Average Cosine Similarity: 0.9987


In [33]:
# save the model
torch.save(model.state_dict(), './model/sentenceBERT.pth')

In [34]:
model = BERT()
model.load_state_dict(torch.load('./model/sentenceBERT.pth'))
model.to(device)

BERT(
  (embedding): Embedding(
    (tok_embed): Embedding(7485, 768)
    (pos_embed): Embedding(1000, 768)
    (seg_embed): Embedding(2, 768)
    (norm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (layers): ModuleList(
    (0-5): 6 x EncoderLayer(
      (enc_self_attn): MultiHeadAttention(
        (W_Q): Linear(in_features=768, out_features=512, bias=True)
        (W_K): Linear(in_features=768, out_features=512, bias=True)
        (W_V): Linear(in_features=768, out_features=512, bias=True)
      )
      (pos_ffn): PoswiseFeedForwardNet(
        (fc1): Linear(in_features=768, out_features=3072, bias=True)
        (fc2): Linear(in_features=3072, out_features=768, bias=True)
      )
    )
  )
  (fc): Linear(in_features=768, out_features=768, bias=True)
  (activ): Tanh()
  (linear): Linear(in_features=768, out_features=768, bias=True)
  (norm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  (classifier): Linear(in_features=768, out_features=2, bias=True)
  (decod

### 7. Inference

In [35]:
import torch
from sklearn.metrics.pairwise import cosine_similarity

segment_ids = torch.tensor([0] * max_len).unsqueeze(0).repeat(batch_size, 1).to(device)

masked_pos = torch.tensor([0] * max_mask).unsqueeze(0).repeat(batch_size, 1).to(device)



def calculate_similarity(model, tokenizer, sentence_a, sentence_b, device):
    # Tokenize and convert sentences to input IDs and attention masks
    inputs_a = tokenizer.encode(sentence_a)
    inputs_b = tokenizer.encode(sentence_b)

    # Move input IDs and attention masks to the active device
    inputs_ids_a = inputs_a['input_ids'][0].unsqueeze(0).to(device)
    attention_a  = inputs_a['attention_mask'][0].unsqueeze(0).to(device)
    inputs_ids_b = inputs_b['input_ids'][0].unsqueeze(0).to(device)
    attention_b  = inputs_b['attention_mask'][0].unsqueeze(0).to(device)

    # Extract token embeddings from BERT
    u, _, _ = model(inputs_ids_a, segment_ids, masked_pos)  # all token embeddings A = batch_size, seq_len, hidden_dim
    v, _, _ = model(inputs_ids_b, segment_ids, masked_pos)# all token embeddings B = batch_size, seq_len, hidden_dim

    # Get the mean-pooled vectors
    u = mean_pool(u, attention_a).detach().cpu().numpy().reshape(-1)  # batch_size, hidden_dim
    v = mean_pool(v, attention_b).detach().cpu().numpy().reshape(-1)  # batch_size, hidden_dim

    # Calculate cosine similarity
    similarity_score = cosine_similarity(u.reshape(1, -1), v.reshape(1, -1))[0, 0]

    return similarity_score

# Example usage:
sentence_a = 'Your contribution helped make it possible for us to provide our students with a quality education.'
sentence_b = "Your contributions were of no help with our students' education."
similarity = calculate_similarity(model, tokenizer, sentence_a, sentence_b, device)
print(f"Cosine Similarity: {similarity:.4f}")

Cosine Similarity: 0.9985


In [36]:
# Example usage 2:
sentence_a = 'Your contribution helped make it possible for us to provide our students with a quality education.'
sentence_b = "This is a totally different sentence."
similarity = calculate_similarity(model, tokenizer, sentence_a, sentence_b, device)
print(f"Cosine Similarity: {similarity:.4f}")

Cosine Similarity: 0.9990
