# [Sentence-BERT](https://arxiv.org/pdf/1908.10084.pdf)

[Reference Code](https://www.pinecone.io/learn/series/nlp/train-sentence-transformers-softmax/)

In [2]:
import os
import math
import re
from   random import *
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

# Set GPU device
# os.environ["CUDA_VISIBLE_DEVICES"] = "0"

# os.environ['http_proxy']  = 'http://192.41.170.23:3128'
# os.environ['https_proxy'] = 'http://192.41.170.23:3128'
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cpu')

## 1. Data

### Train, Test, Validation 

In [3]:
import datasets
snli = datasets.load_dataset('snli')
mnli = datasets.load_dataset('glue', 'mnli')
mnli['train'].features, snli['train'].features

  from .autonotebook import tqdm as notebook_tqdm


({'premise': Value(dtype='string', id=None),
  'hypothesis': Value(dtype='string', id=None),
  'label': ClassLabel(names=['entailment', 'neutral', 'contradiction'], id=None),
  'idx': Value(dtype='int32', id=None)},
 {'premise': Value(dtype='string', id=None),
  'hypothesis': Value(dtype='string', id=None),
  'label': ClassLabel(names=['entailment', 'neutral', 'contradiction'], id=None)})

In [4]:
# import json
# import datasets
# from datasets import ClassLabel

# dataset_file = r"C:\Users\Tairo Kageyama\Documents\GitHub\Python-fo-Natural-Language-Processing-main\lab5\snli_1.0\snli_1.0\snli_1.0_train_test.jsonl"

# premise = []
# hypothesis = []
# label = []

# with open(dataset_file, "r", encoding="utf-8") as f:
#     for line in f:
#         data = json.loads(line)
#         premise.append(data["sentence1"])
#         hypothesis.append(data["sentence2"])
#         label.append(data["gold_label"])

# features = {
#     "premise": premise,
#     "hypothesis": hypothesis,
#     "label": label
# }

# custom_dataset = datasets.Dataset.from_dict(features)

# custom_dataset.features['label'] = ClassLabel(names=['entailment', 'neutral', 'contradiction'], id=None)

# custom_dataset.features

# # custom_dataset = custom_dataset.map(lambda example: {"label": example["label"]}, features=datasets.Features({"label": ClassLabel(names=['entailment', 'neutral', 'contradiction'], id=None)}))
# # custom_dataset = custom_dataset.map(lambda example: {"premise": example["premise"], "hypothesis": example["hypothesis"]})
# # custom_dataset.features


In [5]:
# List of datasets to remove 'idx' column from
mnli.column_names.keys()

dict_keys(['train', 'validation_matched', 'validation_mismatched', 'test_matched', 'test_mismatched'])

In [6]:
# Remove 'idx' column from each dataset
for column_names in mnli.column_names.keys():
    mnli[column_names] = mnli[column_names].remove_columns('idx')

In [7]:
mnli.column_names.keys()

dict_keys(['train', 'validation_matched', 'validation_mismatched', 'test_matched', 'test_mismatched'])

In [8]:
import numpy as np
np.unique(mnli['train']['label']), np.unique(snli['train']['label'])
#snli also have -1

(array([0, 1, 2]), array([-1,  0,  1,  2]))

In [9]:
# there are -1 values in the label feature, these are where no class could be decided so we remove
snli = snli.filter(
    lambda x: 0 if x['label'] == -1 else 1
)

In [10]:
import numpy as np
np.unique(mnli['train']['label']), np.unique(snli['train']['label'])
#snli also have -1

(array([0, 1, 2]), array([0, 1, 2]))

In [11]:
# Assuming you have your two DatasetDict objects named snli and mnli
from datasets import DatasetDict
# Merge the two DatasetDict objects
raw_dataset = DatasetDict({
    'train': datasets.concatenate_datasets([snli['train'], mnli['train']]).shuffle(seed=55).select(list(range(1000))),
    'test': datasets.concatenate_datasets([snli['test'], mnli['test_mismatched']]).shuffle(seed=55).select(list(range(100))),
    'validation': datasets.concatenate_datasets([snli['validation'], mnli['validation_mismatched']]).shuffle(seed=55).select(list(range(1000)))
})
#remove .select(list(range(1000))) in order to use full dataset
# Now, merged_dataset_dict contains the combined datasets from snli and mnli
raw_dataset

DatasetDict({
    train: Dataset({
        features: ['premise', 'hypothesis', 'label'],
        num_rows: 1000
    })
    test: Dataset({
        features: ['premise', 'hypothesis', 'label'],
        num_rows: 100
    })
    validation: Dataset({
        features: ['premise', 'hypothesis', 'label'],
        num_rows: 1000
    })
})

## 2. Preprocessing

In [12]:
from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
tokenizer

BertTokenizer(name_or_path='bert-base-uncased', vocab_size=30522, model_max_length=512, is_fast=False, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True),  added_tokens_decoder={
	0: AddedToken("[PAD]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	100: AddedToken("[UNK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	101: AddedToken("[CLS]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	102: AddedToken("[SEP]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	103: AddedToken("[MASK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}

In [13]:
def preprocess_function(examples):
    max_seq_length = 128
    padding = 'max_length'
    # Tokenize the premise
    premise_result = tokenizer(
        examples['premise'], padding=padding, max_length=max_seq_length, truncation=True)
    print(premise_result)
    #num_rows, max_seq_length
    # Tokenize the hypothesis
    hypothesis_result = tokenizer(
        examples['hypothesis'], padding=padding, max_length=max_seq_length, truncation=True)
    #num_rows, max_seq_length
    # Extract labels
    labels = examples["label"]
    #num_rows
    return {
        "premise_input_ids": premise_result["input_ids"],
        "premise_attention_mask": premise_result["attention_mask"],
        "hypothesis_input_ids": hypothesis_result["input_ids"],
        "hypothesis_attention_mask": hypothesis_result["attention_mask"],
        "labels" : labels
    }

tokenized_datasets = raw_dataset.map(
    preprocess_function,
    batched=True,
)

tokenized_datasets = tokenized_datasets.remove_columns(['premise','hypothesis','label'])
tokenized_datasets.set_format("torch")

In [14]:
tokenized_datasets['train'][0]

{'premise_input_ids': tensor([ 101, 2092, 2027, 1005, 2128, 4795,  102,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0]),
 'premise_attention_mask': tensor([1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
   

## 3. Data loader

In [15]:
from torch.utils.data import DataLoader

# initialize the dataloader
batch_size = 32
train_dataloader = DataLoader(
    tokenized_datasets['train'], 
    batch_size=batch_size, 
    shuffle=True
)
eval_dataloader = DataLoader(
    tokenized_datasets['validation'], 
    batch_size=batch_size
)
test_dataloader = DataLoader(
    tokenized_datasets['test'], 
    batch_size=batch_size
)

In [16]:
for batch in train_dataloader:
    print(batch['premise_input_ids'].shape)
    print(batch['premise_attention_mask'].shape)
    print(batch['hypothesis_input_ids'].shape)
    print(batch['hypothesis_attention_mask'].shape)
    print(batch['labels'].shape)
    break

torch.Size([32, 128])
torch.Size([32, 128])
torch.Size([32, 128])
torch.Size([32, 128])
torch.Size([32])


## 4. Model

In [17]:
# start from a pretrained bert-base-uncased model
from transformers import BertTokenizer, BertModel
model = BertModel.from_pretrained('bert-base-uncased')
model.to(device)

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0-11): 12 x BertLayer(
        (attention): BertAttention(
          (self): BertSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False)
  

### Pooling
SBERT adds a pooling operation to the output of BERT / RoBERTa to derive a fixed sized sentence embedding

In [18]:
# define mean pooling function
def mean_pool(token_embeds, attention_mask):
    # reshape attention_mask to cover 768-dimension embeddings
    in_mask = attention_mask.unsqueeze(-1).expand(
        token_embeds.size()
    ).float()
    # perform mean-pooling but exclude padding tokens (specified by in_mask)
    pool = torch.sum(token_embeds * in_mask, 1) / torch.clamp(
        in_mask.sum(1), min=1e-9
    )
    return pool

## 5. Loss Function

## Classification Objective Function 
We concatenate the sentence embeddings $u$ and $v$ with the element-wise difference  $\lvert u - v \rvert $ and multiply the result with the trainable weight  $ W_t ∈  \mathbb{R}^{3n \times k}  $:

$ o = \text{softmax}\left(W^T \cdot \left(u, v, \lvert u - v \rvert\right)\right) $

where $n$ is the dimension of the sentence embeddings and k the number of labels. We optimize cross-entropy loss. This structure is depicted in Figure 1.

## Regression Objective Function. 
The cosine similarity between the two sentence embeddings $u$ and $v$ is computed (Figure 2). We use means quared-error loss as the objective function.

(Manhatten / Euclidean distance, semantically  similar sentences can be found.)

<img src="./figures/sbert-architecture.png" >

In [19]:
def configurations(u,v):
    # build the |u-v| tensor
    uv = torch.sub(u, v)   # batch_size,hidden_dim
    uv_abs = torch.abs(uv) # batch_size,hidden_dim
    
    # concatenate u, v, |u-v|
    x = torch.cat([u, v, uv_abs], dim=-1) # batch_size, 3*hidden_dim
    return x

def cosine_similarity(u, v):
    dot_product = np.dot(u, v)
    norm_u = np.linalg.norm(u)
    norm_v = np.linalg.norm(v)
    similarity = dot_product / (norm_u * norm_v)
    return similarity

<img src="./figures/sbert-ablation.png" width="350" height="300">

In [20]:
classifier_head = torch.nn.Linear(768*3, 3).to(device)

optimizer = torch.optim.Adam(model.parameters(), lr=2e-5)
optimizer_classifier = torch.optim.Adam(classifier_head.parameters(), lr=2e-5)

criterion = nn.CrossEntropyLoss()

In [21]:
from transformers import get_linear_schedule_with_warmup

# and setup a warmup for the first ~10% steps
total_steps = int(len(raw_dataset) / batch_size)
warmup_steps = int(0.1 * total_steps)
scheduler = get_linear_schedule_with_warmup(
		optimizer, num_warmup_steps=warmup_steps,
  	num_training_steps=total_steps - warmup_steps
)

# then during the training loop we update the scheduler per step
scheduler.step()

scheduler_classifier = get_linear_schedule_with_warmup(
		optimizer_classifier, num_warmup_steps=warmup_steps,
  	num_training_steps=total_steps - warmup_steps
)

# then during the training loop we update the scheduler per step
scheduler_classifier.step()



## 6. Training

In [22]:
# from tqdm.auto import tqdm

# num_epoch = 2
# # 1 epoch should be enough, increase if wanted
# for epoch in range(num_epoch):
#     model.train()  
#     classifier_head.train()
#     # initialize the dataloader loop with tqdm (tqdm == progress bar)
#     for step, batch in enumerate(tqdm(train_dataloader, leave=True)):
#         # zero all gradients on each new step
#         optimizer.zero_grad()
#         optimizer_classifier.zero_grad()
        
#         # prepare batches and more all to the active device
#         inputs_ids_a = batch['premise_input_ids'].to(device)
#         inputs_ids_b = batch['hypothesis_input_ids'].to(device)
#         attention_a = batch['premise_attention_mask'].to(device)
#         attention_b = batch['hypothesis_attention_mask'].to(device)
#         label = batch['labels'].to(device)
        
#         # extract token embeddings from BERT at last_hidden_state
#         u = model(inputs_ids_a, attention_mask=attention_a)  
#         v = model(inputs_ids_b, attention_mask=attention_b)  

#         u_last_hidden_state = u.last_hidden_state # all token embeddings A = batch_size, seq_len, hidden_dim
#         v_last_hidden_state = v.last_hidden_state # all token embeddings B = batch_size, seq_len, hidden_dim

#          # get the mean pooled vectors
#         u_mean_pool = mean_pool(u_last_hidden_state, attention_a) # batch_size, hidden_dim
#         v_mean_pool = mean_pool(v_last_hidden_state, attention_b) # batch_size, hidden_dim
        
#         # build the |u-v| tensor
#         uv = torch.sub(u_mean_pool, v_mean_pool)   # batch_size,hidden_dim
#         uv_abs = torch.abs(uv) # batch_size,hidden_dim
        
#         # concatenate u, v, |u-v|
#         x = torch.cat([u_mean_pool, v_mean_pool, uv_abs], dim=-1) # batch_size, 3*hidden_dim
        
#         # process concatenated tensor through classifier_head
#         x = classifier_head(x) #batch_size, classifer
        
#         # calculate the 'softmax-loss' between predicted and true label
#         loss = criterion(x, label)
        
#         # using loss, calculate gradients and then optimizerize
#         loss.backward()
#         optimizer.step()
#         optimizer_classifier.step()

#         scheduler.step() # update learning rate scheduler
#         scheduler_classifier.step()
        
#     print(f'Epoch: {epoch + 1} | loss = {loss.item():.6f}')
#     if epoch == num_epoch-1:
#         torch.save(model.state_dict(), 'S-BERT.pt')
#         torch.save(model.state_dict(), 'S-BERT.pth')

In [24]:
trained = torch.load(r'C:\Users\Tairo Kageyama\Documents\GitHub\Python-fo-Natural-Language-Processing-main\lab5\model\S-BERT.pt')
model = BertModel.from_pretrained('bert-base-uncased')
model.to(device)
model.load_state_dict(trained)
model.eval()
classifier_head.eval()
total_similarity = 0
with torch.no_grad():
    for step, batch in enumerate(eval_dataloader):
        # prepare batches and more all to the active device
        inputs_ids_a = batch['premise_input_ids'].to(device)
        inputs_ids_b = batch['hypothesis_input_ids'].to(device)
        attention_a = batch['premise_attention_mask'].to(device)
        attention_b = batch['hypothesis_attention_mask'].to(device)
        label = batch['labels'].to(device)
        
        # extract token embeddings from BERT at last_hidden_state
        u = model(inputs_ids_a, attention_mask=attention_a)[0]  # all token embeddings A = batch_size, seq_len, hidden_dim
        v = model(inputs_ids_b, attention_mask=attention_b)[0]  # all token embeddings B = batch_size, seq_len, hidden_dim

        # get the mean pooled vectors
        u_mean_pool = mean_pool(u, attention_a).detach().cpu().numpy().reshape(-1) # batch_size, hidden_dim
        v_mean_pool = mean_pool(v, attention_b).detach().cpu().numpy().reshape(-1) # batch_size, hidden_dim

        similarity_score = cosine_similarity(u_mean_pool, v_mean_pool)
        total_similarity += similarity_score
    
average_similarity = total_similarity / len(eval_dataloader)
print(f"Average Cosine Similarity: {average_similarity:.4f}")

KeyboardInterrupt: 

## 7. Inference

In [27]:
import torch
from sklearn.metrics.pairwise import cosine_similarity

trained = torch.load(r'C:\Users\Tairo Kageyama\Documents\GitHub\Python-fo-Natural-Language-Processing-main\lab5\model\S-BERT.pt')
model = BertModel.from_pretrained('bert-base-uncased')
model.to(device)
model.load_state_dict(trained)

def calculate_similarity(model, tokenizer, sentence_a, sentence_b, device):
    # Tokenize and convert sentences to input IDs and attention masks
    inputs_a = tokenizer(sentence_a, return_tensors='pt', truncation=True, padding=True).to(device)
    inputs_b = tokenizer(sentence_b, return_tensors='pt', truncation=True, padding=True).to(device)

    # Move input IDs and attention masks to the active device
    inputs_ids_a = inputs_a['input_ids']
    attention_a = inputs_a['attention_mask']
    inputs_ids_b = inputs_b['input_ids']
    attention_b = inputs_b['attention_mask']

    # Extract token embeddings from BERT
    u = model(inputs_ids_a, attention_mask=attention_a)[0]  # all token embeddings A = batch_size, seq_len, hidden_dim
    v = model(inputs_ids_b, attention_mask=attention_b)[0]  # all token embeddings B = batch_size, seq_len, hidden_dim

    # Get the mean-pooled vectors
    u = mean_pool(u, attention_a).detach().cpu().numpy().reshape(-1)  # batch_size, hidden_dim
    v = mean_pool(v, attention_b).detach().cpu().numpy().reshape(-1)  # batch_size, hidden_dim

    # Calculate cosine similarity
    print(u, v)
    # print(u.reshape(1, -1), v.reshape(1, -1))
    similarity_score = cosine_similarity(u.reshape(1, -1), v.reshape(1, -1))[0, 0]

    return similarity_score

# Example usage:
sentence_a = 'Your contribution helped make it possible for us to provide our students with a quality education.'
sentence_b = "Your contributions were of no help with our students' education."
similarity = calculate_similarity(model, tokenizer, sentence_a, sentence_b, device)
print(f"Cosine Similarity: {similarity:.4f}")

[ 1.49784118e-01  1.56582311e-01 -1.35506585e-01  2.07037881e-01
  1.88439071e-01 -7.51025081e-02  5.41316748e-01  9.45977449e-01
 -4.87500101e-01 -8.27707112e-01  5.08777678e-01 -2.37404823e-01
  1.30652368e-01  4.14803892e-01 -1.38623402e-01  8.38009920e-03
  2.32306585e-01  1.02923207e-01 -2.94782400e-01 -7.64665604e-02
 -5.61421573e-01 -3.46141249e-01  1.02922425e-01  8.76278520e-01
  4.38546479e-01  8.52520242e-02  2.18256444e-01  2.14446187e-01
  7.51448125e-02 -1.82092443e-01  2.42360160e-01  2.31781956e-02
 -1.84453458e-01 -1.27205297e-01 -8.32326487e-02 -1.74275592e-01
 -1.35676101e-01  1.55047458e-02 -4.55836505e-01  9.12900269e-02
 -3.02345484e-01 -1.65715992e-01  1.11455202e-01 -1.75064743e-01
 -1.05880037e-01 -3.26010197e-01  9.96159464e-02  3.43973860e-02
  4.56195742e-01 -1.00351237e-01 -4.55192626e-01  1.34634554e-01
 -2.33147651e-01 -1.15132384e-01  1.82590514e-01  4.23503816e-01
 -5.30576110e-01 -3.60495746e-01 -6.45343959e-01 -1.54923320e-01
  2.87239105e-01  1.89003

Evaluation

In [None]:
Syntactic = "[CLS] fermat's last [MASK] [SEP]"
Semantic = "[CLS] fermat's last theorem is among the most notable theorems in the history of [MASK] [SEP]"

Syntactic = tokenizer(Syntactic, padding='max_length', max_length=5, truncation=True)
Semantic= tokenizer(Semantic, padding='max_length', max_length=16, truncation=True)


# Syntactic_id = []
# Semantic_id = []

# for Sy in Syntatic:
#     Syntactic_id.append(word2id[Sy])

# for Se in Semantic:
#     Semantic_id.append(word2id[Se])

# print(type(Syntactic['input_ids']))
# print(Semantic)

input_syn = torch.tensor([Syntactic['input_ids']])
input_sem = torch.tensor([Semantic['input_ids']])
seg_syn = torch.tensor([Syntactic['token_type_ids']])
seg_sem = torch.tensor([Semantic['token_type_ids']])
mask_syn = torch.tensor([[3]])
mask_sem = torch.tensor([[14]])
att_syn = torch.tensor([Syntactic['attention_mask']])
att_sem = torch.tensor([Semantic['attention_mask']])

In [None]:
from transformers import BertTokenizer, BertModel

text = "Replace this text with your own sentence to predict its BERT embeddings."
inputs = tokenizer(text, return_tensors='pt')

trained = torch.load(r'C:\Users\Tairo Kageyama\Documents\GitHub\Python-fo-Natural-Language-Processing-main\lab5\model\S-BERT.pt')
model = BertModel.from_pretrained('bert-base-uncased')
model.to(device)
model.load_state_dict(trained)
# logits_lm, logits_nsp = model(input_syn, seg_syn, mask_syn)

with torch.no_grad():
    outputs = model(**inputs)

last_hidden_states = outputs.last_hidden_state

print("Shape of last hidden states:", last_hidden_states.shape)

Shape of last hidden states: torch.Size([1, 18, 768])


In [None]:
# from transformers import BertTokenizer, BertForMaskedLM
# import torch

# # BERTのトークナイザーを読み込む
# tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# # テスト文を定義する
# Syntactic = "[CLS] fermat's last [MASK] [SEP]"

# # テキストをトークン化し、トークンIDに変換する
# tokenized_text = tokenizer.tokenize(Syntactic)
# indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)

# # マスクされたトークンの位置を特定する
# masked_index = tokenized_text.index('[MASK]')

# # トークンをテンソルに変換する
# tokens_tensor = torch.tensor([indexed_tokens])

# # BERTのモデルを読み込む
# inputs = tokenizer(Syntactic, return_tensors='pt')
# trained = torch.load(r'C:\Users\Tairo Kageyama\Documents\GitHub\Python-fo-Natural-Language-Processing-main\lab5\model\S-BERT.pt')
# model = BertModel.from_pretrained('bert-base-uncased')
# model.to(device)
# model.load_state_dict(trained)
# model.eval()

# # テキストをBERTモデルに入力して、マスクされたトークンを予測する
# with torch.no_grad():
#     outputs = model(tokens_tensor)
#     predictions = outputs[0][0, masked_index].topk(5)  # トップ5の予測を取得

# # 予測されたトークンを出力する
# predicted_tokens = tokenizer.convert_ids_to_tokens(predictions.indices.tolist())
# print("Predicted tokens:", predicted_tokens)


Predicted tokens: ['[unused241]', '[unused652]', '[unused687]', '[unused492]', '[unused632]']
