# Sentence Transformers (SBERT) with PyTorch: Similarity and Semantic Search

**YouTube Link:** https://www.youtube.com/watch?v=nZ5j289WN8g

In [1]:
import warnings
warnings.filterwarnings("ignore")

import torch

# Prefer Apple's Metal backend (MPS) when PyTorch reports it as available; otherwise stay on the CPU.
device_name = "mps" if torch.backends.mps.is_available() else "cpu"
device = torch.device(device_name)
device

device(type='mps')

In [4]:
# macOS-only shell command: print the graphics hardware report (GPUs, VRAM, Metal support).
! system_profiler SPDisplaysDataType

Graphics/Displays:

    Intel UHD Graphics 630:

      Chipset Model: Intel UHD Graphics 630
      Type: GPU
      Bus: Built-In
      VRAM (Dynamic, Max): 1536 MB
      Vendor: Intel
      Device ID: 0x3e9b
      Revision ID: 0x0002
      Automatic Graphics Switching: Supported
      gMux Version: 5.0.0
      Metal Support: Metal 3

    Radeon Pro 560X:

      Chipset Model: Radeon Pro 560X
      Type: GPU
      Bus: PCIe
      PCIe Lane Width: x8
      VRAM (Total): 4 GB
      Vendor: AMD (0x1002)
      Device ID: 0x67ef
      Revision ID: 0x00c2
      ROM Revision: 113-C980AL-075
      VBIOS Version: 113-C97501U-005
      EFI Driver Version: 01.A1.075
      Automatic Graphics Switching: Supported
      gMux Version: 5.0.0
      Metal Support: Metal 2
      Displays:
        Color LCD:
          Display Type: Built-In Retina LCD
          Resolution: 2880 x 1800 Retina
          Framebuffer Depth: 24-Bit Color (ARGB8888)
          Main Display: Yes
          Mirror: Off
          Onl

In [5]:
# Write the same graphics hardware report to gpu_info.txt in the working directory.
! system_profiler SPDisplaysDataType > gpu_info.txt

In [6]:
# Record the Python/IPython versions and the versions of the listed packages used for this run.
%reload_ext watermark
%watermark -v -p numpy,pandas,torch,transformers

Python implementation: CPython
Python version       : 3.10.0
IPython version      : 8.26.0

numpy       : 1.26.4
pandas      : 2.2.2
torch       : 2.2.2
transformers: 4.44.0



In [15]:
from sentence_transformers import SentenceTransformer, util

# Reuse the device detected in the first cell instead of hard-coding "mps",
# so the notebook also runs on machines without Apple's Metal backend.
model = SentenceTransformer("all-mpnet-base-v2", device=str(device))
model

SentenceTransformer(
  (0): Transformer({'max_seq_length': 384, 'do_lower_case': False}) with Transformer model: MPNetModel 
  (1): Pooling({'word_embedding_dimension': 768, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
  (2): Normalize()
)

In [22]:
# Maximum input length in tokens (not characters); per the library docs, longer inputs are truncated.
model.max_seq_length

384

In [23]:
# Small demo corpus of short social-media style posts covering two unrelated topics,
# so that a topical query should clearly rank one group above the other.
corpus = [
    # Indices 0-5: crypto / trading posts
    "#Bitcoin is up 34% since Bank of England Governor said 'Be prepared to lose all your money in BTC and Crypto.'",
    "One of the biggest Bull traps I've ever seen.",
    "How I sleep knowing etherum is going to 10k in 2023",
    "Bitcoin is a scam",
    "IS THE $BTC BOTTOM IN? Are you team BULL or team BEAR? Do you have the DATA to prove your point??",
    "I will stop bragging about calling the top when I start bragging about calling the bottom. #bitcoin",
    # Indices 6-9: powerlifting / health posts
    "First powerlifting meet of the year and a new squat PR!!",
    "On January 9th, 2023, the American Academy of Pediatrics published new guidelines treating obesity in children and adolescents.",
    "What's worse, someone dropping the bar from the top of a deadlift or not putting their shopping cart away?",
    "The sport of powerlifting includes squat bench and deadlift. But the concept of powerlifting is Force= Mass x Acceleration.",
]

In [24]:
# Encode every corpus sentence in one call; the result is a single tensor with one row per sentence.
corpus_embeddings = model.encode(
    corpus,
    convert_to_tensor=True,
    show_progress_bar=True,
)
corpus_embeddings

Batches: 100%|██████████| 1/1 [00:02<00:00,  2.15s/it]


tensor([[-0.0671,  0.0218, -0.0305,  ..., -0.0016, -0.0345, -0.0030],
        [-0.0055, -0.0304,  0.0158,  ...,  0.0144,  0.0186,  0.0054],
        [-0.0240,  0.0824, -0.0400,  ...,  0.0156, -0.0155, -0.0518],
        ...,
        [ 0.0721,  0.0978,  0.0143,  ..., -0.0074, -0.0561, -0.0374],
        [-0.0283,  0.0284,  0.0064,  ..., -0.0165, -0.0167, -0.0041],
        [-0.0032, -0.0487,  0.0164,  ..., -0.0162,  0.0181, -0.0025]],
       device='mps:0')

In [25]:
# Expect (number of corpus sentences, embedding dimension).
corpus_embeddings.shape

torch.Size([10, 768])

In [26]:
# Free-text search query; it is embedded with the same model as the corpus in the next cell.
query = "How high will bitcoin go?"

In [27]:
# Embed the single query string with the same model that produced the corpus embeddings.
query_embeddings = model.encode(
    query,
    convert_to_tensor=True,
    show_progress_bar=True,
)
query_embeddings

Batches: 100%|██████████| 1/1 [00:05<00:00,  5.11s/it]


tensor([-1.6440e-02,  3.2738e-02, -3.3854e-02,  2.5782e-03,  4.6909e-02,
        -1.0622e-02, -4.9374e-02, -2.5929e-02,  4.1010e-02, -3.7868e-03,
         8.4012e-03,  4.4253e-03, -3.8478e-02,  1.1281e-01,  6.4806e-03,
        -4.7081e-02,  4.1241e-02, -2.2989e-02,  1.0063e-02, -4.0438e-02,
        -1.6192e-02, -5.6703e-02,  4.1887e-02, -2.9924e-03, -2.0055e-02,
         9.2723e-03,  2.3443e-02,  4.1819e-02, -3.8099e-02, -5.1336e-03,
        -7.1172e-02, -2.7961e-02, -4.7446e-02,  5.1631e-02,  1.4831e-06,
        -4.5730e-02,  6.1654e-04,  3.3853e-02, -1.7810e-02, -8.5618e-02,
        -8.5605e-02, -2.6440e-02, -9.6930e-03,  1.9836e-02, -1.2204e-02,
         2.4640e-02, -4.3704e-02,  2.7261e-02, -2.3189e-02,  2.2393e-02,
        -2.7534e-03, -6.5008e-02,  5.6481e-02,  9.2656e-03,  2.6180e-02,
        -4.4296e-02,  1.7331e-02,  5.5997e-02, -7.0871e-03,  3.7933e-02,
         2.9549e-03,  2.5222e-02,  1.5936e-02, -5.9650e-04,  1.3558e-02,
         2.8841e-02,  1.6692e-02,  3.8148e-02, -3.1

In [28]:
# Encoding a single string (not a list) yields a 1-D vector rather than a 2-D matrix.
query_embeddings.shape

torch.Size([768])

In [29]:
util.cos_sim(query_embeddings, corpus_embeddings[0])
# The query and corpus[0] (a Bitcoin post) are fairly similar: the score is clearly positive.

tensor([[0.4349]], device='mps:0')

In [30]:
util.cos_sim(query_embeddings, corpus_embeddings[9])
# The query and corpus[9] (a powerlifting post) are not similar: the score is close to zero.

tensor([[-0.0232]], device='mps:0')

In [38]:
# semantic_search returns one hit list per query; a single query was passed, so keep the first list.
search_hits = util.semantic_search(query_embeddings, corpus_embeddings)
result = search_hits[0]
result

[{'corpus_id': 2, 'score': 0.48030221462249756},
 {'corpus_id': 0, 'score': 0.4349438548088074},
 {'corpus_id': 4, 'score': 0.3824135363101959},
 {'corpus_id': 5, 'score': 0.3790991008281708},
 {'corpus_id': 3, 'score': 0.3513832986354828},
 {'corpus_id': 1, 'score': 0.0936562716960907},
 {'corpus_id': 6, 'score': 0.08781059086322784},
 {'corpus_id': 7, 'score': 0.07335850596427917},
 {'corpus_id': 8, 'score': 0.020925577729940414},
 {'corpus_id': 9, 'score': -0.023243635892868042}]

In [39]:
# Show each hit as "score | sentence", in the order returned by the search.
for item in result:
    score = round(item["score"], 2)
    sentence = corpus[item["corpus_id"]]
    print(score, "|", sentence)

0.48 | How I sleep knowing etherum is going to 10k in 2023
0.43 | #Bitcoin is up 34% since Bank of England Governor said 'Be prepared to lose all your money in BTC and Crypto.'
0.38 | IS THE $BTC BOTTOM IN? Are you team BULL or team BEAR? Do you have the DATA to prove your point??
0.38 | I will stop bragging about calling the top when I start bragging about calling the bottom. #bitcoin
0.35 | Bitcoin is a scam
0.09 | One of the biggest Bull traps I've ever seen.
0.09 | First powerlifting meet of the year and a new squat PR!!
0.07 | On January 9th, 2023, the American Academy of Pediatrics published new guidelines treating obesity in children and adolescents.
0.02 | What's worse, someone dropping the bar from the top of a deadlift or not putting their shopping cart away?
-0.02 | The sport of powerlifting includes squat bench and deadlift. But the concept of powerlifting is Force= Mass x Acceleration.


# Let's try another query

In [40]:
# Second query, this time about lifting rather than crypto, to check that the ranking changes with the topic.
query_2 = "How much should I deadlift at 82 kg bodyweight?"

In [41]:
# Embed the second query exactly as the first one was embedded.
query2_embeddings = model.encode(
    query_2,
    convert_to_tensor=True,
    show_progress_bar=True,
)
query2_embeddings

Batches: 100%|██████████| 1/1 [00:00<00:00, 19.95it/s]


tensor([ 4.2256e-02,  1.1967e-03,  8.6330e-03, -1.2649e-02,  1.1034e-02,
         1.6971e-02,  4.3388e-03, -7.4748e-03,  5.4104e-02,  1.9416e-03,
         2.5907e-02, -1.1454e-02, -1.6992e-02,  3.5316e-02,  7.2062e-03,
         6.2534e-02,  2.5968e-02, -3.7030e-02,  7.0303e-03, -4.8572e-02,
        -2.5945e-02, -5.3107e-02,  1.3602e-03, -1.9749e-02,  2.3767e-02,
         3.3390e-02,  2.5558e-03, -2.1845e-03, -1.3981e-02, -1.0333e-02,
        -5.7795e-02, -4.4480e-02,  3.1190e-02,  2.0615e-02,  1.4621e-06,
        -2.4923e-03,  6.9576e-02, -3.5369e-02, -3.9735e-03,  3.4671e-02,
        -4.9421e-02,  6.6175e-03,  4.1269e-02, -4.8071e-02, -6.1899e-04,
        -5.0947e-02,  1.4739e-02, -6.6379e-03,  1.8297e-02,  3.6581e-02,
         1.1360e-02, -1.5486e-02,  2.8479e-02,  3.3475e-02, -2.0449e-02,
        -9.6084e-03, -2.0978e-02,  8.7421e-02, -5.1352e-02,  5.5530e-02,
        -1.1669e-02,  3.1585e-02, -6.1537e-03,  3.7862e-02, -4.0668e-02,
        -7.3277e-04, -2.3658e-02,  1.3450e-02,  4.0

In [42]:
# Search the same corpus with the second query; keep the hit list for that single query.
search_hits_2 = util.semantic_search(query2_embeddings, corpus_embeddings)
result_2 = search_hits_2[0]
result_2

[{'corpus_id': 9, 'score': 0.32635003328323364},
 {'corpus_id': 8, 'score': 0.24540135264396667},
 {'corpus_id': 6, 'score': 0.13916906714439392},
 {'corpus_id': 7, 'score': 0.10069739818572998},
 {'corpus_id': 2, 'score': 0.047277357429265976},
 {'corpus_id': 0, 'score': 0.030338570475578308},
 {'corpus_id': 4, 'score': 0.01547253131866455},
 {'corpus_id': 5, 'score': 0.009970581158995628},
 {'corpus_id': 3, 'score': -0.015322110615670681},
 {'corpus_id': 1, 'score': -0.042820487171411514}]

In [43]:
# Show each hit for the second query as "score | sentence", in the order returned by the search.
for item in result_2:
    score = round(item["score"], 2)
    sentence = corpus[item["corpus_id"]]
    print(score, "|", sentence)

0.33 | The sport of powerlifting includes squat bench and deadlift. But the concept of powerlifting is Force= Mass x Acceleration.
0.25 | What's worse, someone dropping the bar from the top of a deadlift or not putting their shopping cart away?
0.14 | First powerlifting meet of the year and a new squat PR!!
0.1 | On January 9th, 2023, the American Academy of Pediatrics published new guidelines treating obesity in children and adolescents.
0.05 | How I sleep knowing etherum is going to 10k in 2023
0.03 | #Bitcoin is up 34% since Bank of England Governor said 'Be prepared to lose all your money in BTC and Crypto.'
0.02 | IS THE $BTC BOTTOM IN? Are you team BULL or team BEAR? Do you have the DATA to prove your point??
0.01 | I will stop bragging about calling the top when I start bragging about calling the bottom. #bitcoin
-0.02 | Bitcoin is a scam
-0.04 | One of the biggest Bull traps I've ever seen.


# Testing the similarity between two sentences

In [None]:
# The two sentences compared below; they are the same texts as corpus[0] and corpus[1].
sentence_1 = "#Bitcoin is up 34% since Bank of England Governor said 'Be prepared to lose all your money in BTC and Crypto.'"
sentence_2 = "One of the biggest Bull traps I've ever seen."

Now we compute the embeddings of the two sentences above.

In [31]:
# Encode the two sentences one after the other (two separate encode calls, as before).
sentence1_embeddings, sentence2_embeddings = (
    model.encode(sentence, show_progress_bar=True, convert_to_tensor=True)
    for sentence in (sentence_1, sentence_2)
)

Batches: 100%|██████████| 1/1 [00:00<00:00,  5.27it/s]
Batches: 100%|██████████| 1/1 [00:02<00:00,  2.23s/it]


In [32]:
# Both embeddings should have the same shape, otherwise they cannot be compared.
for embedding in (sentence1_embeddings, sentence2_embeddings):
    print(embedding.shape)

torch.Size([768])
torch.Size([768])


In [34]:
util.cos_sim(sentence1_embeddings, sentence2_embeddings)
# To a human the two sentences are related, but the pre-trained model gives them a low similarity score (see the result below).

tensor([[0.1469]], device='mps:0')

# Train our own model

In [1]:
from sentence_transformers import SentencesDataset, InputExample, losses
from torch.utils.data import DataLoader
from datasets import Dataset
from sentence_transformers import SentenceTransformer, util

import warnings
warnings.filterwarnings("ignore")

# NOTE(review): this run pins the model to the CPU, whereas the similarity cells above used MPS —
# presumably to sidestep MPS problems during training; confirm before changing.
model = SentenceTransformer("all-mpnet-base-v2", device='cpu')
model

  from tqdm.autonotebook import tqdm, trange


SentenceTransformer(
  (0): Transformer({'max_seq_length': 384, 'do_lower_case': False}) with Transformer model: MPNetModel 
  (1): Pooling({'word_embedding_dimension': 768, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
  (2): Normalize()
)

In [2]:
# A single training pair: the two sentences the pre-trained model scored as dissimilar,
# labelled with the target cosine similarity (0.9) that the model should learn.
dataset = SentencesDataset([
    InputExample(
        texts=[
            "#Bitcoin is up 34% since Bank of England Governor said 'Be prepared to lose all your money in BTC and Crypto.'",
            # The comma above matters: without it Python joins the adjacent string literals
            # into ONE text, and CosineSimilarityLoss needs a pair of texts to compare.
            "One of the biggest Bull traps I've ever seen."
        ], label=0.9
    )
], model)

In [3]:
# Batches of up to 16 examples, reshuffled on every pass over the dataset.
dataloader = DataLoader(dataset, batch_size=16, shuffle=True)
print(dataloader)

# Loss on the cosine similarity of the two sentence embeddings versus the label
# (the printed module below shows it wraps an MSELoss).
loss = losses.CosineSimilarityLoss(model=model)
print(loss)

<torch.utils.data.dataloader.DataLoader object at 0x1129cbcd0>
CosineSimilarityLoss(
  (model): SentenceTransformer(
    (0): Transformer({'max_seq_length': 384, 'do_lower_case': False}) with Transformer model: MPNetModel 
    (1): Pooling({'word_embedding_dimension': 768, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
    (2): Normalize()
  )
  (loss_fct): MSELoss()
  (cos_score_transformation): Identity()
)


In [4]:
import os
# Output directory passed to model.fit() below.
save_path = "trained_model"
# exist_ok=True makes this cell safe to re-run when the directory is already there.
os.makedirs(save_path, exist_ok=True)

In [8]:
# Fine-tune on the (dataloader, loss) objective and write the result to save_path.
model.fit(
    train_objectives=[(dataloader, loss)],
    epochs=10,
    warmup_steps=10,
    output_path=save_path,
)

# Another Method: custom collate function

In [1]:
from sentence_transformers import SentenceTransformer, SentencesDataset, InputExample, losses
from torch.utils.data import DataLoader
import torch

# Load the pre-trained model. No device is passed here, so the library picks one itself.
# NOTE(review): the later cos_sim output shows device='mps:0', so it presumably auto-selects MPS when available — confirm.
model = SentenceTransformer("all-mpnet-base-v2")
print(model)

# Prepare the dataset. Only ONE example is listed here; append more InputExample entries to train on more pairs.
dataset = SentencesDataset([
    InputExample(
        texts=[
            "#Bitcoin is up 34% since Bank of England Governor said 'Be prepared to lose all your money in BTC and Crypto.'",
            "One of the biggest Bull traps I've ever seen."
        ],
        label=0.9,
    ),
], model)

# Custom collate function
def collate_fn(batch):
    """Split a batch of examples into their texts and a float tensor of their labels.

    Args:
        batch: iterable of objects exposing ``.texts`` and ``.label``.

    Returns:
        A ``(texts, labels)`` tuple: ``texts`` is a list holding each example's
        ``.texts`` value, ``labels`` is a 1-D ``torch.float`` tensor with one
        entry per example, in the same order.
    """
    texts = []
    label_values = []
    for example in batch:
        texts.append(example.texts)
        label_values.append(example.label)
    return texts, torch.tensor(label_values, dtype=torch.float)

# import os
# os.environ['PYTORCH_MPS_HIGH_WATERMARK_RATIO'] = '0.0'

# Initialize the DataLoader with the custom collate function.
# No batch_size is passed, so the DataLoader's default batch size applies.
dataloader = DataLoader(dataset, shuffle=True, collate_fn=collate_fn)
print(f"Number of batches: {len(dataloader)}")

# Define loss function
loss = losses.CosineSimilarityLoss(model=model)
print(loss)

# Preview the first batch only (texts plus label shape/dtype), then stop iterating.
for batch in dataloader:
    print(f"Texts: {batch[0]}")
    print(f"Labels shape: {batch[1].shape}, Type: {batch[1].dtype}")
    break

# Train the model and write the result to the "trained_model" directory.
# NOTE(review): model.fit() does its own batching and may replace the collate_fn set above,
# in which case collate_fn only affects the preview loop — confirm against the library docs.
model.fit(train_objectives=[(dataloader, loss)], 
          epochs=10, 
          warmup_steps=10, 
          output_path="trained_model")

print("Training completed successfully!")

  from tqdm.autonotebook import tqdm, trange


SentenceTransformer(
  (0): Transformer({'max_seq_length': 384, 'do_lower_case': False}) with Transformer model: MPNetModel 
  (1): Pooling({'word_embedding_dimension': 768, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
  (2): Normalize()
)
Number of batches: 1
CosineSimilarityLoss(
  (model): SentenceTransformer(
    (0): Transformer({'max_seq_length': 384, 'do_lower_case': False}) with Transformer model: MPNetModel 
    (1): Pooling({'word_embedding_dimension': 768, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
    (2): Normalize()
  )
  (loss_fct): MSELoss()
  (cos_score_transformation): Id

100%|██████████| 10/10 [00:09<00:00,  1.01it/s]


{'train_runtime': 9.8525, 'train_samples_per_second': 1.015, 'train_steps_per_second': 1.015, 'train_loss': 0.3232328653335571, 'epoch': 10.0}


                                                                     

Training completed successfully!




# Another Method: custom Dataset class

In [7]:
from sentence_transformers import SentenceTransformer, InputExample, losses
import torch
from torch.utils.data import DataLoader, Dataset

# Custom Dataset
class CustomDataset(Dataset):
    """Minimal map-style dataset that serves items from an in-memory sequence."""

    def __init__(self, examples):
        """Keep a reference to ``examples``; nothing is copied or transformed."""
        self.examples = examples

    def __len__(self) -> int:
        """Return the number of stored examples."""
        return len(self.examples)

    def __getitem__(self, idx):
        """Return the example stored at position ``idx``, unchanged."""
        return self.examples[idx]

# Load the pre-trained model (no device is passed, so the library chooses one).
model = SentenceTransformer("all-mpnet-base-v2")
print(model)

# Prepare dataset: a single sentence pair labelled with the target similarity 0.9.
train_examples = [
    InputExample(texts=[
        "#Bitcoin is up 34% since Bank of England Governor said 'Be prepared to lose all your money in BTC and Crypto.'",
        "One of the biggest Bull traps I've ever seen."
        ], label=0.9),
]

train_dataset = CustomDataset(train_examples)

# DataLoader over the custom dataset; no batch_size is passed, so the default applies.
train_dataloader = DataLoader(train_dataset, shuffle=True)

# Loss used for fine-tuning: CosineSimilarityLoss bound to the model being trained.
train_loss = losses.CosineSimilarityLoss(model)

import os
# Output directory for the fine-tuned model; exist_ok=True keeps the cell re-runnable.
save_path = "trained_model"
os.makedirs(save_path, exist_ok=True)

# Train the model and write the result to save_path.
model.fit(
    train_objectives=[(train_dataloader, train_loss)],
    epochs=10,
    warmup_steps=10,
    output_path=save_path
)

print("Training completed successfully!")



SentenceTransformer(
  (0): Transformer({'max_seq_length': 384, 'do_lower_case': False}) with Transformer model: MPNetModel 
  (1): Pooling({'word_embedding_dimension': 768, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
  (2): Normalize()
)


100%|██████████| 10/10 [00:08<00:00,  1.14it/s]


{'train_runtime': 8.7875, 'train_samples_per_second': 1.138, 'train_steps_per_second': 1.138, 'train_loss': 0.3232328653335571, 'epoch': 10.0}


                                                                     

Training completed successfully!




# Testing our trained model

In [2]:
from sentence_transformers import SentenceTransformer, util

In [3]:
# Reload the fine-tuned model from the directory that model.fit() wrote to.
trained_model = SentenceTransformer("trained_model")

# Same sentence pair that was used as the training example.
sentence_1 = "#Bitcoin is up 34% since Bank of England Governor said 'Be prepared to lose all your money in BTC and Crypto.'"
sentence_2 = "One of the biggest Bull traps I've ever seen."

# Encode with the reloaded model, not the in-memory `model`, so this cell really
# tests the saved checkpoint and also works on a fresh kernel.
sentence1_embeddings = trained_model.encode(sentence_1, show_progress_bar=True, convert_to_tensor=True)
sentence2_embeddings = trained_model.encode(sentence_2, show_progress_bar=True, convert_to_tensor=True)

Batches: 100%|██████████| 1/1 [00:00<00:00, 12.19it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 10.41it/s]


In [4]:
# Similarity of the pair after fine-tuning; compare with the low pre-training score earlier in the notebook.
# The model was trained on exactly this pair, so a high score here shows it fitted the example, not that it generalises.
util.cos_sim(sentence1_embeddings, sentence2_embeddings)

tensor([[0.8297]], device='mps:0')