In [1]:
!pip install datasets tokenizers torch tqdm

Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch)
  Downloading nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-curand-cu12==10.3.5.147 (from torch)
  Downloading nvidia_curand_cu12-10.3.5

In [3]:
!pip install --upgrade datasets fsspec huggingface_hub

Collecting datasets
  Downloading datasets-3.6.0-py3-none-any.whl.metadata (19 kB)
Collecting fsspec
  Downloading fsspec-2025.5.1-py3-none-any.whl.metadata (11 kB)
Collecting huggingface_hub
  Downloading huggingface_hub-0.33.1-py3-none-any.whl.metadata (14 kB)
Collecting fsspec
  Downloading fsspec-2025.3.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.6.0-py3-none-any.whl (491 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m491.5/491.5 kB[0m [31m31.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2025.3.0-py3-none-any.whl (193 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m193.6/193.6 kB[0m [31m17.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading huggingface_hub-0.33.1-py3-none-any.whl (515 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m515.4/515.4 kB[0m [31m38.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: fsspec, huggingface_hub, datasets
  Attempting uninstall: fsspec
    Found 

# Import libraries

In [25]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, Dataset
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModel
from tqdm.auto import tqdm
from scipy.stats import spearmanr

# Hyperparameters & Setup

In [26]:
MODEL_NAME   = 'bert-base-uncased'  # checkpoint name passed to AutoTokenizer / AutoModel
MAX_LEN      = 128                  # tokenizer max_length (sequences are padded/truncated to this)
BATCH_SIZE   = 32                   # batch size used by the STS data loaders
LR           = 2e-5                 # AdamW learning rate
WEIGHT_DECAY = 0.01                 # AdamW weight decay
EPOCHS       = 5                    # number of passes over the training split
SEED         = 42                   # seed for torch's random number generators
DEVICE       = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Seed torch's RNGs up front so operations that draw from them (e.g. DataLoader
# shuffling) are repeatable from one "Restart & Run All" to the next.
# Some GPU kernels may still be non-deterministic.
torch.manual_seed(SEED)

# Initialize tokenizer (the model itself is created further below)
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

# SBERT from scratch

In [33]:
class SBERT_from_scratch(nn.Module):
    """Sentence encoder: transformer backbone, masked mean pooling, L2 normalization.

    The backbone is obtained with ``AutoModel.from_pretrained(model_name)`` and
    stored as ``self.bert``.
    """

    def __init__(self, model_name: str = MODEL_NAME):
        super().__init__()
        self.bert = AutoModel.from_pretrained(model_name)

    def _mean_pooling(self, token_embeddings: torch.Tensor, attention_mask: torch.Tensor) -> torch.Tensor:
        """
        Average the token embeddings of each sequence, ignoring padded positions.

        Args:
            token_embeddings: (batch_size, seq_len, hidden_size)
            attention_mask:   (batch_size, seq_len)
        Returns:
            Tensor of shape (batch_size, hidden_size)
        """
        # (batch_size, seq_len, 1) float weights: the mask broadcast over the hidden axis
        weights = attention_mask.unsqueeze(-1).float()
        # Sum of the mask-weighted token vectors along the sequence axis
        token_total = (token_embeddings * weights).sum(dim=1)
        # Per-sample sum of the mask; clamped so an all-zero mask cannot divide by zero
        valid_tokens = weights.sum(dim=1).clamp(min=1e-9)
        return token_total / valid_tokens

    def forward(self, input_ids: torch.Tensor, attention_mask: torch.Tensor) -> torch.Tensor:
        """Encode a batch of token ids into unit-length sentence embeddings."""
        # The backbone output exposes per-token states as .last_hidden_state (batch, seq, hidden)
        hidden_states = self.bert(input_ids=input_ids, attention_mask=attention_mask).last_hidden_state
        sentence_vectors = self._mean_pooling(hidden_states, attention_mask)
        # L2-normalize along the hidden dimension
        return F.normalize(sentence_vectors, p=2, dim=1)

# Instantiate the model exactly once and move it to the target device.
# (Building it a second time would load the BERT weights again for no benefit.)
model = SBERT_from_scratch().to(DEVICE)


def cosine_score(a, b):
    """Return the cosine similarity of `a` and `b` using F.cosine_similarity's defaults."""
    return F.cosine_similarity(a, b)

# Define STS dataset loader

In [34]:
class STSDataset(Dataset):
    """Dataset over one split of 'mteb/stsbenchmark-sts'.

    Each item is a ``(sentence1, sentence2, score / 5.0)`` tuple.
    """

    def __init__(self, split):
        """Load the requested split (e.g. 'train', 'validation', 'test')."""
        self.dataset = load_dataset('mteb/stsbenchmark-sts', split=split)

    def __len__(self):
        """Number of sentence pairs in the split."""
        return len(self.dataset)

    def __getitem__(self, idx):
        """Return the pair at `idx` together with its score divided by 5."""
        row = self.dataset[idx]
        scaled_score = row['score'] / 5.0
        return row['sentence1'], row['sentence2'], scaled_score


def collate_fn(batch):
    """Turn a list of (sentence1, sentence2, score) items into model-ready tensors.

    Returns:
        (input_ids_1, attention_mask_1, input_ids_2, attention_mask_2, scores),
        where each side is tokenized with padding/truncation to MAX_LEN and
        `scores` is a float32 tensor.
    """
    first_sentences, second_sentences, gold = zip(*batch)

    def _tokenize(sentences):
        # Same tokenizer settings for both sides of the pair
        return tokenizer(list(sentences), padding='max_length', truncation=True,
                         max_length=MAX_LEN, return_tensors='pt')

    left = _tokenize(first_sentences)
    right = _tokenize(second_sentences)
    labels = torch.tensor(gold, dtype=torch.float32)
    return (left['input_ids'], left['attention_mask'],
            right['input_ids'], right['attention_mask'],
            labels)

def _make_loader(split, shuffle):
    """Build a DataLoader over one STS split with the shared batch size and collate function."""
    return DataLoader(STSDataset(split), batch_size=BATCH_SIZE,
                      shuffle=shuffle, collate_fn=collate_fn)


# Only the training split is shuffled; validation and test keep dataset order.
train_loader = _make_loader('train', shuffle=True)
val_loader   = _make_loader('validation', shuffle=False)
test_loader  = _make_loader('test', shuffle=False)

# Training

In [36]:
optimizer = torch.optim.AdamW(model.parameters(), lr=LR, weight_decay=WEIGHT_DECAY)
criterion = nn.MSELoss()


def _to_device(*tensors):
    """Move each tensor to DEVICE and return them in the same order."""
    return tuple(t.to(DEVICE) for t in tensors)


for epoch in range(1, EPOCHS + 1):
    # ---- Training pass: regress cosine similarity onto the scaled gold score ----
    model.train()
    total_loss = 0.0
    for batch in tqdm(train_loader, desc=f"Epoch {epoch}"):
        ids1, mask1, ids2, mask2, scores = _to_device(*batch)
        optimizer.zero_grad()
        emb1 = model(ids1, mask1)
        emb2 = model(ids2, mask2)
        loss = criterion(F.cosine_similarity(emb1, emb2), scores)
        loss.backward()
        optimizer.step()
        # Weight by batch size so the epoch average stays exact for a short last batch
        total_loss += loss.item() * ids1.size(0)
    avg_loss = total_loss / len(train_loader.dataset)

    # ---- Validation pass: Spearman correlation between similarities and gold scores ----
    model.eval()
    all_preds, all_scores = [], []
    with torch.no_grad():
        for ids1, mask1, ids2, mask2, scores in val_loader:
            ids1, mask1, ids2, mask2 = _to_device(ids1, mask1, ids2, mask2)
            emb1 = model(ids1, mask1)
            emb2 = model(ids2, mask2)
            all_preds += F.cosine_similarity(emb1, emb2).cpu().tolist()
            all_scores += scores.tolist()
    val_rho = spearmanr(all_preds, all_scores).correlation
    print(f"Epoch {epoch}: Loss={avg_loss:.4f}, Val Spearman ρ={val_rho:.4f}")

Epoch 1:   0%|          | 0/180 [00:00<?, ?it/s]

Epoch 1: Loss=0.0179, Val Spearman ρ=0.8636


Epoch 2:   0%|          | 0/180 [00:00<?, ?it/s]

Epoch 2: Loss=0.0098, Val Spearman ρ=0.8679


Epoch 3:   0%|          | 0/180 [00:00<?, ?it/s]

Epoch 3: Loss=0.0065, Val Spearman ρ=0.8693


Epoch 4:   0%|          | 0/180 [00:00<?, ?it/s]

Epoch 4: Loss=0.0052, Val Spearman ρ=0.8700


Epoch 5:   0%|          | 0/180 [00:00<?, ?it/s]

Epoch 5: Loss=0.0045, Val Spearman ρ=0.8714


# Evaluation

In [37]:
# Score the held-out test split: cosine similarity per pair, then Spearman vs. gold scores.
model.eval()
all_preds, all_scores = [], []
with torch.no_grad():
    for batch in test_loader:
        # The last element of each batch is the gold score; it is not moved to DEVICE
        *inputs, scores = batch
        ids1, mask1, ids2, mask2 = (t.to(DEVICE) for t in inputs)
        emb_a = model(ids1, mask1)
        emb_b = model(ids2, mask2)
        all_preds += F.cosine_similarity(emb_a, emb_b).cpu().tolist()
        all_scores += scores.tolist()
test_rho = spearmanr(all_preds, all_scores).correlation
print(f"Test Spearman ρ = {test_rho:.4f}")

Test Spearman ρ = 0.8418


# Quick demo

In [40]:
def demo(s1, s2):
    """Print the cosine similarity the fine-tuned model assigns to two sentences.

    The model is put in eval mode for the duration of the call, so the result
    does not depend on which cell last touched `model`. Its previous
    train/eval mode is restored afterwards.

    Args:
        s1: first sentence (str).
        s2: second sentence (str).
    """
    def _embed(sentence):
        # Same tokenizer settings as the training collate function
        enc = tokenizer([sentence], padding='max_length', truncation=True,
                        max_length=MAX_LEN, return_tensors='pt')
        return model(enc['input_ids'].to(DEVICE), enc['attention_mask'].to(DEVICE))

    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            sim = F.cosine_similarity(_embed(s1), _embed(s2)).item()
    finally:
        # Leave the model in whatever mode the caller had it in
        model.train(was_training)
    print(f"Similarity({s1!r}, {s2!r}) = {sim:.4f}")

print("\n▶ Demo:")
demo("A man is playing a guitar.", "A person is strumming an instrument.")
demo("The cat sat on the mat.", "It is raining cats and dogs.")


▶ Demo:
Similarity('A man is playing a guitar.', 'A person is strumming an instrument.') = 0.4482
Similarity('The cat sat on the mat.', 'It is raining cats and dogs.') = 0.4061


# (Optional) Compare with pre-trained SBERT

In [39]:
!pip install --quiet datasets sentence-transformers scipy

import torch
from datasets import load_dataset
from scipy.stats import spearmanr
from sentence_transformers import SentenceTransformer
from tqdm.auto import tqdm

# 2. Hyperparameters
PRETRAINED_MODEL_NAME = "all-MiniLM-L6-v2"
BATCH_SIZE = 64

# 3. Load the STS-Benchmark splits
sts_test = load_dataset("mteb/stsbenchmark-sts", split="test")

# 4. Load the pre-trained SBERT model
device = "cuda" if torch.cuda.is_available() else "cpu"
sbert = SentenceTransformer(PRETRAINED_MODEL_NAME, device=device)

# 5. Prepare lists of sentences and normalized scores
sentences1 = sts_test["sentence1"]
sentences2 = sts_test["sentence2"]
gold_scores = [score / 5.0 for score in sts_test["score"]]  # scale to [0,1]

# 6. Encode in batches to get embeddings
embeddings1 = []
embeddings2 = []
for start in tqdm(range(0, len(sentences1), BATCH_SIZE), desc="Encoding"):
    end = start + BATCH_SIZE
    batch_s1 = sentences1[start:end]
    batch_s2 = sentences2[start:end]
    emb1 = sbert.encode(batch_s1, convert_to_tensor=True)
    emb2 = sbert.encode(batch_s2, convert_to_tensor=True)
    embeddings1.append(emb1)
    embeddings2.append(emb2)

embeddings1 = torch.cat(embeddings1)
embeddings2 = torch.cat(embeddings2)

# 7. Compute cosine similarities for each pair
cos_sims = torch.nn.functional.cosine_similarity(embeddings1, embeddings2)

# 8. Compute Spearman correlation
spearman_corr = spearmanr(cos_sims.cpu().numpy(), gold_scores).correlation
print(f"\nPre-trained SBERT ({PRETRAINED_MODEL_NAME}) Spearman ρ = {spearman_corr:.4f}")

# 9. Quick side-by-side demo
def demo_pretrained(s1, s2):
    """Print the cosine similarity the pre-trained SBERT model gives two sentences."""
    # Encode each sentence as a one-element batch, s1 first, then s2
    vec_a, vec_b = (sbert.encode([text], convert_to_tensor=True) for text in (s1, s2))
    score = torch.nn.functional.cosine_similarity(vec_a, vec_b).item()
    print(f"✔ {PRETRAINED_MODEL_NAME} → Similarity({s1!r}, {s2!r}) = {score:.4f}")

print("\n▶ Pre-trained SBERT Demo:")
for first, second in (
    ("A man is playing a guitar.", "A person is strumming an instrument."),
    ("The cat sat on the mat.", "It is raining cats and dogs."),
):
    demo_pretrained(first, second)


Encoding:   0%|          | 0/22 [00:00<?, ?it/s]


Pre-trained SBERT (all-MiniLM-L6-v2) Spearman ρ = 0.8203

▶ Pre-trained SBERT Demo:
✔ all-MiniLM-L6-v2 → Similarity('A man is playing a guitar.', 'A person is strumming an instrument.') = 0.5970
✔ all-MiniLM-L6-v2 → Similarity('The cat sat on the mat.', 'It is raining cats and dogs.') = 0.3078


In [None]:
import nbformat

# Patch the saved notebook: when metadata has a "widgets" entry with no "state"
# key, add an empty one, then write the notebook back to the same file.
NOTEBOOK_PATH = "Sbert_from_scratch.ipynb"

nb = nbformat.read(NOTEBOOK_PATH, as_version=4)

metadata = nb["metadata"]
if "widgets" in metadata:
    widgets_meta = metadata["widgets"]
    if "state" not in widgets_meta:
        widgets_meta["state"] = {}

nbformat.write(nb, NOTEBOOK_PATH)
