## SBERT fine-tuning
- Stackexchange duplicate dataset https://huggingface.co/datasets/sentence-transformers/stackexchange-duplicates

#### Dependencies

In [None]:
! pip install \
datasets==3.0.1 \
huggingface-hub==0.25.2 \
sentence-transformers==3.2.0 \
transformers==4.45.2 \
faiss-cpu==1.7.4 \
loguru

Collecting datasets==3.0.1
  Downloading datasets-3.0.1-py3-none-any.whl.metadata (20 kB)
Collecting huggingface-hub==0.25.2
  Downloading huggingface_hub-0.25.2-py3-none-any.whl.metadata (13 kB)
Collecting sentence-transformers==3.2.0
  Downloading sentence_transformers-3.2.0-py3-none-any.whl.metadata (10 kB)
Collecting transformers==4.45.2
  Downloading transformers-4.45.2-py3-none-any.whl.metadata (44 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.4/44.4 kB[0m [31m1.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting faiss-cpu==1.7.4
  Downloading faiss_cpu-1.7.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (1.3 kB)
Collecting loguru
  Downloading loguru-0.7.3-py3-none-any.whl.metadata (22 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets==3.0.1)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets==3.0.1)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata 

#### Import data

In [None]:
from datasets import load_dataset

# Fetch the "post-post-pair" configuration of the StackExchange duplicates
# dataset from the Hugging Face Hub.
sbert_dataset = load_dataset(
    "sentence-transformers/stackexchange-duplicates",
    name="post-post-pair",
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/4.49k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/195M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/250519 [00:00<?, ? examples/s]

In [None]:
# Rename the raw pair columns to the anchor/positive names used by the rest of the notebook.
sbert_dataset = sbert_dataset.rename_columns({"post1": "anchor", "post2": "positive"})

In [None]:
# Reduce the dataset size: keep only the first 4000 pairs of the train split.
sbert_dataset['train'] = sbert_dataset['train'].select(range(4000))

In [None]:
# Display the dataset to check the renamed columns and the reduced row count.
sbert_dataset

DatasetDict({
    train: Dataset({
        features: ['anchor', 'positive'],
        num_rows: 4000
    })
})

#### Create corpus

In [None]:
# Build the retrieval corpus from every anchor and positive post, dropping exact
# duplicates. `dict.fromkeys` keeps first-seen order, so the corpus (and the
# FAISS row ids derived from it) is the same on every run; the iteration order
# of a `set` of strings changes between interpreter sessions.
corpus = list(dict.fromkeys(sbert_dataset['train']['anchor'] + sbert_dataset['train']['positive']))

In [None]:
# Number of unique posts in the corpus.
len(corpus)

7455

#### Models to test
- NovaSearch/stella_en_400M_v5
- jinaai/jina-embeddings-v3
- Alibaba-NLP/gte-large-en-v1.5 (TOO LARGE, NOT ENOUGH GPU MEMORY)
- Alibaba-NLP/gte-base-en-v1.5 (the model loaded in the cells below)

In [None]:
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np
from loguru import logger
import torch
from datasets import Dataset

# Load a pre-trained sentence transformer model.
# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {device}")

# NOTE(review): `trust_remote_code=True` runs Python files downloaded together
# with the model; consider pinning a revision so that code cannot change
# between runs.
model = SentenceTransformer("Alibaba-NLP/gte-base-en-v1.5",
                            trust_remote_code=True,
                            device=device
                            )

The cache for model files in Transformers v4.22.0 has been updated. Migrating your old cache. This is a one-time only operation. You can interrupt this and resume the migration later on by calling `transformers.utils.move_cache()`.


0it [00:00, ?it/s]

Using device: cuda


modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/72.3k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/54.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.35k [00:00<?, ?B/s]

configuration.py:   0%|          | 0.00/7.13k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/Alibaba-NLP/new-impl:
- configuration.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


modeling.py:   0%|          | 0.00/59.0k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/Alibaba-NLP/new-impl:
- modeling.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


model.safetensors:   0%|          | 0.00/547M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.38k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/712k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/695 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/297 [00:00<?, ?B/s]

#### Generate negatives

In [None]:
def generate_corpus_embeddings(model, corpus):
  """Embed `corpus` with `model` and return a FAISS inner-product index over it.

  Args:
    model: Encoder exposing `encode(texts, convert_to_numpy=..., batch_size=..., device=...)`.
    corpus: Texts to embed; row `i` of the returned index holds the vector of `corpus[i]`.

  Returns:
    A `faiss.IndexFlatIP` filled with the unit-length corpus embeddings.

  Note:
    Reads the notebook-level `device` to decide where encoding runs.
  """
  logger.info("Generating corpus embeddings...")
  encode_options = dict(convert_to_numpy=True,
                        batch_size=32,  # tune to the available GPU memory
                        device=device)
  raw_vectors = model.encode(corpus, **encode_options)

  # Scale every row to unit length so that inner product equals cosine similarity.
  row_norms = np.linalg.norm(raw_vectors, axis=1, keepdims=True)
  unit_vectors = raw_vectors / row_norms

  logger.info("Generating FAISS index...")
  embedding_dim = unit_vectors.shape[1]
  faiss_index = faiss.IndexFlatIP(embedding_dim)
  faiss_index.add(unit_vectors)
  return faiss_index

def get_negatives_from_corpus(model,
                              corpus,
                              dataset,
                              index,
                              n_negatives,
                              top_k=20,
                              anchor_col="anchor",
                              positive_col="positive"):
  """Mine hard negatives for every (anchor, positive) pair from a search index.

  Each anchor is embedded with `model`, scaled to unit length and searched
  against `index`. The best-ranked corpus entries that are neither the anchor
  itself nor its positive are kept as negatives.

  Args:
    model: Encoder exposing `encode(texts, convert_to_numpy=..., batch_size=..., device=...)`.
    corpus: Sequence of texts; `corpus[i]` must be the text stored at row `i` of `index`.
    dataset: Object where `dataset[anchor_col]` and `dataset[positive_col]`
      are equally long sequences of texts.
    index: Object exposing `search(queries, k)` that returns `(similarities, indices)`.
    n_negatives: Number of negatives to attach to each pair.
    top_k: Number of nearest corpus entries inspected per anchor.
    anchor_col: Name of the anchor column (also used as output key).
    positive_col: Name of the positive column (also used as output key).

  Returns:
    The result of `Dataset.from_list` over one row per pair. Rows carry the key
    `negative` when `n_negatives == 1`, otherwise `negative_1` ... `negative_<n>`.
    Slots for which no eligible candidate exists among the `top_k` hits are
    filled with `None` so that every row has the same keys.

  Note:
    Reads the notebook-level `device` to decide where encoding runs.
  """
  anchors = dataset[anchor_col]
  positives = dataset[positive_col]

  # Embed the anchors; they act as the search queries.
  logger.info("Generating query embeddings...")
  query_embeddings = model.encode(anchors,
                                  convert_to_numpy=True,
                                  batch_size=32,  # Adjust batch size based on GPU memory
                                  device=device  # Run on GPU
                                  )

  # Unit-length queries make the inner-product scores cosine similarities.
  query_embeddings = query_embeddings / np.linalg.norm(query_embeddings, axis=1, keepdims=True)

  logger.info("Searching FAISS index...")
  _similarities, indices = index.search(query_embeddings, top_k)

  # Fixed output schema: a single negative is stored under "negative",
  # several negatives under "negative_1", "negative_2", ...
  if n_negatives == 1:
    negative_keys = ["negative"]
  else:
    negative_keys = [f"negative_{k}" for k in range(1, n_negatives + 1)]

  datasets_with_negatives = []
  incomplete_rows = 0
  for row, (anchor, positive) in enumerate(zip(anchors, positives)):
    negatives = []
    # Walk the hits in rank order and never read past the returned top_k
    # columns (the previous loop did not bound the column index and raised
    # IndexError when too few eligible candidates were found).
    for corpus_id in indices[row]:
      if len(negatives) >= n_negatives:
        break
      if corpus_id < 0:
        # FAISS pads with -1 when it has fewer than top_k results; indexing
        # corpus[-1] would silently pick the last corpus entry.
        continue
      candidate = corpus[corpus_id]
      if candidate != anchor and candidate != positive:
        negatives.append(candidate)

    if len(negatives) < n_negatives:
      incomplete_rows += 1
      negatives.extend([None] * (n_negatives - len(negatives)))

    entry = {anchor_col: anchor,
             positive_col: positive}
    entry.update(zip(negative_keys, negatives))
    datasets_with_negatives.append(entry)

  if incomplete_rows:
    logger.warning(f"{incomplete_rows} pair(s) have fewer than {n_negatives} negatives within top_k={top_k}; missing slots are None")

  logger.info("Dataset with negatives generated")
  return Dataset.from_list(datasets_with_negatives)

In [None]:
# Embed the whole corpus and build the FAISS index that the negative mining searches.
index = generate_corpus_embeddings(model, corpus)

[32m2025-02-05 00:45:46.562[0m | [1mINFO    [0m | [36m__main__[0m:[36mgenerate_corpus_embeddings[0m:[36m4[0m - [1mGenerating corpus embeddings...[0m
[32m2025-02-05 00:48:57.062[0m | [1mINFO    [0m | [36m__main__[0m:[36mgenerate_corpus_embeddings[0m:[36m15[0m - [1mGenerating FAISS index...[0m


In [None]:
# Mine 2 negatives per pair from the 20 most similar corpus entries of each anchor.
# If generating embeddings for the entire dataset takes too long, set the dataset to only the eval dataset (valid or test)
queries = sbert_dataset['train']
dataset_with_negatives = get_negatives_from_corpus(model=model,
                                                   corpus=corpus,
                                                   dataset=queries,
                                                   index=index,
                                                   n_negatives=2,
                                                   top_k=20,
                                                   anchor_col="anchor",
                                                   positive_col="positive")

[32m2025-02-05 00:48:57.119[0m | [1mINFO    [0m | [36m__main__[0m:[36mget_negatives_from_corpus[0m:[36m34[0m - [1mGenerating query embeddings...[0m
[32m2025-02-05 00:50:46.394[0m | [1mINFO    [0m | [36m__main__[0m:[36mget_negatives_from_corpus[0m:[36m45[0m - [1mSearching FAISS index...[0m
[32m2025-02-05 00:50:47.670[0m | [1mINFO    [0m | [36m__main__[0m:[36mget_negatives_from_corpus[0m:[36m67[0m - [1mDataset with negatives generated[0m


In [None]:
# Display the mined dataset to check that the negative columns were added.
dataset_with_negatives

Dataset({
    features: ['anchor', 'positive', 'negative_1', 'negative_2'],
    num_rows: 4000
})

In [None]:
# Preview the first 10 rows as a pandas DataFrame to eyeball the mined negatives.
dataset_with_negatives.to_pandas().head(10)

Unnamed: 0,anchor,positive,negative_1,negative_2
0,Power Series Solution Consider the initial val...,Convergence of the Power Series for $xu''+\sin...,Find a closed form for the power series $f_p(x...,Prove that the iteration of $\sin(x)$ goes to ...
1,Question on the existence of finite open subse...,Is every open subset of $ \mathbb{R} $ uncount...,Countable closed sets There is a theorem that ...,Is the collection of finite subsets of $\mathb...
2,Hyperdrive vs Warp drive. Which is fastest? In...,Is a warp drive faster than a hyperdrive? What...,"In superluminal phase velocities, what is it t...",How can I maximise the amount of scrap I colle...
3,sed to redirect data to a file I am trying to ...,redirect to the same file as the source file p...,How to ensure that string interpolated into `s...,How can I achieve portability with sed -i (in-...
4,Trying to install Ubuntu 13.04 alongside Windo...,How can I repair grub? (How to get Ubuntu back...,Can't access to Ubuntu 13.04 after installing ...,Unable to install windows 7 over ubuntu 14.04....
5,Removing malware injecting into Google Chrome ...,"How can I remove malicious spyware, malware, a...",How to get rid of this peculiarly nasty pop up...,How to remove Offers4U Ads from Chrome on Mac ...
6,Completely brand new to GIS - Help picking tec...,How to Start Web Mapping? I want to start work...,Working with LiDAR data using other than Esri ...,Generic term for spatial data I'm building an ...
7,Labelling the x-axis \begin{tikzpicture} \...,How to prevent pgfplots from using the 10^n no...,"Plotting data seems ""noisy"" I am trying to plo...",Bar chart - Single colored bar For the bar cha...
8,Asymptotically unbiased estimator vs consisten...,What's the difference between asymptotic unbia...,"Equality in an integral estimate Let $u,v\in W...",What is Degrees of Freedom? I have gone throug...
9,Proving summation identities How would one go ...,General identity for a double summation theore...,"Let $n \in \mathbb{Z}^+$, prove the identity $...",Identity for convolution of central binomial c...


In [None]:
# Persist the mined rows to CSV; the split cell further down reloads this file.
dataset_with_negatives.to_pandas().to_csv("stackexchange_duplicates_with_negatives.csv", index=False)

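# Restart Colab here

The next section reloads the dataset from the CSV saved above, so it does not depend on variables from the earlier cells.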

#### Split dataset

In [None]:
from datasets import DatasetDict, load_dataset

# Reload the mined rows from disk; the CSV loader exposes them as a "train" split.
dataset_with_negatives = load_dataset(
    "csv",
    data_files="stackexchange_duplicates_with_negatives.csv",
)

# First hold out 30% of the rows, then cut that held-out part in half
# (fixed seeds keep both splits reproducible).
train_test_split = dataset_with_negatives["train"].train_test_split(test_size=0.3, seed=42)
test_valid_split = train_test_split["test"].train_test_split(test_size=0.5, seed=42)

# Final 70% / 15% / 15% layout used by the training and evaluation cells.
sbert_dataset = DatasetDict({
    "train": train_test_split["train"],
    "test": test_valid_split["train"],
    "validate": test_valid_split["test"],
})

In [None]:
# Display the splits to check their sizes and columns.
sbert_dataset

DatasetDict({
    train: Dataset({
        features: ['anchor', 'positive', 'negative_1', 'negative_2'],
        num_rows: 2800
    })
    test: Dataset({
        features: ['anchor', 'positive', 'negative_1', 'negative_2'],
        num_rows: 600
    })
    validate: Dataset({
        features: ['anchor', 'positive', 'negative_1', 'negative_2'],
        num_rows: 600
    })
})

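#### Training pipeline

Weights & Biases logging needs an API key. Provide it through the `WANDB_API_KEY` environment variable, a Colab secret, or `wandb login`, and never write it into the notebook. The key that was previously written in this heading has been removed and should be revoked.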

In [None]:
import os

import torch
from loguru import logger
from sentence_transformers import (
    SentenceTransformer,
    SentenceTransformerModelCardData,
    SentenceTransformerTrainer,
    SentenceTransformerTrainingArguments,
)
from sentence_transformers.evaluation import TripletEvaluator
from sentence_transformers.losses import CachedMultipleNegativesRankingLoss, MultipleNegativesRankingLoss
from sentence_transformers.training_args import BatchSamplers

# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
logger.info(f"Using device: {device}")

# Continue from the saved checkpoint when it exists on disk; on a fresh runtime
# (no checkpoint yet) start from the pre-trained base model instead of failing.
base_model_name = "Alibaba-NLP/gte-base-en-v1.5"  # alternative: "Alibaba-NLP/gte-large-en-v1.5"
savepath="models/gte-en-v1.5-stackexchange-2000/checkpoint-560"
model_source = savepath if os.path.isdir(savepath) else base_model_name
logger.info(f"Loading model weights from: {model_source}")
model = SentenceTransformer(model_source,
                            trust_remote_code=True,
                            device=device)

#Define loss function
batch_size = 5
# NOTE(review): CachedMultipleNegativesRankingLoss processes each batch in
# chunks of `mini_batch_size`, so `batch_size` can be raised on its own to get
# more in-batch negatives. Both are kept at 5 here to preserve the original run.
mini_batch_size = 5
#loss = MultipleNegativesRankingLoss(model) ## Too memory inefficient
loss = CachedMultipleNegativesRankingLoss(model, mini_batch_size=mini_batch_size)

#Training parameters
args = SentenceTransformerTrainingArguments(
    # Required parameter:
    output_dir="models/gte-en-v1.5-stackexchange-2000",
    # Optional training parameters:
    num_train_epochs=1,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    learning_rate=2e-5,
    warmup_ratio=0.1,
    fp16=(device == "cuda"),  # Mixed precision only on CUDA; set to False if your GPU can't run on FP16
    bf16=False,  # Set to True if you have a GPU that supports BF16
    batch_sampler=BatchSamplers.NO_DUPLICATES,  # MultipleNegativesRankingLoss benefits from no duplicate samples in a batch
    # Optional tracking/debugging parameters:
    eval_strategy="steps",
    eval_steps=10,
    save_strategy="steps",
    save_steps=100,
    save_total_limit=2,
    logging_steps=10,
    run_name="gte-en-v1.5-stackexchange-2000",  # Will be used in W&B if `wandb` is installed
)

# Create the evaluator and score the freshly loaded model before training.
# Only the first mined negative of each validation row is used.
logger.info("Evaluating base model on validation dataset...")
dev_evaluator = TripletEvaluator(
    anchors=sbert_dataset["validate"]["anchor"],
    positives=sbert_dataset["validate"]["positive"],
    negatives=sbert_dataset["validate"]["negative_1"],
    name="base-eval-val",
)
base_results = dev_evaluator(model)
logger.info(f"Base model results: {base_results}")

# Create trainer and train.
# NOTE(review): the training split is reduced to (anchor, positive), so the
# mined negatives are not used for training, while the evaluation split keeps
# them; confirm this asymmetry is intended.
logger.info("Training model...")
trainer = SentenceTransformerTrainer(
    model=model,
    args=args,
    train_dataset=sbert_dataset["train"].select_columns(["anchor", "positive"]),
    eval_dataset=sbert_dataset["validate"],
    loss=loss,
    evaluator=dev_evaluator,
)
trainer.train()

# Save model
logger.info("Saving model...")
model.save_pretrained("models/gte-en-v1.5-stackexchange-2000/final")

[32m2025-02-05 01:36:58.476[0m | [1mINFO    [0m | [36m__main__[0m:[36m<cell line: 0>[0m:[36m16[0m - [1mUsing device: cuda[0m
[32m2025-02-05 01:36:59.691[0m | [1mINFO    [0m | [36m__main__[0m:[36m<cell line: 0>[0m:[36m56[0m - [1mEvaluating base model on validation dataset...[0m
[32m2025-02-05 01:37:46.334[0m | [1mINFO    [0m | [36m__main__[0m:[36m<cell line: 0>[0m:[36m64[0m - [1mBase model results: {'base-eval-val_cosine_accuracy': 0.6833333333333333, 'base-eval-val_dot_accuracy': 0.3233333333333333, 'base-eval-val_manhattan_accuracy': 0.6783333333333333, 'base-eval-val_euclidean_accuracy': 0.68, 'base-eval-val_max_accuracy': 0.6833333333333333}[0m
[32m2025-02-05 01:37:46.336[0m | [1mINFO    [0m | [36m__main__[0m:[36m<cell line: 0>[0m:[36m67[0m - [1mTraining model...[0m
[34m[1mwandb[0m: Currently logged in as: [33mthefunnyonion[0m ([33mthefunnyonion-limindustries[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`

Step,Training Loss,Validation Loss,Base-eval-val Cosine Accuracy,Base-eval-val Dot Accuracy,Base-eval-val Manhattan Accuracy,Base-eval-val Euclidean Accuracy,Base-eval-val Max Accuracy
10,0.0062,1.050973,0.681667,0.321667,0.678333,0.678333,0.681667
20,0.0003,1.052991,0.678333,0.321667,0.678333,0.678333,0.678333
30,0.0114,1.054278,0.676667,0.318333,0.675,0.678333,0.678333
40,0.0002,1.054078,0.676667,0.318333,0.675,0.68,0.68
50,0.0001,1.051784,0.681667,0.318333,0.673333,0.681667,0.681667
60,0.0002,1.036165,0.68,0.318333,0.675,0.681667,0.681667
70,0.0004,1.029793,0.675,0.321667,0.673333,0.678333,0.678333
80,0.0003,1.0259,0.673333,0.326667,0.668333,0.675,0.675
90,0.0001,1.024392,0.676667,0.323333,0.671667,0.678333,0.678333
100,0.0001,1.023098,0.676667,0.32,0.671667,0.68,0.68


Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

[32m2025-02-05 02:17:21.745[0m | [1mINFO    [0m | [36m__main__[0m:[36m<cell line: 0>[0m:[36m79[0m - [1mSaving model...[0m


In [None]:
# (Optional) Evaluate the trained model on the test set.
# Only the first mined negative of each test row is used.
# NOTE(review): the "base-eval-test" prefix appears in the result keys even
# though `model` has been trained at this point.
test_evaluator = TripletEvaluator(
    anchors=sbert_dataset["test"]["anchor"],
    positives=sbert_dataset["test"]["positive"],
    negatives=sbert_dataset["test"]["negative_1"],
    name="base-eval-test",
)
test_evaluator(model)

{'base-eval-test_cosine_accuracy': 0.6733333333333333,
 'base-eval-test_dot_accuracy': 0.3283333333333333,
 'base-eval-test_manhattan_accuracy': 0.6716666666666666,
 'base-eval-test_euclidean_accuracy': 0.675,
 'base-eval-test_max_accuracy': 0.675}