In [1]:
!pip install -q -U git+https://github.com/huggingface/accelerate.git
!pip install datasets wandb huggingface_hub pandas
!pip install -U sentence-transformers
import pandas as pd

Collecting datasets
  Downloading datasets-2.20.0-py3-none-any.whl.metadata (19 kB)
Collecting pyarrow>=15.0.0 (from datasets)
  Downloading pyarrow-17.0.0-cp310-cp310-manylinux_2_28_x86_64.whl.metadata (3.3 kB)
Collecting pyarrow-hotfix (from datasets)
  Downloading pyarrow_hotfix-0.6-py3-none-any.whl.metadata (3.6 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting pandas (from datasets)
  Downloading pandas-2.2.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (19 kB)
Collecting requests>=2.32.2 (from datasets)
  Downloading requests-2.32.3-py3-none-any.whl.metadata (4.6 kB)
Collecting tqdm>=4.66.3 (from datasets)
  Downloading tqdm-4.66.4-py3-none-any.whl.metadata (57 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m57.6/57.6 kB[0m [31m4.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting xxhash (from datasets)
  Downloading xxhash-3.4.1-cp310-cp310-manylinux_2_17_x86_64.manyli

## Load dataset and encode labels

In [27]:
from datasets import load_dataset, Dataset, DatasetDict

train_dataset = load_dataset("Muhammad2003/routing-dataset", split="train")
train_dataset = train_dataset.rename_column("question", "sentence")

In [28]:
train_dataset.set_format('pandas')
df = train_dataset[:14271]
df['label']=df['label'].astype('category').cat.codes
train_dataset = Dataset.from_pandas(df)
print(train_dataset)

Dataset({
    features: ['sentence', 'label'],
    num_rows: 14271
})


## Load `BAAI/bge-small-en-v1.5` embedding model and `BatchAllTripletLoss`
Loss functions mainly depend on dataset structure, a detailed guide is given on [SentenceTransformers Documentation](https://www.sbert.net/docs/sentence_transformer/loss_overview.html)

In [3]:
from sentence_transformers import SentenceTransformer
from sentence_transformers.losses import BatchAllTripletLoss

# Load a model to train/finetune
model = SentenceTransformer("BAAI/bge-small-en-v1.5")

# Initialize triplet loss
loss = BatchAllTripletLoss(model)


## Intialize WandB

In [4]:
import wandb

#monitering login
wandb.login(key="")
run = wandb.init(project='embed', job_type="training", anonymous="allow")

[34m[1mwandb[0m: Currently logged in as: [33mmuhammadbin-2003[0m ([33mrethinkai[0m). Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


## Setup Training Arguments and start training

In [33]:
from sentence_transformers.training_args import SentenceTransformerTrainingArguments, BatchSamplers

args = SentenceTransformerTrainingArguments(
    # Required parameter:
    output_dir="models/router-embedding",
    # Optional training parameters:
    num_train_epochs=2,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    learning_rate=2e-5,
    warmup_ratio=0.1,
    fp16=False,  # Set to False if you get an error that your GPU can't run on FP16
    bf16=True,  # Set to True if you have a GPU that supports BF16
    batch_sampler=BatchSamplers.NO_DUPLICATES,  # losses that use "in-batch negatives" benefit from no duplicates
    # Optional tracking/debugging parameters:
    # eval_strategy="steps",
    # eval_steps=100,
    save_strategy="steps",
    save_steps=100,
    save_total_limit=2,
    logging_steps=100,
    run_name="bge-triplet",  # Will be used in W&B if `wandb` is installed
)

In [34]:
from sentence_transformers import SentenceTransformerTrainer
trainer = SentenceTransformerTrainer(
    model=model,
    args=args,
    train_dataset=train_dataset,
    loss=loss,
    evaluator=None
)

Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


In [35]:
trainer.train()

Step,Training Loss
100,4.4992
200,4.6653
300,4.4738
400,4.4336
500,4.3558
600,4.4306
700,4.5384
800,4.4286
900,4.3388
1000,4.3651


Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

TrainOutput(global_step=1784, training_loss=4.422706672428969, metrics={'train_runtime': 132.4336, 'train_samples_per_second': 215.519, 'train_steps_per_second': 13.471, 'total_flos': 0.0, 'train_loss': 4.422706672428969, 'epoch': 2.0})

## Save the model and push to Hub

In [36]:
model.save_pretrained("models/router-embedding")

In [37]:
model

SentenceTransformer(
  (0): Transformer({'max_seq_length': 512, 'do_lower_case': True}) with Transformer model: BertModel 
  (1): Pooling({'word_embedding_dimension': 384, 'pooling_mode_cls_token': True, 'pooling_mode_mean_tokens': False, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
  (2): Normalize()
)

In [40]:
!huggingface-cli login --token ''

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to /root/.cache/huggingface/token
Login successful


In [41]:
model.push_to_hub("Muhammad2003/router-embedding")

model.safetensors:   0%|          | 0.00/133M [00:00<?, ?B/s]

'https://huggingface.co/Muhammad2003/router-embedding/commit/d08bd7d066d5450ce0311151cbfea6bbea7f2230'

In [42]:
from sentence_transformers import SentenceTransformer

# Download from the 🤗 Hub
model = SentenceTransformer("Muhammad2003/router-embedding")
# Run inference
sentences = [
    'How can the person ensure they receive the necessary compensation for their work-related injury?',
    'Is there a law in Oklahoma that restricts the distance of a dispensary to a baseball field?',
    'Considering the complexities of property rights, due process, and public safety, what are the ethical and legal considerations surrounding citizens taking possession of unattended animals in public areas, and how do these actions intersect with constitutional rights and property laws?',
]
embeddings = model.encode(sentences)
print(embeddings.shape)
# [3, 384]

# Get the similarity scores for the embeddings
similarities = model.similarity(embeddings, embeddings)
print(similarities.shape)
# [3, 3]


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/201 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/13.8k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/706 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/133M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.24k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/712k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/695 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/296 [00:00<?, ?B/s]

(3, 384)
torch.Size([3, 3])


In [43]:
embeddings

array([[-0.01034122, -0.11917762,  0.04457978, ...,  0.09624283,
        -0.0318978 , -0.03279097],
       [-0.00842274, -0.11669022,  0.04836053, ...,  0.10686176,
        -0.02774303, -0.03649817],
       [ 0.00247131,  0.11810412, -0.03308529, ..., -0.11345071,
         0.03245148,  0.03161241]], dtype=float32)