In [1]:
import torch

from sentence_transformers import SentenceTransformer, SentenceTransformerModelCardData, SentenceTransformerTrainingArguments, SentenceTransformerTrainer
from sentence_transformers.evaluation import InformationRetrievalEvaluator, SequentialEvaluator
from sentence_transformers.util import cos_sim
from sentence_transformers.losses import MatryoshkaLoss, MultipleNegativesRankingLoss
from sentence_transformers.training_args import BatchSamplers

from datasets import load_dataset, concatenate_datasets

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
dataset = load_dataset("AdamLucek/legal-rag-positives-synthetic", split="train")

In [3]:
import pandas as pd 

df = pd.DataFrame(dataset)
df.iloc[[1],:].values

array([[0, 0, 'Sh Synergy, LLC. v. United States', '2023-04-28',
        'United States Court of Federal Claims',
        '1 \n \nIn the United States Court of Federal Claims \nSH SYNERGY, LLC and \nVCH PARTNERS, LLC, \n                         \nPlaintiffs, \n \n \n                                    v. \n \n \nTHE UNITED STATES, \n \nDefendant, \nNos. 22-cv-1466, 22-cv-1468 \n(consolidated) \n \nFiled Under Seal: April 21, 2023 \n \nPublication: April 28, 20231',
        2, 'When was the case released for publication?',
        'Publication: April 28, 2023']], dtype=object)

In [4]:
df.columns

Index(['chunk_id', 'global_chunk_id', 'case_name', 'date_filed', 'court',
       'text', 'question_id', 'question', 'answer_location'],
      dtype='object')

In [5]:
dataset = dataset.rename_column("question", "anchor")
dataset = dataset.rename_column("text", "positive")
dataset = dataset.remove_columns(["chunk_id", "case_name", "date_filed", "court", "question_id", "answer_location"]) # keep global_chunk_id

# Add an id column to the dataset
dataset = dataset.add_column("id", range(len(dataset)))

In [6]:
df = pd.DataFrame(dataset)
df.iloc[[2],:].values

array([[0,
        '1 \n \nIn the United States Court of Federal Claims \nSH SYNERGY, LLC and \nVCH PARTNERS, LLC, \n                         \nPlaintiffs, \n \n \n                                    v. \n \n \nTHE UNITED STATES, \n \nDefendant, \nNos. 22-cv-1466, 22-cv-1468 \n(consolidated) \n \nFiled Under Seal: April 21, 2023 \n \nPublication: April 28, 20231',
        'What are the case numbers associated with this legal matter?',
        2]], dtype=object)

In [7]:
# Shuffle Dataset
dataset = dataset.shuffle()

# Split Dataset Into a 90/10 Train/Test split
dataset = dataset.train_test_split(test_size=0.1)

# Save Datasets to Disk
dataset["train"].to_json("train_dataset.json", orient="records")
dataset["test"].to_json("test_dataset.json", orient="records")

Creating json from Arrow format: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 6/6 [00:00<00:00, 197.12ba/s]
Creating json from Arrow format: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 1/1 [00:00<?, ?ba/s]


338495

In [8]:
# Hugging Face model ID
model_id = "nomic-ai/modernbert-embed-base"

# Loading via SentenceTransformer
model = SentenceTransformer(
    model_id, device="cuda" if torch.cuda.is_available() else "cpu")

In [9]:

# Load train and test datasets from their respective JSON files
# These contain pairs of questions (anchors) and text chunks (positives)
test_dataset = load_dataset("json", data_files="test_dataset.json", split="train")
train_dataset = load_dataset("json", data_files="train_dataset.json", split="train")
corpus_dataset = concatenate_datasets([train_dataset, test_dataset])


Generating train split: 647 examples [00:00, 40431.39 examples/s]
Generating train split: 5822 examples [00:00, 493477.44 examples/s]


In [10]:
corpus = dict(
    zip(corpus_dataset["id"], corpus_dataset["positive"])
)
queries = dict(
    zip(test_dataset["id"], test_dataset["anchor"])
)

for i,j in corpus.items():
    print(i, j)
    break
print("-----    ")

for i,j in queries.items():
    print(i, j)
    break

1985 from her; that other classmates (but not plaintiff) publicly exposed the allegations against 
defendant on social media; that, in July 2022, defendant filed a defamation lawsuit against 
several classmates (but not plaintiff); and that, since filing suit, defendant threatened to add 
plaintiff to his defamation suit.2 In support of the last allegation, plaintiff attached a letter from
-----    
2782 What is the sequence position of the discussion about the plaintiff's claims on the CIA's refusal to process certain FOIA requests?


In [11]:
relevant_docs = {}

for id_test, chunk_id_test in zip(test_dataset["id"], test_dataset["global_chunk_id"]):
    relevant_docs[id_test] = []

    for id_corpus, chunk_id_corpus in zip(
        corpus_dataset["id"], corpus_dataset["global_chunk_id"]
    ):
        if chunk_id_test == chunk_id_corpus:
            relevant_docs[id_test].append(id_corpus)


In [12]:
relevant_docs

{2782: [2783, 2781, 2782],
 2531: [2532, 2531],
 5359: [5358, 5356, 5359, 5357],
 1817: [1814, 1815, 1816, 1817],
 2273: [2272, 2274, 2275, 2273],
 6109: [6108, 6109, 6107],
 2812: [2810, 2811, 2812],
 6191: [6190, 6191],
 3762: [3763, 3761, 3762],
 859: [858, 857, 859],
 716: [715, 714, 716],
 1866: [1869, 1868, 1867, 1866],
 5103: [5103],
 5223: [5222, 5223],
 5306: [5308, 5305, 5307, 5306],
 3249: [3248, 3247, 3249],
 602: [599, 601, 600, 602],
 6107: [6108, 6109, 6107],
 3776: [3775, 3776],
 4052: [4050, 4051, 4052],
 2519: [2517, 2516, 2518, 2519],
 3658: [3657, 3656, 3659, 3658],
 766: [765, 767, 764, 766],
 3312: [3310, 3311, 3312],
 221: [219, 220, 222, 221],
 1297: [1298, 1299, 1297],
 1358: [1360, 1358, 1359],
 1467: [1469, 1470, 1468, 1467],
 1927: [1928, 1927],
 4711: [4712, 4713, 4714, 4711],
 5357: [5358, 5356, 5359, 5357],
 3682: [3681, 3682],
 2614: [2612, 2613, 2615, 2614],
 5192: [5192],
 77: [78, 77],
 288: [290, 291, 289, 288],
 4567: [4564, 4565, 4566, 4567],
 719:

In [13]:
test_dataset

Dataset({
    features: ['global_chunk_id', 'positive', 'anchor', 'id'],
    num_rows: 647
})

In [14]:
corpus_dataset

Dataset({
    features: ['global_chunk_id', 'positive', 'anchor', 'id'],
    num_rows: 6469
})

In [15]:
# Dimensions of interest
matryoshka_dimensions = [768, 512, 256, 128, 64] # Important: large to small

# Create empty list to hold evaluators
matryoshka_evaluators = []

# Create an evaluator for each above dimension
for dim in matryoshka_dimensions:
    # Define the evaluator
    ir_evaluator = InformationRetrievalEvaluator(
        queries=queries,
        corpus=corpus,
        relevant_docs=relevant_docs,
        name=f"dim_{dim}",
        truncate_dim=dim,  # Truncate the embeddings to the respective dimension
        score_functions={"cosine": cos_sim},
    )
    # Add to list
    matryoshka_evaluators.append(ir_evaluator)

# Create a sequential evaluator
# Able to run all our dimension specific InformationRetrievalEvaluators sequentially.
evaluator = SequentialEvaluator(matryoshka_evaluators)

In [17]:
matryoshka_evaluators

[<sentence_transformers.evaluation.InformationRetrievalEvaluator.InformationRetrievalEvaluator at 0x28b2ae91210>,
 <sentence_transformers.evaluation.InformationRetrievalEvaluator.InformationRetrievalEvaluator at 0x28b2adef7d0>,
 <sentence_transformers.evaluation.InformationRetrievalEvaluator.InformationRetrievalEvaluator at 0x28b2adeef10>,
 <sentence_transformers.evaluation.InformationRetrievalEvaluator.InformationRetrievalEvaluator at 0x28b2bd860d0>,
 <sentence_transformers.evaluation.InformationRetrievalEvaluator.InformationRetrievalEvaluator at 0x28b2ae9d0d0>]

In [18]:
# Evaluate the model
base_results = evaluator(model)

# Print header
print("\nBase Model Evaluation Results")
print("-" * 85)
print(f"{'Metric':15} {'768d':>12} {'512d':>12} {'256d':>12} {'128d':>12} {'64d':>12}")
print("-" * 85)

# List of metrics to display
metrics = [
    'ndcg@10',
    'mrr@10',
    'map@100',
    'accuracy@1',
    'accuracy@3',
    'accuracy@5',
    'accuracy@10',
    'precision@1',
    'precision@3',
    'precision@5',
    'precision@10',
    'recall@1',
    'recall@3',
    'recall@5',
    'recall@10'
]

# Print each metric
for metric in metrics:
    values = []
    for dim in matryoshka_dimensions:
        key = f"dim_{dim}_cosine_{metric}"
        values.append(base_results[key])

    # Highlight NDCG@10
    metric_name = f"=={metric}==" if metric == "ndcg@10" else metric
    print(f"{metric_name:15}", end="  ")
    for val in values:
        print(f"{val:12.4f}", end=" ")
    print()

# Print sequential score
print("-" * 85)
print(f"{'seq_score:'} {base_results['sequential_score']:1f}")


Base Model Evaluation Results
-------------------------------------------------------------------------------------
Metric                  768d         512d         256d         128d          64d
-------------------------------------------------------------------------------------
==ndcg@10==            0.4291       0.4233       0.4075       0.3679       0.2538 
mrr@10                 0.3758       0.3717       0.3553       0.3176       0.2151 
map@100                0.4205       0.4153       0.3987       0.3586       0.2531 
accuracy@1             0.3323       0.3261       0.3107       0.2751       0.1839 
accuracy@3             0.3740       0.3802       0.3586       0.3184       0.2117 
accuracy@5             0.4467       0.4513       0.4359       0.3833       0.2658 
accuracy@10            0.5348       0.5178       0.5085       0.4699       0.3338 
precision@1            0.3323       0.3261       0.3107       0.2751       0.1839 
precision@3            0.3148       0.3153       0.2

In [31]:
base_results 


{'dim_768_cosine_accuracy@1': 0.3323029366306028,
 'dim_768_cosine_accuracy@3': 0.3740340030911901,
 'dim_768_cosine_accuracy@5': 0.446676970633694,
 'dim_768_cosine_accuracy@10': 0.5347758887171561,
 'dim_768_cosine_precision@1': 0.3323029366306028,
 'dim_768_cosine_precision@3': 0.31478619268418345,
 'dim_768_cosine_precision@5': 0.24574961360123648,
 'dim_768_cosine_precision@10': 0.15919629057187018,
 'dim_768_cosine_recall@1': 0.12429160226687273,
 'dim_768_cosine_recall@3': 0.3205821741370427,
 'dim_768_cosine_recall@5': 0.4045595054095827,
 'dim_768_cosine_recall@10': 0.51854714064915,
 'dim_768_cosine_ndcg@10': 0.4291462710200361,
 'dim_768_cosine_mrr@10': 0.37576298422511695,
 'dim_768_cosine_map@100': 0.4205064770071176,
 'dim_512_cosine_accuracy@1': 0.3261205564142195,
 'dim_512_cosine_accuracy@3': 0.3802163833075734,
 'dim_512_cosine_accuracy@5': 0.45131375579598143,
 'dim_512_cosine_accuracy@10': 0.517774343122102,
 'dim_512_cosine_precision@1': 0.3261205564142195,
 'dim_5

In [19]:
# load model with SDPA for using Flash Attention 2
model = SentenceTransformer(
    model_id,
    model_kwargs={"attn_implementation": "sdpa"},
    model_card_data=SentenceTransformerModelCardData(
        language="en",
        license="apache-2.0",
        model_name="ModernBERT Embed base Legal Matryoshka",
    ),
)

In [21]:
# Initial Loss
base_loss = MultipleNegativesRankingLoss(model)

# Matryoshka Loss Wrapper
train_loss = MatryoshkaLoss(
    model, base_loss, matryoshka_dims=matryoshka_dimensions
)

In [22]:
args = SentenceTransformerTrainingArguments(
    output_dir="modernbert-embed-base-legal-matryoshka-lucek", # output directory and hugging face model ID
    num_train_epochs=4,                                        # number of epochs
    per_device_train_batch_size=32,                            # train batch size
    gradient_accumulation_steps=16,                            # for a global batch size of 512
    per_device_eval_batch_size=16,                             # evaluation batch size
    warmup_ratio=0.1,                                          # warmup ratio
    learning_rate=2e-5,                                        # learning rate, 2e-5 is a good value
    lr_scheduler_type="cosine",                                # use cosine learning rate scheduler
    optim="adamw_torch_fused",                                 # use fused adamw optimizer
    tf32=True,                                                 # use tf32 precision
    bf16=True,                                                 # use bf16 precision
    batch_sampler=BatchSamplers.NO_DUPLICATES,                 # MultipleNegativesRankingLoss benefits from no duplicate samples in a batch
    eval_strategy="epoch",                                     # evaluate after each epoch
    save_strategy="epoch",                                     # save after each epoch
    logging_steps=10,                                          # log every 10 steps
    save_total_limit=3,                                        # save only the last 3 models
    load_best_model_at_end=True,                               # load the best model when training ends
    metric_for_best_model="eval_dim_128_cosine_ndcg@10",       # Optimizing for the best ndcg@10 score for the 128 dimension
    report_to="none"                                           # Turning off training logging for now, input 'wandb' etc. if desired.
)

In [23]:
trainer = SentenceTransformerTrainer(
    model=model,
    args=args,
    train_dataset=train_dataset.select_columns(
        ["positive", "anchor"]
    ),  # training dataset
    loss=train_loss,
    evaluator=evaluator,
)

                                                                     

In [24]:
# Start training
trainer.train()

# Save the best model based on our eval_dim_128_cosine_ndcg@10 criteria
trainer.save_model()

Column 'anchor' is at index 1, whereas a column with this name is usually expected at index 0. Note that the column order can be important for some losses, e.g. MultipleNegativesRankingLoss will always consider the first column as the anchor and the second as the positive, regardless of the dataset column names. Consider renaming the columns to match the expected order, e.g.:
dataset = dataset.select_columns(['anchor', 'positive', 'negative'])


Epoch,Training Loss,Validation Loss,Dim 768 Cosine Accuracy@1,Dim 768 Cosine Accuracy@3,Dim 768 Cosine Accuracy@5,Dim 768 Cosine Accuracy@10,Dim 768 Cosine Precision@1,Dim 768 Cosine Precision@3,Dim 768 Cosine Precision@5,Dim 768 Cosine Precision@10,Dim 768 Cosine Recall@1,Dim 768 Cosine Recall@3,Dim 768 Cosine Recall@5,Dim 768 Cosine Recall@10,Dim 768 Cosine Ndcg@10,Dim 768 Cosine Mrr@10,Dim 768 Cosine Map@100,Dim 512 Cosine Accuracy@1,Dim 512 Cosine Accuracy@3,Dim 512 Cosine Accuracy@5,Dim 512 Cosine Accuracy@10,Dim 512 Cosine Precision@1,Dim 512 Cosine Precision@3,Dim 512 Cosine Precision@5,Dim 512 Cosine Precision@10,Dim 512 Cosine Recall@1,Dim 512 Cosine Recall@3,Dim 512 Cosine Recall@5,Dim 512 Cosine Recall@10,Dim 512 Cosine Ndcg@10,Dim 512 Cosine Mrr@10,Dim 512 Cosine Map@100,Dim 256 Cosine Accuracy@1,Dim 256 Cosine Accuracy@3,Dim 256 Cosine Accuracy@5,Dim 256 Cosine Accuracy@10,Dim 256 Cosine Precision@1,Dim 256 Cosine Precision@3,Dim 256 Cosine Precision@5,Dim 256 Cosine Precision@10,Dim 256 Cosine Recall@1,Dim 256 Cosine Recall@3,Dim 256 Cosine Recall@5,Dim 256 Cosine Recall@10,Dim 256 Cosine Ndcg@10,Dim 256 Cosine Mrr@10,Dim 256 Cosine Map@100,Dim 128 Cosine Accuracy@1,Dim 128 Cosine Accuracy@3,Dim 128 Cosine Accuracy@5,Dim 128 Cosine Accuracy@10,Dim 128 Cosine Precision@1,Dim 128 Cosine Precision@3,Dim 128 Cosine Precision@5,Dim 128 Cosine Precision@10,Dim 128 Cosine Recall@1,Dim 128 Cosine Recall@3,Dim 128 Cosine Recall@5,Dim 128 Cosine Recall@10,Dim 128 Cosine Ndcg@10,Dim 128 Cosine Mrr@10,Dim 128 Cosine Map@100,Dim 64 Cosine Accuracy@1,Dim 64 Cosine Accuracy@3,Dim 64 Cosine Accuracy@5,Dim 64 Cosine Accuracy@10,Dim 64 Cosine Precision@1,Dim 64 Cosine Precision@3,Dim 64 Cosine Precision@5,Dim 64 Cosine Precision@10,Dim 64 Cosine Recall@1,Dim 64 Cosine Recall@3,Dim 64 Cosine Recall@5,Dim 64 Cosine Recall@10,Dim 64 Cosine Ndcg@10,Dim 64 Cosine Mrr@10,Dim 64 Cosine Map@100,Sequential Score
1,5.7589,No log,0.514683,0.561051,0.638331,0.710974,0.514683,0.479134,0.365379,0.215611,0.192942,0.484802,0.597115,0.697965,0.612262,0.558969,0.599847,0.50541,0.537867,0.627512,0.695518,0.50541,0.463163,0.35425,0.210665,0.190881,0.469603,0.579727,0.682895,0.598293,0.546265,0.587033,0.469861,0.514683,0.595054,0.667697,0.469861,0.437403,0.336631,0.201855,0.177615,0.445131,0.55474,0.654688,0.568178,0.514278,0.555952,0.369397,0.42813,0.508501,0.587326,0.369397,0.355487,0.28408,0.175425,0.136914,0.359222,0.470505,0.569552,0.476387,0.419598,0.466336,0.282844,0.315301,0.386399,0.457496,0.282844,0.265842,0.212056,0.13524,0.108192,0.271767,0.353297,0.446548,0.367233,0.320502,0.362774,0.367233
2,2.6609,No log,0.53323,0.579598,0.659969,0.751159,0.53323,0.499742,0.379907,0.225348,0.197063,0.502318,0.621329,0.730938,0.63673,0.580816,0.620899,0.530139,0.574961,0.650696,0.72643,0.530139,0.49253,0.372798,0.219165,0.19848,0.499614,0.60948,0.712004,0.626613,0.574242,0.614476,0.506955,0.550232,0.633694,0.706337,0.506955,0.470891,0.358578,0.214529,0.189979,0.478104,0.585394,0.693328,0.605691,0.551,0.591829,0.42813,0.474498,0.561051,0.633694,0.42813,0.400309,0.314065,0.190881,0.161386,0.405719,0.514683,0.613859,0.528075,0.474082,0.519064,0.33694,0.370943,0.435858,0.513138,0.33694,0.314271,0.243277,0.150232,0.129315,0.322257,0.404946,0.49678,0.420035,0.375194,0.416844,0.420035
3,1.8268,No log,0.539413,0.585781,0.667697,0.751159,0.539413,0.503864,0.384235,0.22643,0.199897,0.505925,0.627254,0.734029,0.64186,0.586375,0.626893,0.536321,0.579598,0.661515,0.72643,0.536321,0.498712,0.380216,0.220093,0.199897,0.503478,0.621458,0.714967,0.631698,0.580273,0.620564,0.50541,0.55796,0.639876,0.704791,0.50541,0.475013,0.362287,0.214992,0.189722,0.48287,0.591061,0.696033,0.608507,0.552717,0.595465,0.443586,0.493045,0.578053,0.650696,0.443586,0.414735,0.325502,0.196291,0.167053,0.420659,0.5322,0.632277,0.545431,0.490539,0.534531,0.341577,0.370943,0.440495,0.511592,0.341577,0.315301,0.242968,0.151777,0.132406,0.324446,0.4034,0.497424,0.423429,0.378466,0.422078,0.423429
4,1.6484,No log,0.540958,0.585781,0.667697,0.752705,0.540958,0.504894,0.384544,0.226893,0.200283,0.506698,0.628156,0.735317,0.64309,0.587675,0.627867,0.534776,0.573416,0.659969,0.727975,0.534776,0.49459,0.378362,0.220402,0.199639,0.499742,0.619011,0.718058,0.631477,0.57859,0.618955,0.500773,0.55796,0.644513,0.704791,0.500773,0.471922,0.363833,0.214992,0.188305,0.479907,0.592993,0.696033,0.607305,0.55015,0.593768,0.440495,0.485317,0.581144,0.652241,0.440495,0.410098,0.323648,0.196291,0.166023,0.416151,0.530526,0.633694,0.544116,0.48799,0.532014,0.340031,0.37558,0.446677,0.511592,0.340031,0.316332,0.246059,0.152396,0.132148,0.326121,0.408423,0.498583,0.424785,0.378844,0.423376,0.424785


In [25]:
trainer.model.push_to_hub("modernbert-embed-base-legal-matryoshka-2")

model.safetensors: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 596M/596M [02:17<00:00, 4.34MB/s]   


'https://huggingface.co/Sri1999/modernbert-embed-base-legal-matryoshka-2/commit/15ef4243b9bfaac9905d54b4468de217bc1c3fe2'

In [26]:
fine_tuned_model = SentenceTransformer(
    args.output_dir, device="cuda" if torch.cuda.is_available() else "cpu"
)

# Evaluate the model
ft_results = evaluator(fine_tuned_model)

# Print header
print("Fine Tuned Model Evaluation Results")
print("-" * 85)
print(f"{'Metric':15} {'768d':>12} {'512d':>12} {'256d':>12} {'128d':>12} {'64d':>12}")
print("-" * 85)

# List of metrics to display
metrics = [
    'ndcg@10',
    'mrr@10',
    'map@100',
    'accuracy@1',
    'accuracy@3',
    'accuracy@5',
    'accuracy@10',
    'precision@1',
    'precision@3',
    'precision@5',
    'precision@10',
    'recall@1',
    'recall@3',
    'recall@5',
    'recall@10'
]

# Print each metric
for metric in metrics:
    values = []
    for dim in matryoshka_dimensions:
        key = f"dim_{dim}_cosine_{metric}"
        values.append(ft_results[key])

    # Highlight NDCG@10
    metric_name = f"=={metric}==" if metric == "ndcg@10" else metric
    print(f"{metric_name:15}", end="  ")
    for val in values:
        print(f"{val:12.4f}", end=" ")
    print()

# Print sequential score
print("-" * 85)
print(f"{'seq_score:'} {ft_results['sequential_score']:1f}")

Fine Tuned Model Evaluation Results
-------------------------------------------------------------------------------------
Metric                  768d         512d         256d         128d          64d
-------------------------------------------------------------------------------------
==ndcg@10==            0.6418       0.6292       0.6080       0.5446       0.4246 
mrr@10                 0.5866       0.5779       0.5501       0.4886       0.3791 
map@100                0.6274       0.6184       0.5935       0.5327       0.4229 
accuracy@1             0.5394       0.5348       0.5008       0.4420       0.3416 
accuracy@3             0.5873       0.5765       0.5564       0.4869       0.3740 
accuracy@5             0.6708       0.6569       0.6461       0.5750       0.4436 
accuracy@10            0.7496       0.7233       0.7063       0.6507       0.5116 
precision@1            0.5394       0.5348       0.5008       0.4420       0.3416 
precision@3            0.5044       0.4961     

In [27]:
from sentence_transformers import SentenceTransformer

# Download from the ðŸ¤— Hub
model = SentenceTransformer("AdamLucek/ModernBERT-embed-base-legal-MRL", truncate_dim=256)

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


In [28]:
sentences = [
    'Which organization is Carmody Gaba Daman associated with?',
    'Assistant General Counsel, U.S. General Services Administration, Washington, D.C.; Carmody Gaba Daman, Assistant General Counsel, U.S. General Services Administration, Washington, D.C.; Michael Blumenthal, Trial Attorney, U.S. Small Business Administration, Office of General Counsel, Washington, D.C. MEMORANDUM AND ORDER', # Corresponding Positive
    'certain Solicitation requirements violate federal procurement statutes and agency regulations governing procurements involving small business offerors. See generally SHS MJAR at 14; VCH MJAR at 14. Having considered the partiesâ€™ arguments, applicable law, and the Administrative Record, this Court GRANTS in part and DENIES in part Plaintiffsâ€™ Motions for Judgment on the', # Random Excerpt
]

embeddings = model.encode(sentences)
print(embeddings.shape)

BackendCompilerFailed: backend='inductor' raised:
RuntimeError: Cannot find a working triton installation. Either the package is not installed or it is too old. More information on installing Triton can be found at https://github.com/openai/triton

Set TORCH_LOGS="+dynamo" and TORCHDYNAMO_VERBOSE=1 for more information


You can suppress this exception and fall back to eager by setting:
    import torch._dynamo
    torch._dynamo.config.suppress_errors = True


In [None]:
# Get the similarity scores for the embeddings
similarities = model.similarity(embeddings, embeddings)
print(similarities[0])