In [2]:
import torch

from sentence_transformers import SentenceTransformer, SentenceTransformerModelCardData, SentenceTransformerTrainingArguments, SentenceTransformerTrainer
from sentence_transformers.evaluation import InformationRetrievalEvaluator, SequentialEvaluator
from sentence_transformers.util import cos_sim
from sentence_transformers.losses import MatryoshkaLoss, MultipleNegativesRankingLoss
from sentence_transformers.training_args import BatchSamplers

from datasets import load_dataset, concatenate_datasets

  from .autonotebook import tqdm as notebook_tqdm


In [11]:
dataset = load_dataset("AdamLucek/legal-rag-positives-synthetic", split="train")

In [12]:
import pandas as pd 

df = pd.DataFrame(dataset)
df.iloc[[1],:].values

array([[0, 0, 'Sh Synergy, LLC. v. United States', '2023-04-28',
        'United States Court of Federal Claims',
        '1 \n \nIn the United States Court of Federal Claims \nSH SYNERGY, LLC and \nVCH PARTNERS, LLC, \n                         \nPlaintiffs, \n \n \n                                    v. \n \n \nTHE UNITED STATES, \n \nDefendant, \nNos. 22-cv-1466, 22-cv-1468 \n(consolidated) \n \nFiled Under Seal: April 21, 2023 \n \nPublication: April 28, 20231',
        2, 'When was the case released for publication?',
        'Publication: April 28, 2023']], dtype=object)

In [13]:
df.columns

Index(['chunk_id', 'global_chunk_id', 'case_name', 'date_filed', 'court',
       'text', 'question_id', 'question', 'answer_location'],
      dtype='object')

In [14]:
dataset = dataset.rename_column("question", "anchor")
dataset = dataset.rename_column("text", "positive")
dataset = dataset.remove_columns(["chunk_id", "case_name", "date_filed", "court", "question_id", "answer_location"]) # keep global_chunk_id

# Add an id column to the dataset
dataset = dataset.add_column("id", range(len(dataset)))

In [15]:
df = pd.DataFrame(dataset)
df.iloc[[2],:].values

array([[0,
        '1 \n \nIn the United States Court of Federal Claims \nSH SYNERGY, LLC and \nVCH PARTNERS, LLC, \n                         \nPlaintiffs, \n \n \n                                    v. \n \n \nTHE UNITED STATES, \n \nDefendant, \nNos. 22-cv-1466, 22-cv-1468 \n(consolidated) \n \nFiled Under Seal: April 21, 2023 \n \nPublication: April 28, 20231',
        'What are the case numbers associated with this legal matter?',
        2]], dtype=object)

In [16]:
# Shuffle Dataset
dataset = dataset.shuffle()

# Split Dataset Into a 90/10 Train/Test split
dataset = dataset.train_test_split(test_size=0.1)

# Save Datasets to Disk
dataset["train"].to_json("train_dataset.json", orient="records")
dataset["test"].to_json("test_dataset.json", orient="records")

Creating json from Arrow format: 100%|██████████| 6/6 [00:00<00:00, 184.41ba/s]
Creating json from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 625.55ba/s]


337794

In [17]:
# Hugging Face model ID
model_id = "nomic-ai/modernbert-embed-base"

# Loading via SentenceTransformer
model = SentenceTransformer(
    model_id, device="cuda" if torch.cuda.is_available() else "cpu")

In [21]:

# Load train and test datasets from their respective JSON files
# These contain pairs of questions (anchors) and text chunks (positives)
test_dataset = load_dataset("json", data_files="test_dataset.json", split="train")
train_dataset = load_dataset("json", data_files="train_dataset.json", split="train")
corpus_dataset = concatenate_datasets([train_dataset, test_dataset])


In [36]:
corpus = dict(
    zip(corpus_dataset["id"], corpus_dataset["positive"])
)
queries = dict(
    zip(test_dataset["id"], test_dataset["anchor"])
)

for i,j in corpus.items():
    print(i, j)
    break
print("-----    ")

for i,j in queries.items():
    print(i, j)
    break

1128 and evaluate price at the IDIQ level.  Oral Arg. Tr. at 79:2–5 (Court: “Well, [GSA doesn’t] 
necessarily have to be locked into [certain contract types] if . . . the solicitation includes cost or 
price.”  Defendant’s Counsel: “[A]t the GWAC level, right.”).  In originally drafting the Polaris 
Solicitations, Defendant rejected this approach.  Defendant has suggested that such an approach
-----    
6097 What benefits does the author have according to the law?


In [57]:
relevant_docs = {}

for id_test, chunk_id_test in zip(test_dataset["id"], test_dataset["global_chunk_id"]):
    relevant_docs[id_test] = []

    for id_corpus, chunk_id_corpus in zip(
        corpus_dataset["id"], corpus_dataset["global_chunk_id"]
    ):
        if chunk_id_test == chunk_id_corpus:
            relevant_docs[id_test].append(id_corpus)


In [59]:
relevant_docs

{6097: [6099, 6098, 6097],
 533: [533],
 457: [457],
 3926: [3927, 3928, 3926],
 590: [591, 590],
 4896: [4897, 4896],
 5409: [5410, 5408, 5411, 5409],
 5845: [5847, 5846, 5845],
 4946: [4945, 4947, 4946],
 3062: [3061, 3063, 3062, 3060],
 675: [676, 674, 675, 673],
 4311: [4312, 4313, 4311],
 5280: [5277, 5280, 5278, 5279],
 3395: [3394, 3392, 3395, 3393],
 384: [386, 385, 384],
 6071: [6072, 6070, 6071],
 4789: [4787, 4788, 4789],
 1357: [1355, 1356, 1357],
 2725: [2726, 2723, 2724, 2725],
 2164: [2164],
 5326: [5327, 5328, 5326],
 1540: [1541, 1543, 1542, 1540],
 472: [470, 471, 472],
 288: [290, 289, 291, 288],
 4195: [4198, 4197, 4196, 4195],
 4207: [4206, 4207, 4208],
 3265: [3264, 3263, 3265],
 4106: [4106],
 966: [968, 965, 967, 966],
 3801: [3799, 3800, 3801],
 1028: [1027, 1025, 1026, 1028],
 4763: [4762, 4764, 4761, 4763],
 3349: [3348, 3349],
 1493: [1491, 1492, 1494, 1493],
 1351: [1352, 1351, 1354, 1353],
 2295: [2294, 2296, 2297, 2295],
 61: [62, 60, 59, 61],
 4036: [403

In [None]:
test_dataset

Dataset({
    features: ['global_chunk_id', 'positive', 'anchor', 'id'],
    num_rows: 647
})

In [None]:
corpus_dataset

Dataset({
    features: ['global_chunk_id', 'positive', 'anchor', 'id'],
    num_rows: 6469
})

In [60]:
# Dimensions of interest
matryoshka_dimensions = [768, 512, 256, 128, 64] # Important: large to small

# Create empty list to hold evaluators
matryoshka_evaluators = []

# Create an evaluator for each above dimension
for dim in matryoshka_dimensions:
    # Define the evaluator
    ir_evaluator = InformationRetrievalEvaluator(
        queries=queries,
        corpus=corpus,
        relevant_docs=relevant_docs,
        name=f"dim_{dim}",
        truncate_dim=dim,  # Truncate the embeddings to the respective dimension
        score_functions={"cosine": cos_sim},
    )
    # Add to list
    matryoshka_evaluators.append(ir_evaluator)

# Create a sequential evaluator
# Able to run all our dimension specific InformationRetrievalEvaluators sequentially.
evaluator = SequentialEvaluator(matryoshka_evaluators)

In [64]:
matryoshka_evaluators

[<sentence_transformers.evaluation.InformationRetrievalEvaluator.InformationRetrievalEvaluator at 0x22246828bd0>,
 <sentence_transformers.evaluation.InformationRetrievalEvaluator.InformationRetrievalEvaluator at 0x2226ef3d550>,
 <sentence_transformers.evaluation.InformationRetrievalEvaluator.InformationRetrievalEvaluator at 0x2226efd5990>,
 <sentence_transformers.evaluation.InformationRetrievalEvaluator.InformationRetrievalEvaluator at 0x2226ef51e10>,
 <sentence_transformers.evaluation.InformationRetrievalEvaluator.InformationRetrievalEvaluator at 0x2226ef53e50>]

In [62]:
# Evaluate the model
base_results = evaluator(model)

# Print header
print("\nBase Model Evaluation Results")
print("-" * 85)
print(f"{'Metric':15} {'768d':>12} {'512d':>12} {'256d':>12} {'128d':>12} {'64d':>12}")
print("-" * 85)

# List of metrics to display
metrics = [
    'ndcg@10',
    'mrr@10',
    'map@100',
    'accuracy@1',
    'accuracy@3',
    'accuracy@5',
    'accuracy@10',
    'precision@1',
    'precision@3',
    'precision@5',
    'precision@10',
    'recall@1',
    'recall@3',
    'recall@5',
    'recall@10'
]

# Print each metric
for metric in metrics:
    values = []
    for dim in matryoshka_dimensions:
        key = f"dim_{dim}_cosine_{metric}"
        values.append(base_results[key])

    # Highlight NDCG@10
    metric_name = f"=={metric}==" if metric == "ndcg@10" else metric
    print(f"{metric_name:15}", end="  ")
    for val in values:
        print(f"{val:12.4f}", end=" ")
    print()

# Print sequential score
print("-" * 85)
print(f"{'seq_score:'} {base_results['sequential_score']:1f}")

KeyboardInterrupt: 