In [2]:
import dspy
import pandas as pd
from datasets import load_dataset
from dotenv import load_dotenv

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Build the OpenAI-backed language model client (gpt-4o, completions capped at
# 1000 tokens) and register it in dspy's global settings.
# NOTE(review): load_dotenv is imported at the top but never called in the
# visible cells, so the API key presumably comes from the process environment — confirm.
lm = dspy.OpenAI(model="gpt-4o", max_tokens=1000)
dspy.settings.configure(lm=lm)

In [4]:
# Location of the shared-task NLI CSV, addressed through the hf:// scheme.
NLI_CSV_URI = "hf://datasets/darrow-ai/LegalLensNLI-SharedTask/NLI.csv"
df = pd.read_csv(NLI_CSV_URI)

In [5]:
# Preview the first rows to check which columns the CSV provides.
df.head()

Unnamed: 0,premise,hypothesis,legal_act,label,Unnamed: 4
0,DEFENDANT has reached a settlement in a class ...,Had to visit DEFENDANT a while back for some r...,privacy,Neutral,
1,A class action lawsuit has been certified agai...,"So, at 22, I was into this whole ""collect-and-...",consumer_protection,Entailed,
2,"DEFENDANT, an auto parts supplier, has agreed ...",As an employee of the aforementioned auto part...,consumer_protection,Contradict,
3,"DEFENDANT has agreed to pay $400,000 to settle...","Hey, got a call from DEFENDANT a while back, s...",privacy,Contradict,
4,DEFENDANT and other health benefit companies h...,"Just checked my mail, got a letter from DEFEND...",privacy,Neutral,


In [6]:
# Keep only the three columns used downstream (the two text fields and the
# gold label); every other column is dropped.
df = df.loc[:, ["premise", "hypothesis", "label"]]

In [7]:
# Display the frame after the column selection to confirm only premise, hypothesis and label remain.
df

Unnamed: 0,premise,hypothesis,label
0,DEFENDANT has reached a settlement in a class ...,Had to visit DEFENDANT a while back for some r...,Neutral
1,A class action lawsuit has been certified agai...,"So, at 22, I was into this whole ""collect-and-...",Entailed
2,"DEFENDANT, an auto parts supplier, has agreed ...",As an employee of the aforementioned auto part...,Contradict
3,"DEFENDANT has agreed to pay $400,000 to settle...","Hey, got a call from DEFENDANT a while back, s...",Contradict
4,DEFENDANT and other health benefit companies h...,"Just checked my mail, got a letter from DEFEND...",Neutral
...,...,...,...
307,DEFENDANT has reached a settlement in a breach...,Feeling a bit perplexed today. I've been a loy...,Entailed
308,"DEFENDANT, a seafood restaurant operator in Ca...",Had a fantastic seafood dinner at this place l...,Neutral
309,Consumers who received promotional text messag...,Hardly ever use my phone for anything other th...,Neutral
310,"DEFENDANT, a restaurant point-of-sale provider...","Upon my daily visits to the local diner, I fre...",Entailed


In [8]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows as the evaluation split; random_state is fixed so
# the same partition is produced on every run.
train_df, eval_df = train_test_split(
    df,
    test_size=0.3,
    random_state=42,
)

In [9]:
# Inspect the training split returned by train_test_split.
train_df

Unnamed: 0,premise,hypothesis,label
101,A settlement has been reached in a class actio...,"Having a blast with my computer, Wi-Fi's been ...",Contradict
193,DEFENDANT Aviation Services has agreed to pay ...,Been working at DEFENDANT Aviation Services fo...,Neutral
72,DEFENDANT-A and DEFENDANT-B Inc. have agreed t...,So I've been using this DEFENDANT-A software f...,Neutral
298,"DEFENDANT, an HR company that provides timekee...",Anyone else used those UKG time clocks at work...,Entailed
15,"DEFENDANT, a manufacturing company, has agreed...","Alright guys, remember that job I had at that ...",Entailed
...,...,...,...
188,"DEFENDANT, has agreed to pay $16 million to se...","Been using DEFENDANT for quite a while now, an...",Contradict
71,DEFENDANT has settled a class action lawsuit o...,Been working at this company for a while now a...,Contradict
106,DEFENDANT and its franchisee have agreed to es...,Despite the numerous phone calls I've received...,Contradict
270,DEFENDANT has agreed to pay $7.2 million to se...,"So, I've been getting a couple of calls from D...",Neutral


In [10]:
# Inspect the held-out evaluation split returned by train_test_split.
eval_df

Unnamed: 0,premise,hypothesis,label
228,DEFENDANT has agreed to a $5.25 million settle...,As a regular visitor to a certain company's fa...,Entailed
9,The DEFENDANT Text Message Class Action Settle...,Been receiving way too many texts from DEFENDA...,Entailed
57,DEFENDANT has agreed to pay $7.5 million to se...,Stumbled upon my former employer in the news t...,Contradict
60,"DEFENDANT, a hospital in Dixon, Illinois, has ...","So, there's this hospital in Dixon I went to a...",Neutral
25,"DEFENDANT, a company that provides ambulance a...",Recently started using the handprint clock-in ...,Neutral
...,...,...,...
304,A verdict has been reached against DEFENDANT f...,"It's rather interesting, I've been using DEFEN...",Neutral
19,"DEFENDANT, a home healthcare services company,...","Hey, folks! So, I've been using this home heal...",Neutral
147,DEFENDANT has agreed to a $12.75 million settl...,Feeling quite content with my employment situa...,Contradict
92,"DEFENDANT has agreed to pay $975,000 to settle...","Hey folks, I've been getting these calls from ...",Contradict


In [11]:
# Wrap every training row in a dspy.Example, marking premise and hypothesis as
# the input fields. Row unpacking relies on train_df holding exactly the
# columns (premise, hypothesis, label) in that order.
train_dataset = [
    dspy.Example(premise=prem, hypothesis=hyp, label=gold).with_inputs("premise", "hypothesis")
    for prem, hyp, gold in train_df.values
]

In [12]:
# Wrap every evaluation row in a dspy.Example, marking premise and hypothesis
# as the input fields. Row unpacking relies on eval_df holding exactly the
# columns (premise, hypothesis, label) in that order.
eval_dataset = [
    dspy.Example(premise=prem, hypothesis=hyp, label=gold).with_inputs("premise", "hypothesis")
    for prem, hyp, gold in eval_df.values
]

In [13]:
from typing import Literal
# DSPy signature for the three-way NLI task.
# The class docstring is not only documentation: its text is what appears as
# the predictor's `instructions` when the compiled program is printed at the
# end of this notebook, so treat any edit to it as a prompt change.
class NLI(dspy.Signature):
    """
    Please classify the relationship between a legal premise and a hypothesis into one of three categories: Entailed, Contradict, Neutral.
    """
    # Input fields: the two texts whose relationship is classified.
    premise: str = dspy.InputField()
    hypothesis: str = dspy.InputField()
    # Output field, restricted to the three label strings seen in the dataset's `label` column.
    label: Literal["Entailed", "Contradict", "Neutral"] = dspy.OutputField()

In [14]:
# Unoptimized baseline program: a typed chain-of-thought predictor over the NLI signature.
# NOTE(review): the sample prediction shown in this notebook has `reasoning` containing
# only a JSON-fenced label, so the reasoning step may not be yielding real rationale — confirm.
predictor = dspy.TypedChainOfThought(NLI)

In [15]:
# Smoke-test the unoptimized predictor on the first training example.
first_example = train_dataset[0]
predictor(premise=first_example.premise, hypothesis=first_example.hypothesis)

Prediction(
    reasoning='```json\n{\n  "value": "Neutral"\n}\n```',
    label='Neutral'
)

In [16]:
def label_match(example, pred, trace=None):
    """Exact-match metric: report whether the predicted label equals the gold label.

    Args:
        example: object exposing the gold label as ``.label``.
        pred: object exposing the predicted label as ``.label``.
        trace: accepted for compatibility with the caller; not used here.

    Returns:
        bool: ``True`` when the two labels compare equal, ``False`` otherwise.
    """
    return bool(example.label == pred.label)

In [17]:
from dspy.teleprompt import BootstrapFewShotWithRandomSearch

# Compile the baseline predictor with a random search over bootstrapped
# few-shot demonstration sets, scoring each candidate with label_match.
# NOTE(review): no separate validation set is passed, and the log scores each
# candidate on 218 examples (the bootstrapping pass over the trainset shows the
# same total), so candidate selection appears to use the training data itself — confirm.
optimizer = BootstrapFewShotWithRandomSearch(metric=label_match)
optimized_predictor = optimizer.compile(
    predictor,
    trainset=train_dataset,
)

Going to sample between 1 and 4 traces per predictor.
Will attempt to bootstrap 16 candidate sets.


Average Metric: 5 / 8  (62.5):   3%|▎         | 7/218 [00:00<00:00, 471.22it/s] ERROR:dspy.evaluate.evaluate:[2m2024-08-28T23:48:34.010722Z[0m [[31m[1merror    [0m] [1mError for example in dev set: 		 ('Too many retries trying to get the correct output format. Try simplifying the requirements.', {'label': "ValueError('json output should start and end with { and }')"})[0m [[0m[1m[34mdspy.evaluate.evaluate[0m][0m [36mfilename[0m=[35mevaluate.py[0m [36mlineno[0m=[35m180[0m
Average Metric: 106.0 / 218  (48.6): 100%|██████████| 218/218 [00:00<00:00, 262.16it/s]


Score: 48.62 for set: [0]
New best sscore: 48.62 for seed -3
Scores so far: [48.62]
Best score: 48.62


Average Metric: 164 / 218  (75.2): 100%|██████████| 218/218 [00:00<00:00, 240.82it/s]


Score: 75.23 for set: [16]
New best sscore: 75.23 for seed -2
Scores so far: [48.62, 75.23]
Best score: 75.23


  3%|▎         | 6/218 [00:00<00:00, 339.95it/s]


Bootstrapped 4 full traces after 7 examples in round 0.


Average Metric: 176 / 218  (80.7): 100%|██████████| 218/218 [00:00<00:00, 259.42it/s]


Score: 80.73 for set: [16]
New best sscore: 80.73 for seed -1
Scores so far: [48.62, 75.23, 80.73]
Best score: 80.73
Average of max per entry across top 1 scores: 0.8073394495412844
Average of max per entry across top 2 scores: 0.8394495412844036
Average of max per entry across top 3 scores: 0.9403669724770642
Average of max per entry across top 5 scores: 0.9403669724770642
Average of max per entry across top 8 scores: 0.9403669724770642
Average of max per entry across top 9999 scores: 0.9403669724770642


  2%|▏         | 5/218 [00:00<00:00, 224.02it/s]


Bootstrapped 4 full traces after 6 examples in round 0.


Average Metric: 167 / 218  (76.6): 100%|██████████| 218/218 [00:00<00:00, 233.77it/s]


Score: 76.61 for set: [16]
Scores so far: [48.62, 75.23, 80.73, 76.61]
Best score: 80.73
Average of max per entry across top 1 scores: 0.8073394495412844
Average of max per entry across top 2 scores: 0.8440366972477065
Average of max per entry across top 3 scores: 0.8532110091743119
Average of max per entry across top 5 scores: 0.9495412844036697
Average of max per entry across top 8 scores: 0.9495412844036697
Average of max per entry across top 9999 scores: 0.9495412844036697


  1%|          | 2/218 [00:00<00:02, 77.31it/s]


Bootstrapped 2 full traces after 3 examples in round 0.


Average Metric: 153 / 218  (70.2): 100%|██████████| 218/218 [00:00<00:00, 228.03it/s]


Score: 70.18 for set: [16]
Scores so far: [48.62, 75.23, 80.73, 76.61, 70.18]
Best score: 80.73
Average of max per entry across top 1 scores: 0.8073394495412844
Average of max per entry across top 2 scores: 0.8440366972477065
Average of max per entry across top 3 scores: 0.8532110091743119
Average of max per entry across top 5 scores: 0.9495412844036697
Average of max per entry across top 8 scores: 0.9495412844036697
Average of max per entry across top 9999 scores: 0.9495412844036697


  1%|▏         | 3/218 [00:00<00:01, 169.86it/s]


Bootstrapped 1 full traces after 4 examples in round 0.


Average Metric: 165 / 218  (75.7): 100%|██████████| 218/218 [00:00<00:00, 253.26it/s]


Score: 75.69 for set: [16]
Scores so far: [48.62, 75.23, 80.73, 76.61, 70.18, 75.69]
Best score: 80.73
Average of max per entry across top 1 scores: 0.8073394495412844
Average of max per entry across top 2 scores: 0.8440366972477065
Average of max per entry across top 3 scores: 0.8623853211009175
Average of max per entry across top 5 scores: 0.8715596330275229
Average of max per entry across top 8 scores: 0.963302752293578
Average of max per entry across top 9999 scores: 0.963302752293578


  1%|▏         | 3/218 [00:00<00:01, 205.97it/s]


Bootstrapped 2 full traces after 4 examples in round 0.


Average Metric: 158 / 218  (72.5): 100%|██████████| 218/218 [00:01<00:00, 189.45it/s]


Score: 72.48 for set: [16]
Scores so far: [48.62, 75.23, 80.73, 76.61, 70.18, 75.69, 72.48]
Best score: 80.73
Average of max per entry across top 1 scores: 0.8073394495412844
Average of max per entry across top 2 scores: 0.8440366972477065
Average of max per entry across top 3 scores: 0.8623853211009175
Average of max per entry across top 5 scores: 0.8807339449541285
Average of max per entry across top 8 scores: 0.9724770642201835
Average of max per entry across top 9999 scores: 0.9724770642201835


  1%|          | 2/218 [00:00<00:00, 219.11it/s]


Bootstrapped 2 full traces after 3 examples in round 0.


Average Metric: 171 / 218  (78.4): 100%|██████████| 218/218 [00:00<00:00, 253.55it/s]


Score: 78.44 for set: [16]
Scores so far: [48.62, 75.23, 80.73, 76.61, 70.18, 75.69, 72.48, 78.44]
Best score: 80.73
Average of max per entry across top 1 scores: 0.8073394495412844
Average of max per entry across top 2 scores: 0.8577981651376146
Average of max per entry across top 3 scores: 0.8715596330275229
Average of max per entry across top 5 scores: 0.8853211009174312
Average of max per entry across top 8 scores: 0.9724770642201835
Average of max per entry across top 9999 scores: 0.9724770642201835


  2%|▏         | 4/218 [00:00<00:01, 209.95it/s]


Bootstrapped 3 full traces after 5 examples in round 0.


Average Metric: 164 / 218  (75.2): 100%|██████████| 218/218 [00:01<00:00, 208.89it/s]


Score: 75.23 for set: [16]
Scores so far: [48.62, 75.23, 80.73, 76.61, 70.18, 75.69, 72.48, 78.44, 75.23]
Best score: 80.73
Average of max per entry across top 1 scores: 0.8073394495412844
Average of max per entry across top 2 scores: 0.8577981651376146
Average of max per entry across top 3 scores: 0.8715596330275229
Average of max per entry across top 5 scores: 0.8853211009174312
Average of max per entry across top 8 scores: 0.8944954128440367
Average of max per entry across top 9999 scores: 0.9770642201834863


  1%|▏         | 3/218 [00:00<00:00, 216.88it/s]


Bootstrapped 1 full traces after 4 examples in round 0.


Average Metric: 134 / 218  (61.5): 100%|██████████| 218/218 [00:00<00:00, 252.25it/s]


Score: 61.47 for set: [16]
Scores so far: [48.62, 75.23, 80.73, 76.61, 70.18, 75.69, 72.48, 78.44, 75.23, 61.47]
Best score: 80.73
Average of max per entry across top 1 scores: 0.8073394495412844
Average of max per entry across top 2 scores: 0.8577981651376146
Average of max per entry across top 3 scores: 0.8715596330275229
Average of max per entry across top 5 scores: 0.8853211009174312
Average of max per entry across top 8 scores: 0.8944954128440367
Average of max per entry across top 9999 scores: 0.9770642201834863


  1%|▏         | 3/218 [00:00<00:01, 192.39it/s]


Bootstrapped 3 full traces after 4 examples in round 0.


Average Metric: 158 / 218  (72.5): 100%|██████████| 218/218 [00:00<00:00, 256.94it/s]


Score: 72.48 for set: [16]
Scores so far: [48.62, 75.23, 80.73, 76.61, 70.18, 75.69, 72.48, 78.44, 75.23, 61.47, 72.48]
Best score: 80.73
Average of max per entry across top 1 scores: 0.8073394495412844
Average of max per entry across top 2 scores: 0.8577981651376146
Average of max per entry across top 3 scores: 0.8715596330275229
Average of max per entry across top 5 scores: 0.8853211009174312
Average of max per entry across top 8 scores: 0.8990825688073395
Average of max per entry across top 9999 scores: 0.9770642201834863


  1%|          | 2/218 [00:00<00:01, 189.63it/s]


Bootstrapped 2 full traces after 3 examples in round 0.


Average Metric: 154 / 218  (70.6): 100%|██████████| 218/218 [00:01<00:00, 211.43it/s]


Score: 70.64 for set: [16]
Scores so far: [48.62, 75.23, 80.73, 76.61, 70.18, 75.69, 72.48, 78.44, 75.23, 61.47, 72.48, 70.64]
Best score: 80.73
Average of max per entry across top 1 scores: 0.8073394495412844
Average of max per entry across top 2 scores: 0.8577981651376146
Average of max per entry across top 3 scores: 0.8715596330275229
Average of max per entry across top 5 scores: 0.8853211009174312
Average of max per entry across top 8 scores: 0.8990825688073395
Average of max per entry across top 9999 scores: 0.9770642201834863


  3%|▎         | 7/218 [00:00<00:00, 225.18it/s]


Bootstrapped 4 full traces after 8 examples in round 0.


Average Metric: 159 / 218  (72.9): 100%|██████████| 218/218 [00:00<00:00, 256.30it/s]


Score: 72.94 for set: [16]
Scores so far: [48.62, 75.23, 80.73, 76.61, 70.18, 75.69, 72.48, 78.44, 75.23, 61.47, 72.48, 70.64, 72.94]
Best score: 80.73
Average of max per entry across top 1 scores: 0.8073394495412844
Average of max per entry across top 2 scores: 0.8577981651376146
Average of max per entry across top 3 scores: 0.8715596330275229
Average of max per entry across top 5 scores: 0.8853211009174312
Average of max per entry across top 8 scores: 0.8990825688073395
Average of max per entry across top 9999 scores: 0.981651376146789


  0%|          | 1/218 [00:00<00:01, 196.38it/s]


Bootstrapped 1 full traces after 2 examples in round 0.


Average Metric: 158 / 218  (72.5): 100%|██████████| 218/218 [00:00<00:00, 250.75it/s]


Score: 72.48 for set: [16]
Scores so far: [48.62, 75.23, 80.73, 76.61, 70.18, 75.69, 72.48, 78.44, 75.23, 61.47, 72.48, 70.64, 72.94, 72.48]
Best score: 80.73
Average of max per entry across top 1 scores: 0.8073394495412844
Average of max per entry across top 2 scores: 0.8577981651376146
Average of max per entry across top 3 scores: 0.8715596330275229
Average of max per entry across top 5 scores: 0.8853211009174312
Average of max per entry across top 8 scores: 0.8990825688073395
Average of max per entry across top 9999 scores: 0.981651376146789


  2%|▏         | 4/218 [00:00<00:01, 142.34it/s]


Bootstrapped 4 full traces after 5 examples in round 0.


Average Metric: 150 / 218  (68.8): 100%|██████████| 218/218 [00:01<00:00, 193.23it/s]


Score: 68.81 for set: [16]
Scores so far: [48.62, 75.23, 80.73, 76.61, 70.18, 75.69, 72.48, 78.44, 75.23, 61.47, 72.48, 70.64, 72.94, 72.48, 68.81]
Best score: 80.73
Average of max per entry across top 1 scores: 0.8073394495412844
Average of max per entry across top 2 scores: 0.8577981651376146
Average of max per entry across top 3 scores: 0.8715596330275229
Average of max per entry across top 5 scores: 0.8853211009174312
Average of max per entry across top 8 scores: 0.8990825688073395
Average of max per entry across top 9999 scores: 0.981651376146789


  3%|▎         | 6/218 [00:00<00:00, 220.97it/s]


Bootstrapped 4 full traces after 7 examples in round 0.


Average Metric: 159 / 218  (72.9): 100%|██████████| 218/218 [00:00<00:00, 258.12it/s]


Score: 72.94 for set: [16]
Scores so far: [48.62, 75.23, 80.73, 76.61, 70.18, 75.69, 72.48, 78.44, 75.23, 61.47, 72.48, 70.64, 72.94, 72.48, 68.81, 72.94]
Best score: 80.73
Average of max per entry across top 1 scores: 0.8073394495412844
Average of max per entry across top 2 scores: 0.8577981651376146
Average of max per entry across top 3 scores: 0.8715596330275229
Average of max per entry across top 5 scores: 0.8853211009174312
Average of max per entry across top 8 scores: 0.9036697247706422
Average of max per entry across top 9999 scores: 0.981651376146789


  2%|▏         | 4/218 [00:00<00:01, 207.67it/s]


Bootstrapped 3 full traces after 5 examples in round 0.


Average Metric: 153 / 218  (70.2): 100%|██████████| 218/218 [00:00<00:00, 255.32it/s]


Score: 70.18 for set: [16]
Scores so far: [48.62, 75.23, 80.73, 76.61, 70.18, 75.69, 72.48, 78.44, 75.23, 61.47, 72.48, 70.64, 72.94, 72.48, 68.81, 72.94, 70.18]
Best score: 80.73
Average of max per entry across top 1 scores: 0.8073394495412844
Average of max per entry across top 2 scores: 0.8577981651376146
Average of max per entry across top 3 scores: 0.8715596330275229
Average of max per entry across top 5 scores: 0.8853211009174312
Average of max per entry across top 8 scores: 0.9036697247706422
Average of max per entry across top 9999 scores: 0.981651376146789


  0%|          | 1/218 [00:00<00:01, 208.04it/s]


Bootstrapped 1 full traces after 2 examples in round 0.


Average Metric: 158 / 218  (72.5): 100%|██████████| 218/218 [00:00<00:00, 246.00it/s]


Score: 72.48 for set: [16]
Scores so far: [48.62, 75.23, 80.73, 76.61, 70.18, 75.69, 72.48, 78.44, 75.23, 61.47, 72.48, 70.64, 72.94, 72.48, 68.81, 72.94, 70.18, 72.48]
Best score: 80.73
Average of max per entry across top 1 scores: 0.8073394495412844
Average of max per entry across top 2 scores: 0.8577981651376146
Average of max per entry across top 3 scores: 0.8715596330275229
Average of max per entry across top 5 scores: 0.8853211009174312
Average of max per entry across top 8 scores: 0.9036697247706422
Average of max per entry across top 9999 scores: 0.981651376146789


  1%|▏         | 3/218 [00:00<00:01, 173.38it/s]


Bootstrapped 2 full traces after 4 examples in round 0.


Average Metric: 163 / 218  (74.8): 100%|██████████| 218/218 [00:01<00:00, 194.80it/s]

Score: 74.77 for set: [16]
Scores so far: [48.62, 75.23, 80.73, 76.61, 70.18, 75.69, 72.48, 78.44, 75.23, 61.47, 72.48, 70.64, 72.94, 72.48, 68.81, 72.94, 70.18, 72.48, 74.77]
Best score: 80.73
Average of max per entry across top 1 scores: 0.8073394495412844
Average of max per entry across top 2 scores: 0.8577981651376146
Average of max per entry across top 3 scores: 0.8715596330275229
Average of max per entry across top 5 scores: 0.8853211009174312
Average of max per entry across top 8 scores: 0.9036697247706422
Average of max per entry across top 9999 scores: 0.9862385321100917
19 candidate programs found.





In [18]:
from dspy.evaluate import Evaluate

# Score the optimized program on the held-out examples with label_match.
# return_outputs=True additionally hands back the per-example outputs, which
# the later cells use to compute F1 and accuracy.
evaluator = Evaluate(
    devset=eval_dataset,
    display_progress=True,
    display_table=True,
)
metric, results = evaluator(
    optimized_predictor,
    metric=label_match,
    return_outputs=True,
)

Average Metric: 73 / 94  (77.7): 100%|██████████| 94/94 [00:00<00:00, 205.22it/s] 


Unnamed: 0,premise,hypothesis,example_label,reasoning,pred_label,label_match
0,"DEFENDANT has agreed to a $5.25 million settlement to benefit individuals who were required to provide fingerprints to access the company's facilities between March 29,...","As a regular visitor to a certain company's facility for several years, I always found their security protocol intriguing. They had this unique fingerprint access...",Entailed,"```json {  ""value"": ""Entailed"" } ```",Entailed,✔️ [True]
1,The DEFENDANT Text Message Class Action Settlement has been granted final approval by the court. Consumers who received unwanted text messages from DEFENDANT between July...,"Been receiving way too many texts from DEFENDANT lately, thought I was on the Do Not Call list? Love their clothes, but not the spam!",Entailed,"```json {  ""value"": ""Entailed"" } ```",Entailed,✔️ [True]
2,DEFENDANT has agreed to pay $7.5 million to settle allegations of wage-and-hour violations brought by current and former hourly healthcare workers. The settlement benefits individuals...,"Stumbled upon my former employer in the news today, quite an interesting read. As one of the hourly healthcare workers, I've got to say, they...",Contradict,"```json {  ""value"": ""Contradict"" } ```",Contradict,✔️ [True]
3,"DEFENDANT, a hospital in Dixon, Illinois, has agreed to pay $380,000 as part of a settlement to resolve allegations of a data breach that occurred...","So, there's this hospital in Dixon I went to a while back. Pretty standard stuff, nothing out of the ordinary. They've got this system that...",Neutral,"```json {  ""value"": ""Neutral"" } ```",Neutral,✔️ [True]
4,"DEFENDANT, a company that provides ambulance and emergency transportation services in the western suburbs of Chicago, has agreed to pay over $300,000 to settle a...","Recently started using the handprint clock-in system at work, a truly advanced method for timekeeping, love how technology has made life easier!",Neutral,"```json {  ""value"": ""Neutral"" } ```",Neutral,✔️ [True]
5,DEFENDANT has agreed to settle a class action lawsuit regarding a data breach that occurred in August 2021. The breach resulted from DEFENDANT's alleged failure...,"Feeling secure with my data, even with the recent news swirling around. Trust is key, and I've got faith in the system keeping our info...",Contradict,"```json {  ""value"": ""Contradict"" } ```",Contradict,✔️ [True]
6,DEFENDANT has agreed to pay $4.5 million to settle a class action lawsuit alleging violations of the Telephone Consumer Protection Act (TCPA) through unsolicited phone...,"Received a couple of promotional calls from a certain company, but they were quite professional and respectful. They always checked if I had time for...",Contradict,"```json {  ""value"": ""Contradict"" } ```",Contradict,✔️ [True]
7,DEFENDANT has agreed to a class action lawsuit settlement regarding defective gateway control modules in certain vehicles. The settlement benefits owners and lessees of specific...,"Can't believe how much of a headache my car been giving me lately! Random issues popping up everywhere - airbags, drivetrain, you name it. Never...",Entailed,"```json {  ""value"": ""Entailed"" } ```",Entailed,✔️ [True]
8,"DEFENDANT, a medical device company headquartered in Minnesota, has agreed to a $825,000 settlement to resolve claims related to a June 2021 ransomware data breach....",Thrilled to be part of a forward-thinking medical device company that values data security. Never had any issues with data breaches or lack of cybersecurity...,Contradict,"```json {  ""value"": ""Contradict"" } ```",Contradict,✔️ [True]
9,Consumers who received a data breach notification from DEFENDANT informing them that their personal information and/or protected health information may have been compromised in a...,"Experiencing a peculiar unease after receiving an unexpected notification from DEFENDANT, my trust seems to be wavering. Personal information compromised, it's unsettling.",Entailed,"```json {  ""value"": ""Entailed"" } ```",Entailed,✔️ [True]


In [19]:
# Look at the prediction stored at position 1 of the first evaluation result.
results[0][1]

Prediction(
    reasoning='```json\n{\n  "value": "Entailed"\n}\n```',
    label='Entailed'
)

In [20]:
# Collect parallel label lists from the evaluation outputs: position 0 of each
# entry supplies the gold label, position 1 supplies the predicted label.
y_true = [outcome[0]['label'] for outcome in results]
y_pred = [outcome[1]['label'] for outcome in results]

In [23]:
from sklearn.metrics import f1_score, accuracy_score
from sklearn.preprocessing import LabelEncoder

# Initialize the label encoder
label_encoder = LabelEncoder()

# Fit the encoder on the combined labels (true + predicted) before transforming.
# Fitting on y_true alone would make transform(y_pred) raise ValueError as soon
# as the model predicts a class that does not occur among the gold labels.
label_encoder.fit(list(y_true) + list(y_pred))
y_true_encoded = label_encoder.transform(y_true)
y_pred_encoded = label_encoder.transform(y_pred)

In [24]:
# Macro-averaged F1 over the integer-encoded gold and predicted labels.
f1_macro = f1_score(
    y_true_encoded,
    y_pred_encoded,
    average='macro',
)

print("F1 Macro Score:", f1_macro)

F1 Macro Score: 0.7669336002669337


In [25]:
# Accuracy computed directly on the string labels (no encoding step here).
accuracy = accuracy_score(
    y_true,
    y_pred,
)
print("Accuracy Score:", accuracy)

Accuracy Score: 0.776595744680851


In [22]:
# Save the optimized program under the name "nli_optimized_predictor", presumably for later reloading.
# NOTE(review): this cell's execution count (22) is lower than the metric cells (23-25)
# placed before it, so it was run out of document order; do a Restart & Run All to confirm reproducibility.
optimized_predictor.save("nli_optimized_predictor")

[('predictor', Predict(StringSignature(premise, hypothesis -> reasoning, label
    instructions='Please classify the relationship between a legal premise and a hypothesis into one of three categories: Entailed, Contradict, Neutral.'
    premise = Field(annotation=str required=True json_schema_extra={'__dspy_field_type': 'input', 'prefix': 'Premise:', 'desc': '${premise}'})
    hypothesis = Field(annotation=str required=True json_schema_extra={'__dspy_field_type': 'input', 'prefix': 'Hypothesis:', 'desc': '${hypothesis}'})
    reasoning = Field(annotation=str required=True json_schema_extra={'prefix': "Reasoning: Let's think step by step in order to", 'desc': '${produce the label}. We ...', '__dspy_field_type': 'output'})
    label = Field(annotation=Literal['Entailed', 'Contradict', 'Neutral'] required=True json_schema_extra={'__dspy_field_type': 'output', 'prefix': 'Label:', 'desc': '${label}'})
)))]
