In [1]:
%reload_ext autoreload
%autoreload 2

In [2]:
from pprint import pprint

In [3]:
from evalem._base.structures import (
    PredictionDTO,
    ReferenceDTO,
    EvaluationDTO,
    PredictionInstance,
    ReferenceInstance
)
from evalem.misc.utils import format_to_jury

# Generate dummy data

In [4]:
# Single References (SR)
references = [
    ReferenceDTO(value="Reference 1"),
    ReferenceDTO(value="Reference 2")
]
format_to_jury(references)

['Reference 1', 'Reference 2']

In [5]:
# Multiple References (MR)
references = [
    [
        ReferenceDTO("Reference 1"),
        dict(value="Reference 1.1"),
                dict(value="Reference 2"),

    ],
    [
        dict(value="Reference 2"),
        ReferenceDTO("Dummy 1"),
        ReferenceDTO("Dummy 2"),
        "Dummy 3"
    ]
]
format_to_jury(references)

[['Reference 1', 'Reference 1.1', 'Reference 2'],
 ['Reference 2', 'Dummy 1', 'Dummy 2', 'Dummy 3']]

In [6]:
predictions = [
    PredictionDTO(value="Reference 1", score=1.0),
    PredictionDTO(value="Reference 2.5", score=0.75)
]

# Evaluation

All the evaluation metric takes in same format (references and predictions).
And each metric results in same data structure of MetricResult.

In [7]:
from evalem._base.evaluators import (
    Evaluator,
)
from evalem._base.structures import MetricResult

In [8]:
# Base metrics
from evalem._base.metrics import (
    Metric,
    JuryBasedMetric,
    AccuracyMetric,
    PrecisionMetric,
    RecallMetric,
    F1Metric,
    BasicMetric,
    ConfusionMatrix
)

## Basic Metrics

In [9]:
pprint(PrecisionMetric()(references=references, predictions=predictions))

MetricResult(score=0.8333333333333333,
             total_items=2,
             metric_name='PrecisionMetric',
             empty_items=0,
             extra={'precision': {'score': 0.8333333333333333}})


In [10]:
pprint(RecallMetric()(references=references, predictions=predictions))

MetricResult(score=1.0,
             total_items=2,
             metric_name='RecallMetric',
             empty_items=0,
             extra={'recall': {'score': 1.0}})


In [11]:
pprint(AccuracyMetric()(references=references, predictions=predictions))

MetricResult(score=0.8333333333333333,
             total_items=2,
             metric_name='AccuracyMetric',
             empty_items=0,
             extra={'accuracy': {'score': 0.8333333333333333}})


In [12]:
pprint(ConfusionMatrix()(references=references, predictions=predictions))

MetricResult(score=None,
             total_items=7,
             metric_name='ConfusionMatrix',
             empty_items=0,
             extra={'confusion_matrix': array([[0, 0, 0, 0, 0, 0, 1],
       [0, 0, 0, 0, 0, 0, 1],
       [0, 0, 0, 0, 0, 0, 1],
       [0, 0, 0, 1, 0, 0, 0],
       [0, 0, 0, 1, 0, 0, 0],
       [0, 0, 0, 1, 0, 0, 1],
       [0, 0, 0, 0, 0, 0, 0]]),
                    'flattened': True,
                    'labels': ['Dummy 1',
                               'Dummy 2',
                               'Dummy 3',
                               'Reference 1',
                               'Reference 1.1',
                               'Reference 2',
                               'Reference 2.5']})


## Wrap with evaluator

In [13]:
evaluator = Evaluator(metrics=[
    PrecisionMetric(),
    RecallMetric(),
    F1Metric(),
    AccuracyMetric()
])

In [14]:
pprint(evaluator(references=references, predictions=predictions))

[MetricResult(score=0.8333333333333333,
              total_items=2,
              metric_name='PrecisionMetric',
              empty_items=0,
              extra={'precision': {'score': 0.8333333333333333}}),
 MetricResult(score=1.0,
              total_items=2,
              metric_name='RecallMetric',
              empty_items=0,
              extra={'recall': {'score': 1.0}}),
 MetricResult(score=0.9,
              total_items=2,
              metric_name='F1Metric',
              empty_items=0,
              extra={'f1': {'score': 0.9}}),
 MetricResult(score=0.8333333333333333,
              total_items=2,
              metric_name='AccuracyMetric',
              empty_items=0,
              extra={'accuracy': {'score': 0.8333333333333333}})]


## NLP metric

All the NLP metrics are derived from `NLPMetric` base class (which is directly inherited from `evalem._base.Metric`).

`evalem.nlp.SemanticMetric` represents semantic metrics which are bert scores and the likes.

In [15]:
from evalem.nlp.metrics import NLPMetric, SemanticMetric

from evalem.nlp.metrics import (
    BartScore,
    BertScore,
    BleuMetric,
    ExactMatchMetric,
    MeteorMetric,
    RougeMetric,
    SacreBleuMetric,
    
)

In [16]:
references = [
    "I love NLP",
    "I love working with language models",
    "I love my cat"
]

In [17]:
predictions = [
    "I don't really like doing NLP",
    "Language models are okay",
    "I absolutely love my cat"
]

In [18]:
pprint(BertScore(device="cpu")(references=references, predictions=predictions))

MetricResult(score=0.7106098333994547,
             total_items=3,
             metric_name='BertScore',
             empty_items=0,
             extra={'bertscore': {'f1': 0.7106098333994547,
                                  'hashcode': 'bert-base-uncased_L9_no-idf_version=0.3.12(hug_trans=4.28.1)',
                                  'precision': 0.6862475474675497,
                                  'recall': 0.7450003822644552,
                                  'score': 0.7106098333994547}})


In [19]:
pprint(BertScore(device="cpu")(references=references, predictions=predictions))

MetricResult(score=0.7106098333994547,
             total_items=3,
             metric_name='BertScore',
             empty_items=0,
             extra={'bertscore': {'f1': 0.7106098333994547,
                                  'hashcode': 'bert-base-uncased_L9_no-idf_version=0.3.12(hug_trans=4.28.1)',
                                  'precision': 0.6862475474675497,
                                  'recall': 0.7450003822644552,
                                  'score': 0.7106098333994547}})


In [20]:
pprint(ExactMatchMetric()(references=references, predictions=predictions))



MetricResult(score=0.0,
             total_items=3,
             metric_name='ExactMatchMetric',
             empty_items=0,
             extra={'exact_match': 0.0, 'flattened': True})


In [21]:
pprint(MeteorMetric()(references=references, predictions=predictions))

MetricResult(score=0.5106758851564175,
             total_items=3,
             metric_name='MeteorMetric',
             empty_items=0,
             extra={'meteor': {'score': 0.5106758851564175}})


In [22]:
evaluator_nlp = Evaluator(metrics=[
    BertScore(device="cpu"),
    BartScore(device="cpu"),
    ExactMatchMetric(),
    MeteorMetric(),
    RougeMetric(),
    SacreBleuMetric(),
    BleuMetric()
])

In [23]:
result = evaluator_nlp(references=references, predictions=predictions)

In [24]:
pprint(result)

[MetricResult(score=0.7106098333994547,
              total_items=3,
              metric_name='BertScore',
              empty_items=0,
              extra={'bertscore': {'f1': 0.7106098333994547,
                                   'hashcode': 'bert-base-uncased_L9_no-idf_version=0.3.12(hug_trans=4.28.1)',
                                   'precision': 0.6862475474675497,
                                   'recall': 0.7450003822644552,
                                   'score': 0.7106098333994547}}),
 MetricResult(score=-3.517256816228231,
              total_items=3,
              metric_name='BartScore',
              empty_items=0,
              extra={'flattened': True,
                     'model_checkpoint': 'bartscore-large-cnn'}),
 MetricResult(score=0.0,
              total_items=3,
              metric_name='ExactMatchMetric',
              empty_items=0,
              extra={'exact_match': 0.0, 'flattened': True}),
 MetricResult(score=0.5106758851564175,
              tot

# Model wrappers

In [25]:
from transformers import AutoModelForQuestionAnswering, AutoTokenizer

## QA

In [26]:
from evalem.nlp.models import QuestionAnsweringHFPipelineWrapper
from evalem.nlp.structures import QuestionAnsweringDTO

In [27]:
model = AutoModelForQuestionAnswering.from_pretrained("distilbert-base-cased-distilled-squad")

In [28]:
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-cased-distilled-squad")

In [29]:
wrapped_model = QuestionAnsweringHFPipelineWrapper(
    model=model,
    tokenizer=tokenizer,
    device="mps"
)

In [30]:
# Random gpt-4 generated samples for testing
data = [
    {
        "context": "Deep within the labyrinthine caves, echoes tell tales of lost civilizations.",
        "question": "What sails toward the unknown?",
        "reference": "A lone ship"
    },
    {
        "context": "Beneath the bustling city streets, forgotten catacombs hold secrets of the past.",
        "question": "What watches the skies?",
        "reference": "The old observatory"
    },
    {
        "context": "In the heart of the ancient forest, a hidden lake glimmers under the moonlight.",
        "question": "What lies in the heart of the forest?",
        "reference": "A hidden lake"
    },
    {
        "context": "Beneath the bustling city streets, forgotten catacombs hold secrets of the past.",
        "question": "What lies in the heart of the forest?",
        "reference": "Forgotten catacombs"
    },
    {
        "context": "Deep within the labyrinthine caves, echoes tell tales of lost civilizations.",
        "question": "What lies in the heart of the forest?",
        "reference": "Labyrinthine caves"
    }
]

In [31]:
# forward pass
wrapped_model([ dict(context=d["context"], question=d["context"]) for d in data])

[QuestionAnsweringDTO(value='Deep within the labyrinthine caves, echoes tell tales of lost civilizations', score=0.27572375535964966, start=0, end=75, context=None, question=None),
 QuestionAnsweringDTO(value='forgotten catacombs hold secrets of the past', score=0.2389649599790573, start=35, end=79, context=None, question=None),
 QuestionAnsweringDTO(value='a hidden lake glimmers under the moonlight', score=0.08825943619012833, start=36, end=78, context=None, question=None),
 QuestionAnsweringDTO(value='forgotten catacombs hold secrets of the past', score=0.2389649599790573, start=35, end=79, context=None, question=None),
 QuestionAnsweringDTO(value='Deep within the labyrinthine caves, echoes tell tales of lost civilizations', score=0.27572375535964966, start=0, end=75, context=None, question=None)]

## QA Evaluator

In [32]:
from evalem import SimpleEvaluationPipeline
from evalem.nlp.evaluators import QAEvaluator

In [33]:
eval_pipe = SimpleEvaluationPipeline(
    model=wrapped_model,
    evaluators=[QAEvaluator()]
)

In [34]:
inputs = [ dict(context=d["context"], question=d["context"]) for d in data]
references = list(map(lambda x: x["reference"], data))

In [35]:
# We can directly run the evaluator by providing references and predictions
result = QAEvaluator()(
    references=references,
    predictions=wrapped_model(inputs)
)

In [36]:
# Or the evaluation pipeline will take care of the forward pass
results = eval_pipe(
    inputs,
    references,
    model_params=dict(batch_size=1)
)

In [37]:
pprint(results)

[[MetricResult(score=0.20779220779220778,
               total_items=5,
               metric_name='AccuracyMetric',
               empty_items=0,
               extra={'accuracy': {'score': 0.20779220779220778}}),
  MetricResult(score=0.0,
               total_items=5,
               metric_name='ExactMatchMetric',
               empty_items=0,
               extra={'exact_match': 0.0, 'flattened': True}),
  MetricResult(score=0.3168316831683168,
               total_items=5,
               metric_name='F1Metric',
               empty_items=0,
               extra={'f1': {'score': 0.3168316831683168}})]]


## Text Classification

In [38]:
from transformers import AutoModelForSequenceClassification

In [39]:
from evalem.nlp.models import TextClassificationHFPipelineWrapper

In [40]:
model = AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased-finetuned-sst-2-english")

In [41]:
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased-finetuned-sst-2-english")

In [42]:
wrapped_model = TextClassificationHFPipelineWrapper(
    model=model,
    tokenizer=tokenizer
)

In [43]:
wrapped_model("I do like mangoes")

[ClassificationDTO(value='POSITIVE', score=0.9972068667411804)]

## Text Classification Evaluator

In [44]:
from evalem.nlp.evaluators import TextClassificationEvaluator

In [45]:
from evalem.nlp.misc.datasets import get_imdb

In [46]:
inputs = [
    "I love NLP",
    "I love working with language models",
    "I love my cat",
    "I don't like mangoes"
]

references = [
    "POSITIVE",
    "POSITIVE",
    "POSITIVE",
    "NEGATIVE"
]

In [47]:
# data = get_imdb("test", nsamples=25, shuffle=True)

In [49]:
wrapped_model(inputs)

[ClassificationDTO(value='POSITIVE', score=0.9997692704200745),
 ClassificationDTO(value='POSITIVE', score=0.9984956979751587),
 ClassificationDTO(value='POSITIVE', score=0.9998416900634766),
 ClassificationDTO(value='NEGATIVE', score=0.9931724667549133)]

In [50]:
evaluator = TextClassificationEvaluator()

In [51]:
result = evaluator(
    references=references,
    predictions=wrapped_model(inputs)
)

In [52]:
pprint(result)

[MetricResult(score=1.0,
              total_items=4,
              metric_name='AccuracyMetric',
              empty_items=0,
              extra={'accuracy': {'score': 1.0}}),
 MetricResult(score=1.0,
              total_items=4,
              metric_name='F1Metric',
              empty_items=0,
              extra={'f1': {'score': 1.0}}),
 MetricResult(score=1.0,
              total_items=4,
              metric_name='PrecisionMetric',
              empty_items=0,
              extra={'precision': {'score': 1.0}}),
 MetricResult(score=1.0,
              total_items=4,
              metric_name='RecallMetric',
              empty_items=0,
              extra={'recall': {'score': 1.0}}),
 MetricResult(score=None,
              total_items=4,
              metric_name='ConfusionMatrix',
              empty_items=0,
              extra={'confusion_matrix': array([[1, 0],
       [0, 3]]),
                     'flattened': True,
                     'labels': ['NEGATIVE', 'POSITIVE']})]


In [53]:
# instead of manually doing forward pass for wrapped model, wrap it up
eval_pipe = SimpleEvaluationPipeline(
    model=wrapped_model,
    evaluators=[TextClassificationEvaluator()]
)

In [54]:
result = eval_pipe(inputs, references)

In [55]:
pprint(result)

[[MetricResult(score=1.0,
               total_items=4,
               metric_name='AccuracyMetric',
               empty_items=0,
               extra={'accuracy': {'score': 1.0}}),
  MetricResult(score=1.0,
               total_items=4,
               metric_name='F1Metric',
               empty_items=0,
               extra={'f1': {'score': 1.0}}),
  MetricResult(score=1.0,
               total_items=4,
               metric_name='PrecisionMetric',
               empty_items=0,
               extra={'precision': {'score': 1.0}}),
  MetricResult(score=1.0,
               total_items=4,
               metric_name='RecallMetric',
               empty_items=0,
               extra={'recall': {'score': 1.0}}),
  MetricResult(score=None,
               total_items=4,
               metric_name='ConfusionMatrix',
               empty_items=0,
               extra={'confusion_matrix': array([[1, 0],
       [0, 3]]),
                      'flattened': True,
                      'labels': ['

# Compose eny evaluation

In [56]:
evaluators = [
    Evaluator(metrics=[
        AccuracyMetric(),
        ConfusionMatrix(),
        ExactMatchMetric(),
        F1Metric(),
    ]),
    Evaluator(metrics=[
        BertScore(),
        BartScore()
    ])
]



In [57]:
eval_pipe = SimpleEvaluationPipeline(
    model = wrapped_model,
    evaluators=evaluators
)

In [58]:
pprint(eval_pipe(inputs, references))

[[MetricResult(score=1.0,
               total_items=4,
               metric_name='AccuracyMetric',
               empty_items=0,
               extra={'accuracy': {'score': 1.0}}),
  MetricResult(score=None,
               total_items=4,
               metric_name='ConfusionMatrix',
               empty_items=0,
               extra={'confusion_matrix': array([[1, 0],
       [0, 3]]),
                      'flattened': True,
                      'labels': ['NEGATIVE', 'POSITIVE']}),
  MetricResult(score=1.0,
               total_items=4,
               metric_name='ExactMatchMetric',
               empty_items=0,
               extra={'exact_match': 1.0, 'flattened': True}),
  MetricResult(score=1.0,
               total_items=4,
               metric_name='F1Metric',
               empty_items=0,
               extra={'f1': {'score': 1.0}})],
 [MetricResult(score=0.9999999552965164,
               total_items=4,
               metric_name='BertScore',
               empty_items=0,
