Merged
Changes from all commits (37 commits)
453e5a9
Added multilabel option to training
Pringled Feb 14, 2025
0226494
Added multilabel option to training
Pringled Feb 14, 2025
a22d61a
Added multilabel option to training
Pringled Feb 14, 2025
68a4ae4
Added multilabel option to training
Pringled Feb 14, 2025
614069a
Added multilabel option to training
Pringled Feb 14, 2025
b50bc4a
Added multilabel option to training
Pringled Feb 14, 2025
6831bfe
Added threshold to predict
Pringled Feb 14, 2025
7bf46ea
Updated docs
Pringled Feb 14, 2025
d277e79
Updated docs
Pringled Feb 14, 2025
d28b895
Removed fallback logic
Pringled Feb 14, 2025
327ecb1
Updated docs
Pringled Feb 14, 2025
d38679f
Updated docs
Pringled Feb 14, 2025
6d80e90
Resolved feedback
Pringled Feb 14, 2025
ad8ea8d
Update model2vec/train/README.md
Pringled Feb 14, 2025
b3363ff
Resolved feedback
Pringled Feb 14, 2025
15f4873
Resolved feedback
Pringled Feb 14, 2025
06dc246
Resolved feedback
Pringled Feb 14, 2025
43de6da
Resolved feedback
Pringled Feb 14, 2025
8e944ab
add multilabel targets, fix tests (#194)
stephantul Feb 15, 2025
ff4043f
Merge branch 'main' of https://github.com/MinishLab/model2vec into ad…
Pringled Feb 15, 2025
5c9d397
Fixed bug with array conversion
Pringled Feb 15, 2025
6a4f89b
Optimized inference performance
Pringled Feb 15, 2025
3609e62
Changed classes to np array
Pringled Feb 15, 2025
b4df861
Added int as possible label type
Pringled Feb 16, 2025
ba29feb
Added int as possible label type
Pringled Feb 16, 2025
3dcddf5
Use previous logic
Pringled Feb 16, 2025
eccec80
Updated type check
Pringled Feb 16, 2025
f9037d9
Updated type check
Pringled Feb 16, 2025
2dc5b17
Updated type check logic
Pringled Feb 16, 2025
5003768
Fixed merge conflict
Pringled Feb 16, 2025
b6c00b8
Added evaluate function
Pringled Feb 16, 2025
a51f0bb
Updated evaluate, updated tests to also include int type labels
Pringled Feb 16, 2025
1c86d5e
Updated docs
Pringled Feb 16, 2025
f939695
Fixed inference tests
Pringled Feb 16, 2025
aa07183
Refactored evaluate. Made evaluate available for pipelines. Simplifie…
Pringled Feb 17, 2025
69d990a
Removed unused imports
Pringled Feb 17, 2025
065e04d
Updated classes logic
Pringled Feb 17, 2025
3 changes: 1 addition & 2 deletions README.md
@@ -112,8 +112,7 @@ ds = load_dataset("setfit/subj")
classifier.fit(ds["train"]["text"], ds["train"]["label"])

# Evaluate the classifier
predictions = classifier.predict(ds["test"]["text"])
accuracy = np.mean(np.array(predictions) == np.array(ds["test"]["label"])) * 100
classification_report = classifier.evaluate(ds["test"]["text"], ds["test"]["label"])
```

For advanced usage, please refer to our [usage documentation](https://github.com/MinishLab/model2vec/blob/main/docs/usage.md).
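For context, here is a minimal end-to-end sketch of the updated README flow; the constructor call and model name are assumptions, since the hunk above only shows the fit and evaluate lines.

from datasets import load_dataset
from model2vec.train import StaticModelForClassification

# Assumed constructor and model name; only fit/evaluate appear in the hunk above.
classifier = StaticModelForClassification.from_pretrained(model_name="minishlab/potion-base-8M")

# Load the dataset used in the README example
ds = load_dataset("setfit/subj")

# Train the classifier on the training split
classifier.fit(ds["train"]["text"], ds["train"]["label"])

# Evaluate the classifier: returns scikit-learn's classification report as a string
classification_report = classifier.evaluate(ds["test"]["text"], ds["test"]["label"])
print(classification_report)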
4 changes: 2 additions & 2 deletions model2vec/inference/__init__.py
@@ -5,6 +5,6 @@
for extra_dependency in get_package_extras("model2vec", _REQUIRED_EXTRA):
importable(extra_dependency, _REQUIRED_EXTRA)

from model2vec.inference.model import StaticModelPipeline
from model2vec.inference.model import StaticModelPipeline, evaluate_single_or_multi_label

__all__ = ["StaticModelPipeline"]
__all__ = ["StaticModelPipeline", "evaluate_single_or_multi_label"]
61 changes: 61 additions & 0 deletions model2vec/inference/model.py
@@ -3,19 +3,24 @@
import re
from pathlib import Path
from tempfile import TemporaryDirectory
from typing import TypeVar

import huggingface_hub
import numpy as np
import skops.io
from sklearn.metrics import classification_report
from sklearn.neural_network import MLPClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MultiLabelBinarizer

from model2vec.hf_utils import _create_model_card
from model2vec.model import PathLike, StaticModel

_DEFAULT_TRUST_PATTERN = re.compile(r"sklearn\..+")
_DEFAULT_MODEL_FILENAME = "pipeline.skops"

LabelType = TypeVar("LabelType", list[str], list[list[str]])


class StaticModelPipeline:
def __init__(self, model: StaticModel, head: Pipeline) -> None:
@@ -169,6 +174,24 @@ def predict_proba(

return self.head.predict_proba(encoded)

def evaluate(
self, X: list[str], y: LabelType, batch_size: int = 1024, threshold: float = 0.5, output_dict: bool = False
) -> str | dict[str, dict[str, float]]:
"""
Evaluate the classifier on a given dataset using scikit-learn's classification report.

:param X: The texts to predict on.
:param y: The ground truth labels.
:param batch_size: The batch size.
:param threshold: The threshold for multilabel classification.
:param output_dict: Whether to output the classification report as a dictionary.
:return: A classification report.
"""
predictions = self.predict(X, show_progress_bar=True, batch_size=batch_size, threshold=threshold)
report = evaluate_single_or_multi_label(predictions=predictions, y=y, output_dict=output_dict)

return report


def _load_pipeline(
folder_or_repo_path: PathLike, token: str | None = None, trust_remote_code: bool = False
@@ -244,3 +267,41 @@ def save_pipeline(pipeline: StaticModelPipeline, folder_path: str | Path) -> Non
language=pipeline.model.language,
template_path="modelcards/classifier_template.md",
)


def _is_multi_label_shaped(y: LabelType) -> bool:
"""Check if the labels are in a multi-label shape."""
return isinstance(y, (list, tuple)) and len(y) > 0 and isinstance(y[0], (list, tuple, set))


def evaluate_single_or_multi_label(
predictions: np.ndarray,
y: LabelType,
output_dict: bool = False,
) -> str | dict[str, dict[str, float]]:
"""
Evaluate the classifier on a given dataset using scikit-learn's classification report.

:param predictions: The predictions.
:param y: The ground truth labels.
:param output_dict: Whether to output the classification report as a dictionary.
:return: A classification report.
"""
if _is_multi_label_shaped(y):
classes = sorted(set([label for labels in y for label in labels]))
mlb = MultiLabelBinarizer(classes=classes)
y = mlb.fit_transform(y)
predictions = mlb.transform(predictions)
elif isinstance(y[0], (str, int)):
classes = sorted(set(y))

report = classification_report(
y,
predictions,
labels=np.arange(len(classes)),
target_names=[str(c) for c in classes],
output_dict=output_dict,
zero_division=0,
)

return report
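
A hedged usage sketch of the two new evaluation entry points follows; the pipeline path and the example texts and labels are placeholders, and only the signatures added in this diff are relied on.

from model2vec.inference import StaticModelPipeline, evaluate_single_or_multi_label

# Placeholder path: any folder or Hub repo containing a saved pipeline
pipeline = StaticModelPipeline.from_pretrained("path/to/saved-pipeline")

texts = ["a wonderful film", "a dull film"]
labels = ["positive", "negative"]  # placeholder ground-truth labels

# evaluate() predicts internally and returns scikit-learn's classification report
print(pipeline.evaluate(texts, labels))

# The helper can also be applied to precomputed predictions, e.g. in a custom loop
predictions = pipeline.predict(texts)
report = evaluate_single_or_multi_label(predictions=predictions, y=labels, output_dict=True)
print(report)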
19 changes: 4 additions & 15 deletions model2vec/train/README.md
@@ -44,11 +44,10 @@ test = ds["test"]
s = perf_counter()
classifier = classifier.fit(train["text"], train["label"])

predicted = classifier.predict(test["text"])
print(f"Training took {int(perf_counter() - s)} seconds.")
# Training took 81 seconds
accuracy = np.mean([x == y for x, y in zip(predicted, test["label"])]) * 100
print(f"Achieved {accuracy} test accuracy")
classification_report = classifier.evaluate(ds["test"]["text"], ds["test"]["label"])
print(classification_report)
# Achieved 91.0 test accuracy
```

@@ -95,18 +94,8 @@ Then, we can evaluate the classifier:
from sklearn import metrics
from sklearn.preprocessing import MultiLabelBinarizer

# Make predictions on the test set with a threshold of 0.3
predictions = classifier.predict(ds["test"]["text"], threshold=0.3)

# Evaluate the classifier
mlb = MultiLabelBinarizer(classes=classifier.classes)
y_true = mlb.fit_transform(ds["test"]["labels"])
y_pred = mlb.transform(predictions)

print(f"Accuracy: {metrics.accuracy_score(y_true, y_pred):.3f}")
print(f"Precision: {metrics.precision_score(y_true, y_pred, average='macro', zero_division=0):.3f}")
print(f"Recall: {metrics.recall_score(y_true, y_pred, average='macro', zero_division=0):.3f}")
print(f"F1: {metrics.f1_score(y_true, y_pred, average='macro', zero_division=0):.3f}")
classification_report = classifier.evaluate(ds["test"]["text"], ds["test"]["labels"], threshold=0.3)
print(classification_report)
# Accuracy: 0.410
# Precision: 0.527
# Recall: 0.410
21 changes: 20 additions & 1 deletion model2vec/train/classifier.py
@@ -19,7 +19,7 @@
from torch import nn
from tqdm import trange

from model2vec.inference import StaticModelPipeline
from model2vec.inference import StaticModelPipeline, evaluate_single_or_multi_label
from model2vec.train.base import FinetunableStaticModel, TextDataset

logger = logging.getLogger(__name__)
@@ -227,6 +227,25 @@ def fit(
self.eval()
return self

def evaluate(
Contributor

I feel like this function is hitting the wrong abstraction level. Here are some observations:

  • The function doesn't need to know what the classifier is, because multi_output labels look different, so you can leave out the check for self.multilabel.
  • There's no need to encode labels for non-multilabel output, you can just pass the un-encoded labels.
  • This function is not available to converted pipelines, but equally applicable.

So I would refactor this into a function that takes a bunch of labels, and then, based on the type and shape of the output, returns a report. That helper is then called by this evaluate function.

So something like this:

def evaluate(self, ...):
    predictions = self.predict(...)
    return evaluate_single_or_multilabel(predictions, y)

The evaluate_single_or_multilabel then simplifies to:

def evaluate_single_or_multilabel(y, pred):
    if _is_multi_label_shaped(y):
        # Binarization etc.
        return classification_report(y_binarized, pred_binarized)
    return classification_report(y, pred)

That way you can also test these functions without needing to have models, and can also reuse them in other contexts. The consequence of all of this, however, is that evaluate simplifies to:

evaluate_single_or_multilabel(ds["label"], model.predict(ds["text"]))

So maybe having evaluate is not even necessary any more.

Member Author

I refactored the code as per your suggestions.

  • In the inference model.py there's now evaluate_single_or_multi_label and _is_multi_label_shaped
  • Both the inference and train model.py have an evaluate function that calls evaluate_single_or_multi_label with the model predictions
  • The single-label case doesn't use a LabelEncoder anymore

This way evaluate is available to both trained models and pipeline converted models. I also updated the tests to reflect this.

As for your other comment: yes, this is essentially a thin wrapper around MultiLabelBinarizer and classification_report. However, I think this is worth it. Consider the following example:

from sklearn.metrics import classification_report
from sklearn.preprocessing import MultiLabelBinarizer

predictions = classifier.predict(X)

mlb = MultiLabelBinarizer(classes=classifier.classes)
y_true = mlb.fit_transform(ds["test"]["labels"])
y_pred = mlb.transform(predictions)

print(classification_report(y_true, y_pred, target_names=classifier.classes, zero_division=0))

Vs:

print(classifier.evaluate(X, y))

This is much easier to run and understand in my opinion, and fits in with the rest of our training code, which creates a wrapper around torch/lightning. While the function does not add much for the single-label case, it does provide a unified interface and function, and even in that case it does give a slightly nicer way to evaluate IMO:

from sklearn.metrics import classification_report

predictions = classifier.predict(X)

print(classification_report(y, predictions, target_names=classifier.classes, zero_division=0))

Vs:

print(classifier.evaluate(X, y))

self, X: list[str], y: LabelType, batch_size: int = 1024, threshold: float = 0.5, output_dict: bool = False
) -> str | dict[str, dict[str, float]]:
"""
Evaluate the classifier on a given dataset using scikit-learn's classification report.

:param X: The texts to predict on.
:param y: The ground truth labels.
:param batch_size: The batch size.
:param threshold: The threshold for multilabel classification.
:param output_dict: Whether to output the classification report as a dictionary.
:return: A classification report.
"""
self.eval()
predictions = self.predict(X, show_progress_bar=True, batch_size=batch_size, threshold=threshold)
report = evaluate_single_or_multi_label(predictions=predictions, y=y, output_dict=output_dict)

return report

def _initialize(self, y: LabelType) -> None:
"""
Sets the output dimensionality, the classes, and initializes the head.
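As a brief usage note for the evaluate method added above: the sketch below is not part of this diff and assumes a classifier that has already been fit on multi-label targets, using the dataset from the training README.

# Assumes `classifier` is a fitted StaticModelForClassification trained on multi-label
# targets and `ds` is the dataset loaded in the training README above.
report = classifier.evaluate(
    ds["test"]["text"],    # texts to predict on
    ds["test"]["labels"],  # ground-truth label lists
    threshold=0.3,         # decision threshold for multilabel classification
    output_dict=False,     # set True to get a nested dict instead of a report string
)
print(report)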
24 changes: 16 additions & 8 deletions tests/conftest.py
@@ -83,22 +83,30 @@ def mock_inference_pipeline(mock_trained_pipeline: StaticModelForClassification)
return mock_trained_pipeline.to_pipeline()


@pytest.fixture(params=[False, True], ids=["single_label", "multilabel"], scope="session")
@pytest.fixture(
params=[
(False, "single_label", "str"),
(False, "single_label", "int"),
(True, "multilabel", "str"),
(True, "multilabel", "int"),
],
ids=lambda param: f"{param[1]}_{param[2]}",
scope="session",
)
def mock_trained_pipeline(request: pytest.FixtureRequest) -> StaticModelForClassification:
"""Mock staticmodelforclassification."""
"""Mock StaticModelForClassification with different label formats."""
tokenizer = AutoTokenizer.from_pretrained("tests/data/test_tokenizer").backend_tokenizer
torch.random.manual_seed(42)
vectors_torched = torch.randn(len(tokenizer.get_vocab()), 12)
model = StaticModelForClassification(vectors=vectors_torched, tokenizer=tokenizer, hidden_dim=12).to("cpu")

X = ["dog", "cat"]
y: list[str] | list[list[str]]
if request.param:
# Use multilabel targets.
y = [["a", "b"], ["a"]]
is_multilabel, label_type = request.param[0], request.param[2]

if label_type == "str":
y = [["a", "b"], ["a"]] if is_multilabel else ["a", "b"] # type: ignore
else:
# Use singlelabel targets.
y = ["a", "b"]
y = [[0, 1], [0]] if is_multilabel else [0, 1] # type: ignore

model.fit(X, y)

40 changes: 34 additions & 6 deletions tests/test_inference.py
@@ -9,32 +9,60 @@


def test_init_predict(mock_inference_pipeline: StaticModelPipeline) -> None:
"""Test successful initialization of StaticModelPipeline."""
"""Test successful init and predict with StaticModelPipeline."""
target: list[str] | list[list[str]]
if mock_inference_pipeline.multilabel:
target = [["a", "b"]]
if isinstance(mock_inference_pipeline.classes_[0], str):
target = [["a", "b"]]
else:
target = [[0, 1]] # type: ignore
else:
target = ["b"]
if isinstance(mock_inference_pipeline.classes_[0], str):
target = ["b"]
else:
target = [1] # type: ignore
assert mock_inference_pipeline.predict("dog").tolist() == target
assert mock_inference_pipeline.predict(["dog"]).tolist() == target


def test_init_predict_proba(mock_inference_pipeline: StaticModelPipeline) -> None:
"""Test successful initialization of StaticModelPipeline."""
"""Test successful init and predict_proba with StaticModelPipeline."""
assert mock_inference_pipeline.predict_proba("dog").argmax() == 1
assert mock_inference_pipeline.predict_proba(["dog"]).argmax(1).tolist() == [1]


def test_init_evaluate(mock_inference_pipeline: StaticModelPipeline) -> None:
"""Test successful init and evaluate with StaticModelPipeline."""
target: list[str] | list[list[str]]
if mock_inference_pipeline.multilabel:
if isinstance(mock_inference_pipeline.classes_[0], str):
target = [["a", "b"]]
else:
target = [[0, 1]] # type: ignore
else:
if isinstance(mock_inference_pipeline.classes_[0], str):
target = ["b"]
else:
target = [1] # type: ignore
mock_inference_pipeline.evaluate("dog", target) # type: ignore


def test_roundtrip_save(mock_inference_pipeline: StaticModelPipeline) -> None:
"""Test saving and loading the pipeline."""
with TemporaryDirectory() as temp_dir:
mock_inference_pipeline.save_pretrained(temp_dir)
loaded = StaticModelPipeline.from_pretrained(temp_dir)
target: list[str] | list[list[str]]
if mock_inference_pipeline.multilabel:
target = [["a", "b"]]
if isinstance(mock_inference_pipeline.classes_[0], str):
target = [["a", "b"]]
else:
target = [[0, 1]] # type: ignore
else:
target = ["b"]
if isinstance(mock_inference_pipeline.classes_[0], str):
target = ["b"]
else:
target = [1] # type: ignore
assert loaded.predict("dog").tolist() == target
assert loaded.predict(["dog"]).tolist() == target
assert loaded.predict_proba("dog").argmax() == 1
26 changes: 24 additions & 2 deletions tests/test_trainable.py
@@ -113,9 +113,15 @@ def test_predict(mock_trained_pipeline: StaticModelForClassification) -> None:
"""Test the predict function."""
result = mock_trained_pipeline.predict(["dog cat", "dog"]).tolist()
if mock_trained_pipeline.multilabel:
assert result == [["a", "b"], ["a", "b"]]
if type(mock_trained_pipeline.classes_[0]) == str:
assert result == [["a", "b"], ["a", "b"]]
else:
assert result == [[0, 1], [0, 1]]
else:
assert result == ["b", "b"]
if type(mock_trained_pipeline.classes_[0]) == str:
assert result == ["b", "b"]
else:
assert result == [1, 1]


def test_predict_proba(mock_trained_pipeline: StaticModelForClassification) -> None:
@@ -146,3 +152,19 @@ def test_train_test_split(mock_trained_pipeline: StaticModelForClassification) -
assert len(b) == 2
assert len(c) == len(a)
assert len(d) == len(b)


def test_evaluate(mock_trained_pipeline: StaticModelForClassification) -> None:
"""Test the evaluate function."""
if mock_trained_pipeline.multilabel:
if type(mock_trained_pipeline.classes_[0]) == str:
mock_trained_pipeline.evaluate(["dog cat", "dog"], [["a", "b"], ["a"]])
else:
# Ignore the type error since we don't support int labels in our typing, but the code does
mock_trained_pipeline.evaluate(["dog cat", "dog"], [[0, 1], [0]]) # type: ignore
else:
if type(mock_trained_pipeline.classes_[0]) == str:
mock_trained_pipeline.evaluate(["dog cat", "dog"], ["a", "a"])
else:
# Ignore the type error since we don't support int labels in our typing, but the code does
mock_trained_pipeline.evaluate(["dog cat", "dog"], [1, 1]) # type: ignore