From 547f064ab738692ac71a5b9dd667ef361b9dc0d0 Mon Sep 17 00:00:00 2001 From: stephantul Date: Sat, 15 Feb 2025 13:18:02 +0100 Subject: [PATCH] add multilabel targets, fix tests --- model2vec/inference/model.py | 56 +++++++++++++++++++++++++++++++---- model2vec/train/classifier.py | 5 ++-- tests/test_inference.py | 18 ++++++++--- uv.lock | 2 +- 4 files changed, 67 insertions(+), 14 deletions(-) diff --git a/model2vec/inference/model.py b/model2vec/inference/model.py index 5b08dad8..6a2fe6a6 100644 --- a/model2vec/inference/model.py +++ b/model2vec/inference/model.py @@ -7,6 +7,7 @@ import huggingface_hub import numpy as np import skops.io +from sklearn.neural_network import MLPClassifier from sklearn.pipeline import Pipeline from model2vec.hf_utils import _create_model_card @@ -21,6 +22,20 @@ def __init__(self, model: StaticModel, head: Pipeline) -> None: """Create a pipeline with a StaticModel encoder.""" self.model = model self.head = head + classifier = self.head[-1] + # Check if the classifier is a multilabel classifier. + # NOTE: this doesn't look robust, but it is. + # Different classifiers, such as OVR wrappers, support multilabel output natively, so we + # can just use predict. 
+ self.multilabel = False + if isinstance(classifier, MLPClassifier): + if classifier.out_activation_ == "logistic": + self.multilabel = True + + @property + def classes_(self) -> np.ndarray: + """The classes of the classifier.""" + return self.head.classes_ @classmethod def from_pretrained( @@ -60,7 +75,7 @@ def push_to_hub(self, repo_id: str, token: str | None = None, private: bool = Fa self.model.save_pretrained(temp_dir) push_folder_to_hub(Path(temp_dir), repo_id, private, token) - def _predict_and_coerce_to_2d( + def _encode_and_coerce_to_2d( self, X: list[str] | str, show_progress_bar: bool, @@ -69,7 +84,7 @@ def _predict_and_coerce_to_2d( use_multiprocessing: bool, multiprocessing_threshold: int, ) -> np.ndarray: - """Predict the labels of the input and coerce the output to a matrix.""" + """Encode the instances and coerce the output to a matrix.""" encoded = self.model.encode( X, show_progress_bar=show_progress_bar, @@ -91,9 +106,21 @@ def predict( batch_size: int = 1024, use_multiprocessing: bool = True, multiprocessing_threshold: int = 10_000, + threshold: float = 0.5, ) -> np.ndarray: - """Predict the labels of the input.""" - encoded = self._predict_and_coerce_to_2d( + """ + Predict the labels of the input. + + :param X: The input data to predict. Can be a list of strings or a single string. + :param show_progress_bar: Whether to display a progress bar during prediction. Defaults to False. + :param max_length: The maximum length of the input sequences. Defaults to 512. + :param batch_size: The batch size for prediction. Defaults to 1024. + :param use_multiprocessing: Whether to use multiprocessing for encoding. Defaults to True. + :param multiprocessing_threshold: The threshold for the number of samples to use multiprocessing. Defaults to 10,000. + :param threshold: The threshold for multilabel classification. Defaults to 0.5. Ignored if not multilabel. + :return: The predicted labels. 
+ """ + encoded = self._encode_and_coerce_to_2d( X, show_progress_bar=show_progress_bar, max_length=max_length, @@ -102,6 +129,13 @@ def predict( multiprocessing_threshold=multiprocessing_threshold, ) + if self.multilabel: + out_labels = [] + proba = self.head.predict_proba(encoded) + for vector in proba: + out_labels.append(self.classes_[vector > threshold]) + return np.asarray(out_labels) + return self.head.predict(encoded) def predict_proba( @@ -113,8 +147,18 @@ def predict_proba( use_multiprocessing: bool = True, multiprocessing_threshold: int = 10_000, ) -> np.ndarray: - """Predict the probabilities of the labels of the input.""" - encoded = self._predict_and_coerce_to_2d( + """ + Predict the probabilities of the labels of the input. + + :param X: The input data to predict. Can be a list of strings or a single string. + :param show_progress_bar: Whether to display a progress bar during prediction. Defaults to False. + :param max_length: The maximum length of the input sequences. Defaults to 512. + :param batch_size: The batch size for prediction. Defaults to 1024. + :param use_multiprocessing: Whether to use multiprocessing for encoding. Defaults to True. + :param multiprocessing_threshold: The threshold for the number of samples to use multiprocessing. Defaults to 10,000. + :return: The predicted probabilities. + """ + encoded = self._encode_and_coerce_to_2d( X, show_progress_bar=show_progress_bar, max_length=max_length, diff --git a/model2vec/train/classifier.py b/model2vec/train/classifier.py index 08f56dce..16a47eba 100644 --- a/model2vec/train/classifier.py +++ b/model2vec/train/classifier.py @@ -323,8 +323,8 @@ def to_pipeline(self) -> StaticModelPipeline: # To convert correctly, we need to set the outputs correctly, and fix the activation function. # Make sure n_outputs is set to > 1. 
mlp_head.n_outputs_ = self.out_dim - # Set to softmax - mlp_head.out_activation_ = "softmax" + # Set to softmax or sigmoid + mlp_head.out_activation_ = "logistic" if self.multilabel else "softmax" return StaticModelPipeline(static_model, converted) @@ -373,7 +373,6 @@ def configure_optimizers(self) -> OptimizerLRScheduler: mode="min", factor=0.5, patience=3, - verbose=True, min_lr=1e-6, threshold=0.03, threshold_mode="rel", diff --git a/tests/test_inference.py b/tests/test_inference.py index 9f4618df..9a12894b 100644 --- a/tests/test_inference.py +++ b/tests/test_inference.py @@ -10,8 +10,13 @@ def test_init_predict(mock_inference_pipeline: StaticModelPipeline) -> None: """Test successful initialization of StaticModelPipeline.""" - assert mock_inference_pipeline.predict("dog").tolist() == ["b"] - assert mock_inference_pipeline.predict(["dog"]).tolist() == ["b"] + target: list[str] | list[list[str]] + if mock_inference_pipeline.multilabel: + target = [["a", "b"]] + else: + target = ["b"] + assert mock_inference_pipeline.predict("dog").tolist() == target + assert mock_inference_pipeline.predict(["dog"]).tolist() == target def test_init_predict_proba(mock_inference_pipeline: StaticModelPipeline) -> None: @@ -25,8 +30,13 @@ def test_roundtrip_save(mock_inference_pipeline: StaticModelPipeline) -> None: with TemporaryDirectory() as temp_dir: mock_inference_pipeline.save_pretrained(temp_dir) loaded = StaticModelPipeline.from_pretrained(temp_dir) - assert loaded.predict("dog") == ["b"] - assert loaded.predict(["dog"]) == ["b"] + target: list[str] | list[list[str]] + if mock_inference_pipeline.multilabel: + target = [["a", "b"]] + else: + target = ["b"] + assert loaded.predict("dog").tolist() == target + assert loaded.predict(["dog"]).tolist() == target assert loaded.predict_proba("dog").argmax() == 1 assert loaded.predict_proba(["dog"]).argmax(1).tolist() == [1] diff --git a/uv.lock b/uv.lock index f7d37b8c..a0465750 100644 --- a/uv.lock +++ b/uv.lock @@ -791,7 +791,7 @@ 
wheels = [ [[package]] name = "model2vec" -version = "0.3.8" +version = "0.4.0" source = { editable = "." } dependencies = [ { name = "jinja2" },