diff --git a/README.md b/README.md
index 83f806ab..a51a1ddc 100644
--- a/README.md
+++ b/README.md
@@ -112,8 +112,7 @@ ds = load_dataset("setfit/subj")
 classifier.fit(ds["train"]["text"], ds["train"]["label"])
 
 # Evaluate the classifier
-predictions = classifier.predict(ds["test"]["text"])
-accuracy = np.mean(np.array(predictions) == np.array(ds["test"]["label"])) * 100
+classification_report = classifier.evaluate(ds["test"]["text"], ds["test"]["label"])
 ```
 
 For advanced usage, please refer to our [usage documentation](https://github.com/MinishLab/model2vec/blob/main/docs/usage.md).
diff --git a/model2vec/inference/__init__.py b/model2vec/inference/__init__.py
index de94e18d..e2499442 100644
--- a/model2vec/inference/__init__.py
+++ b/model2vec/inference/__init__.py
@@ -5,6 +5,6 @@
 for extra_dependency in get_package_extras("model2vec", _REQUIRED_EXTRA):
     importable(extra_dependency, _REQUIRED_EXTRA)
 
-from model2vec.inference.model import StaticModelPipeline
+from model2vec.inference.model import StaticModelPipeline, evaluate_single_or_multi_label
 
-__all__ = ["StaticModelPipeline"]
+__all__ = ["StaticModelPipeline", "evaluate_single_or_multi_label"]
diff --git a/model2vec/inference/model.py b/model2vec/inference/model.py
index 6f5a17ba..db69ff97 100644
--- a/model2vec/inference/model.py
+++ b/model2vec/inference/model.py
@@ -3,12 +3,15 @@
 import re
 from pathlib import Path
 from tempfile import TemporaryDirectory
+from typing import TypeVar
 
 import huggingface_hub
 import numpy as np
 import skops.io
+from sklearn.metrics import classification_report
 from sklearn.neural_network import MLPClassifier
 from sklearn.pipeline import Pipeline
+from sklearn.preprocessing import MultiLabelBinarizer
 
 from model2vec.hf_utils import _create_model_card
 from model2vec.model import PathLike, StaticModel
@@ -16,6 +19,8 @@
 _DEFAULT_TRUST_PATTERN = re.compile(r"sklearn\..+")
 _DEFAULT_MODEL_FILENAME = "pipeline.skops"
 
+LabelType = TypeVar("LabelType", list[str], list[list[str]])
+
 
 class StaticModelPipeline:
     def __init__(self, model: StaticModel, head: Pipeline) -> None:
@@ -169,6 +174,24 @@ def predict_proba(
 
         return self.head.predict_proba(encoded)
 
+    def evaluate(
+        self, X: list[str], y: LabelType, batch_size: int = 1024, threshold: float = 0.5, output_dict: bool = False
+    ) -> str | dict[str, dict[str, float]]:
+        """
+        Evaluate the classifier on a given dataset using scikit-learn's classification report.
+
+        :param X: The texts to predict on.
+        :param y: The ground truth labels.
+        :param batch_size: The batch size.
+        :param threshold: The threshold for multilabel classification.
+        :param output_dict: Whether to output the classification report as a dictionary.
+        :return: A classification report.
+        """
+        predictions = self.predict(X, show_progress_bar=True, batch_size=batch_size, threshold=threshold)
+        report = evaluate_single_or_multi_label(predictions=predictions, y=y, output_dict=output_dict)
+
+        return report
+
 
 def _load_pipeline(
     folder_or_repo_path: PathLike, token: str | None = None, trust_remote_code: bool = False
@@ -244,3 +267,41 @@ def save_pipeline(pipeline: StaticModelPipeline, folder_path: str | Path) -> Non
         language=pipeline.model.language,
         template_path="modelcards/classifier_template.md",
     )
+
+
+def _is_multi_label_shaped(y: LabelType) -> bool:
+    """Check if the labels are in a multi-label shape."""
+    return isinstance(y, (list, tuple)) and len(y) > 0 and isinstance(y[0], (list, tuple, set))
+
+
+def evaluate_single_or_multi_label(
+    predictions: np.ndarray,
+    y: LabelType,
+    output_dict: bool = False,
+) -> str | dict[str, dict[str, float]]:
+    """
+    Evaluate the classifier on a given dataset using scikit-learn's classification report.
+
+    :param predictions: The predictions.
+    :param y: The ground truth labels.
+    :param output_dict: Whether to output the classification report as a dictionary.
+    :return: A classification report.
+    """
+    if _is_multi_label_shaped(y):
+        classes = sorted(set([label for labels in y for label in labels]))
+        mlb = MultiLabelBinarizer(classes=classes)
+        y = mlb.fit_transform(y)
+        predictions = mlb.transform(predictions)
+    elif isinstance(y[0], (str, int)):
+        classes = sorted(set(y))
+
+    report = classification_report(
+        y,
+        predictions,
+        labels=np.arange(len(classes)),
+        target_names=[str(c) for c in classes],
+        output_dict=output_dict,
+        zero_division=0,
+    )
+
+    return report
diff --git a/model2vec/train/README.md b/model2vec/train/README.md
index 2c908fff..2d7aad2b 100644
--- a/model2vec/train/README.md
+++ b/model2vec/train/README.md
@@ -44,11 +44,10 @@ test = ds["test"]
 
 s = perf_counter()
 classifier = classifier.fit(train["text"], train["label"])
-predicted = classifier.predict(test["text"])
 print(f"Training took {int(perf_counter() - s)} seconds.")
 # Training took 81 seconds
-accuracy = np.mean([x == y for x, y in zip(predicted, test["label"])]) * 100
-print(f"Achieved {accuracy} test accuracy")
+classification_report = classifier.evaluate(ds["test"]["text"], ds["test"]["label"])
+print(classification_report)
 # Achieved 91.0 test accuracy
 ```
 
@@ -95,18 +94,8 @@ Then, we can evaluate the classifier:
 from sklearn import metrics
 from sklearn.preprocessing import MultiLabelBinarizer
 
-# Make predictions on the test set with a threshold of 0.3
-predictions = classifier.predict(ds["test"]["text"], threshold=0.3)
-
-# Evaluate the classifier
-mlb = MultiLabelBinarizer(classes=classifier.classes)
-y_true = mlb.fit_transform(ds["test"]["labels"])
-y_pred = mlb.transform(predictions)
-
-print(f"Accuracy: {metrics.accuracy_score(y_true, y_pred):.3f}")
-print(f"Precision: {metrics.precision_score(y_true, y_pred, average='macro', zero_division=0):.3f}")
-print(f"Recall: {metrics.recall_score(y_true, y_pred, average='macro', zero_division=0):.3f}")
-print(f"F1: {metrics.f1_score(y_true, y_pred, average='macro', zero_division=0):.3f}")
+classification_report = classifier.evaluate(ds["test"]["text"], ds["test"]["labels"], threshold=0.3)
+print(classification_report)
 # Accuracy: 0.410
 # Precision: 0.527
 # Recall: 0.410
diff --git a/model2vec/train/classifier.py b/model2vec/train/classifier.py
index 213b75ab..7189eb7b 100644
--- a/model2vec/train/classifier.py
+++ b/model2vec/train/classifier.py
@@ -19,7 +19,7 @@
 from torch import nn
 from tqdm import trange
 
-from model2vec.inference import StaticModelPipeline
+from model2vec.inference import StaticModelPipeline, evaluate_single_or_multi_label
 from model2vec.train.base import FinetunableStaticModel, TextDataset
 
 logger = logging.getLogger(__name__)
@@ -227,6 +227,25 @@ def fit(
         self.eval()
         return self
 
+    def evaluate(
+        self, X: list[str], y: LabelType, batch_size: int = 1024, threshold: float = 0.5, output_dict: bool = False
+    ) -> str | dict[str, dict[str, float]]:
+        """
+        Evaluate the classifier on a given dataset using scikit-learn's classification report.
+
+        :param X: The texts to predict on.
+        :param y: The ground truth labels.
+        :param batch_size: The batch size.
+        :param threshold: The threshold for multilabel classification.
+        :param output_dict: Whether to output the classification report as a dictionary.
+        :return: A classification report.
+        """
+        self.eval()
+        predictions = self.predict(X, show_progress_bar=True, batch_size=batch_size, threshold=threshold)
+        report = evaluate_single_or_multi_label(predictions=predictions, y=y, output_dict=output_dict)
+
+        return report
+
     def _initialize(self, y: LabelType) -> None:
         """
         Sets the output dimensionality, the classes, and initializes the head.
diff --git a/tests/conftest.py b/tests/conftest.py
index 2f258868..373d730f 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -83,22 +83,30 @@ def mock_inference_pipeline(mock_trained_pipeline: StaticModelForClassification)
     return mock_trained_pipeline.to_pipeline()
 
 
-@pytest.fixture(params=[False, True], ids=["single_label", "multilabel"], scope="session")
+@pytest.fixture(
+    params=[
+        (False, "single_label", "str"),
+        (False, "single_label", "int"),
+        (True, "multilabel", "str"),
+        (True, "multilabel", "int"),
+    ],
+    ids=lambda param: f"{param[1]}_{param[2]}",
+    scope="session",
+)
 def mock_trained_pipeline(request: pytest.FixtureRequest) -> StaticModelForClassification:
-    """Mock staticmodelforclassification."""
+    """Mock StaticModelForClassification with different label formats."""
     tokenizer = AutoTokenizer.from_pretrained("tests/data/test_tokenizer").backend_tokenizer
     torch.random.manual_seed(42)
     vectors_torched = torch.randn(len(tokenizer.get_vocab()), 12)
     model = StaticModelForClassification(vectors=vectors_torched, tokenizer=tokenizer, hidden_dim=12).to("cpu")
 
     X = ["dog", "cat"]
-    y: list[str] | list[list[str]]
-    if request.param:
-        # Use multilabel targets.
-        y = [["a", "b"], ["a"]]
+    is_multilabel, label_type = request.param[0], request.param[2]
+
+    if label_type == "str":
+        y = [["a", "b"], ["a"]] if is_multilabel else ["a", "b"]  # type: ignore
     else:
-        # Use singlelabel targets.
-        y = ["a", "b"]
+        y = [[0, 1], [0]] if is_multilabel else [0, 1]  # type: ignore
     model.fit(X, y)
 
     return model
diff --git a/tests/test_inference.py b/tests/test_inference.py
index 9a12894b..bae4732e 100644
--- a/tests/test_inference.py
+++ b/tests/test_inference.py
@@ -9,22 +9,44 @@
 
 
 def test_init_predict(mock_inference_pipeline: StaticModelPipeline) -> None:
-    """Test successful initialization of StaticModelPipeline."""
+    """Test successful init and predict with StaticModelPipeline."""
     target: list[str] | list[list[str]]
     if mock_inference_pipeline.multilabel:
-        target = [["a", "b"]]
+        if isinstance(mock_inference_pipeline.classes_[0], str):
+            target = [["a", "b"]]
+        else:
+            target = [[0, 1]]  # type: ignore
     else:
-        target = ["b"]
+        if isinstance(mock_inference_pipeline.classes_[0], str):
+            target = ["b"]
+        else:
+            target = [1]  # type: ignore
     assert mock_inference_pipeline.predict("dog").tolist() == target
     assert mock_inference_pipeline.predict(["dog"]).tolist() == target
 
 
 def test_init_predict_proba(mock_inference_pipeline: StaticModelPipeline) -> None:
-    """Test successful initialization of StaticModelPipeline."""
+    """Test successful init and predict_proba with StaticModelPipeline."""
     assert mock_inference_pipeline.predict_proba("dog").argmax() == 1
     assert mock_inference_pipeline.predict_proba(["dog"]).argmax(1).tolist() == [1]
 
 
+def test_init_evaluate(mock_inference_pipeline: StaticModelPipeline) -> None:
+    """Test successful init and evaluate with StaticModelPipeline."""
+    target: list[str] | list[list[str]]
+    if mock_inference_pipeline.multilabel:
+        if isinstance(mock_inference_pipeline.classes_[0], str):
+            target = [["a", "b"]]
+        else:
+            target = [[0, 1]]  # type: ignore
+    else:
+        if isinstance(mock_inference_pipeline.classes_[0], str):
+            target = ["b"]
+        else:
+            target = [1]  # type: ignore
+    mock_inference_pipeline.evaluate("dog", target)  # type: ignore
+
+
 def test_roundtrip_save(mock_inference_pipeline: StaticModelPipeline) -> None:
     """Test saving and loading the pipeline."""
     with TemporaryDirectory() as temp_dir:
@@ -32,9 +54,15 @@ def test_roundtrip_save(mock_inference_pipeline: StaticModelPipeline) -> None:
         loaded = StaticModelPipeline.from_pretrained(temp_dir)
         target: list[str] | list[list[str]]
         if mock_inference_pipeline.multilabel:
-            target = [["a", "b"]]
+            if isinstance(mock_inference_pipeline.classes_[0], str):
+                target = [["a", "b"]]
+            else:
+                target = [[0, 1]]  # type: ignore
         else:
-            target = ["b"]
+            if isinstance(mock_inference_pipeline.classes_[0], str):
+                target = ["b"]
+            else:
+                target = [1]  # type: ignore
         assert loaded.predict("dog").tolist() == target
         assert loaded.predict(["dog"]).tolist() == target
         assert loaded.predict_proba("dog").argmax() == 1
diff --git a/tests/test_trainable.py b/tests/test_trainable.py
index 2fd11e88..9add8b25 100644
--- a/tests/test_trainable.py
+++ b/tests/test_trainable.py
@@ -113,9 +113,15 @@ def test_predict(mock_trained_pipeline: StaticModelForClassification) -> None:
     """Test the predict function."""
     result = mock_trained_pipeline.predict(["dog cat", "dog"]).tolist()
     if mock_trained_pipeline.multilabel:
-        assert result == [["a", "b"], ["a", "b"]]
+        if type(mock_trained_pipeline.classes_[0]) == str:
+            assert result == [["a", "b"], ["a", "b"]]
+        else:
+            assert result == [[0, 1], [0, 1]]
     else:
-        assert result == ["b", "b"]
+        if type(mock_trained_pipeline.classes_[0]) == str:
+            assert result == ["b", "b"]
+        else:
+            assert result == [1, 1]
 
 
 def test_predict_proba(mock_trained_pipeline: StaticModelForClassification) -> None:
@@ -146,3 +152,19 @@ def test_train_test_split(mock_trained_pipeline: StaticModelForClassification) -
     assert len(b) == 2
     assert len(c) == len(a)
     assert len(d) == len(b)
+
+
+def test_evaluate(mock_trained_pipeline: StaticModelForClassification) -> None:
+    """Test the evaluate function."""
+    if mock_trained_pipeline.multilabel:
+        if type(mock_trained_pipeline.classes_[0]) == str:
+            mock_trained_pipeline.evaluate(["dog cat", "dog"], [["a", "b"], ["a"]])
+        else:
+            # Ignore the type error since we don't support int labels in our typing, but the code does
+            mock_trained_pipeline.evaluate(["dog cat", "dog"], [[0, 1], [0]])  # type: ignore
+    else:
+        if type(mock_trained_pipeline.classes_[0]) == str:
+            mock_trained_pipeline.evaluate(["dog cat", "dog"], ["a", "a"])
+        else:
+            # Ignore the type error since we don't support int labels in our typing, but the code does
+            mock_trained_pipeline.evaluate(["dog cat", "dog"], [1, 1])  # type: ignore
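
Below is a minimal usage sketch of the `evaluate` API introduced in this diff (not part of the change itself). The model name and dataset mirror the existing README examples and are illustrative only; the `evaluate` signature is taken from the diff above.

```python
# Minimal sketch of the evaluate() API added above; assumes model2vec[train] is installed.
# Model name and dataset follow the README examples and are illustrative.
from datasets import load_dataset

from model2vec.train import StaticModelForClassification

# Initialize a trainable classifier from a pretrained static embedding model.
classifier = StaticModelForClassification.from_pretrained(model_name="minishlab/potion-base-32M")

# Train on a single-label dataset, as in the README quickstart.
ds = load_dataset("setfit/subj")
classifier.fit(ds["train"]["text"], ds["train"]["label"])

# evaluate() predicts on the texts and returns scikit-learn's classification_report as a string.
print(classifier.evaluate(ds["test"]["text"], ds["test"]["label"]))

# With output_dict=True the report is returned as a nested dict instead.
report = classifier.evaluate(ds["test"]["text"], ds["test"]["label"], output_dict=True)
print(report["macro avg"]["f1-score"])

# For multilabel datasets, the decision threshold is forwarded to predict(), e.g.:
# classifier.evaluate(ds["test"]["text"], ds["test"]["labels"], threshold=0.3)
```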