Output reason from classify_with_scores().
carschno committed Sep 21, 2023
1 parent fcbe136 commit 0df3c98
Showing 4 changed files with 41 additions and 13 deletions.
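
In short: Pipeline.classify_with_scores() now returns a Reason value alongside the quality class and the scores dict, and the CLI script adds a "Reason" column to its CSV output when --output-scores is set. The sketch below is a minimal, hypothetical illustration of the new calling convention; the write_row helper and its pipeline, name, and page arguments are illustrative assumptions, not part of this commit.

import csv
import sys

from text_quality.classifier.pipeline import Pipeline, Reason

REASON_FIELDNAME = "Reason"


def write_row(pipeline: Pipeline, name: str, page) -> None:
    """Classify one input (a Page object or a plain string) and write a CSV row
    that includes the classification reason."""
    quality_class, classifier_scores, reason = pipeline.classify_with_scores(page)

    # Same column layout as the script: OutputRow fields, then scores, then Reason.
    fieldnames = (
        ["filename", "quality_class"] + list(classifier_scores) + [REASON_FIELDNAME]
    )
    writer = csv.DictWriter(sys.stdout, fieldnames=fieldnames)
    writer.writeheader()
    writer.writerow(
        {"filename": name, "quality_class": quality_class}
        | classifier_scores
        | {REASON_FIELDNAME: reason.name}  # "CLASSIFIER", "SHORT_COLUMNS", or "EMPTY"
    )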

scripts/classify_text_quality.py: 13 changes (10 additions, 3 deletions)

@@ -30,8 +30,12 @@

logging.basicConfig(level=LOG_LEVEL)

+ REASON_FIELDNAME = "Reason"


class OutputRow(TypedDict):
"""Container class for the rows in the CSV output."""

filename: str
quality_class: int

@@ -77,7 +81,7 @@ class OutputRow(TypedDict):
parser.add_argument(
"--output-scores",
action="store_true",
help="Output scores and text statistics.",
help="Output scores and text statistics, and reason for classification.",
)
args = parser.parse_args()

@@ -113,7 +117,7 @@ class OutputRow(TypedDict):

fieldnames = list(OutputRow.__annotations__.keys())
if args.output_scores:
- fieldnames += list(ClassifierScores.__annotations__.keys())
+ fieldnames += list(ClassifierScores.__annotations__.keys()) + [REASON_FIELDNAME]

writer = csv.DictWriter(args.output, fieldnames=fieldnames)
writer.writeheader()
@@ -122,10 +126,13 @@ class OutputRow(TypedDict):
(text_inputs | pagexml_inputs).items(), desc="Processing", unit="file"
):
if args.output_scores:
- quality_class, classifier_scores = pipeline.classify_with_scores(page)
+ quality_class, classifier_scores, reason = pipeline.classify_with_scores(
+ page
+ )
row = (
OutputRow(filename=name, quality_class=quality_class)
| classifier_scores
+ | {REASON_FIELDNAME: reason.name}
)
else:
row = OutputRow(filename=name, quality_class=pipeline.classify(page))

setup.cfg: 1 change (1 addition, 0 deletions)

@@ -39,6 +39,7 @@ install_requires =
scikit-learn==1.2.1
spylls==0.1.7
tqdm==4.65.0
+ openpyxl~=3.1.2
scripts =
scripts/classify_text_quality.py

tests/classifier/test_pipeline.py: 15 changes (11 additions, 4 deletions)

@@ -4,7 +4,7 @@
import sklearn
from pagexml.model.physical_document_model import PageXMLScan
from pagexml.model.physical_document_model import PageXMLTextLine
- from text_quality.classifier.pipeline import ClassifierScores
+ from text_quality.classifier.pipeline import ClassifierScores, Reason
from text_quality.classifier.pipeline import Pipeline
from text_quality.classifier.pipeline import default_scores_dict
from text_quality.feature.featurize import Scorers
@@ -57,7 +57,7 @@ def test_classify(self, pipeline, page, expected):
assert pipeline.classify(page) == expected

@pytest.mark.parametrize(
"text, expected_class, expected_scores",
"text, expected_class, expected_scores, expected_reason",
[
(
"",
Expand All @@ -71,6 +71,7 @@ def test_classify(self, pipeline, page, expected):
n_characters=0,
n_tokens=0,
),
+ Reason.EMPTY,
),
(
"een Nederlands tekst",
Expand All @@ -84,6 +85,7 @@ def test_classify(self, pipeline, page, expected):
n_characters=20,
n_tokens=3,
),
+ Reason.CLASSIFIER,
),
(
Page(PageXMLScan(lines=[PageXMLTextLine(text="test")])),
@@ -97,6 +99,7 @@ def test_classify(self, pipeline, page, expected):
n_characters=4,
n_tokens=0,
),
+ Reason.SHORT_COLUMNS,
),
(
Page(PageXMLScan(lines=[PageXMLTextLine(text="een Nederlands tekst")])),
@@ -110,6 +113,7 @@ def test_classify(self, pipeline, page, expected):
n_characters=20,
n_tokens=3,
),
+ Reason.CLASSIFIER,
),
(
Page(PageXMLScan(lines=[PageXMLTextLine(text="test")] * 10)),
@@ -123,15 +127,18 @@ def test_classify(self, pipeline, page, expected):
n_characters=49,
n_tokens=0,
),
+ Reason.SHORT_COLUMNS,
),
],
)
+ # pylint: disable=too-many-arguments
def test_classify_with_scores(
- self, pipeline, text, expected_class, expected_scores
+ self, pipeline, text, expected_class, expected_scores, expected_reason
):
- quality, scores = pipeline.classify_with_scores(text)
+ quality, scores, reason = pipeline.classify_with_scores(text)
assert quality == expected_class
assert scores == pytest.approx(expected_scores)
+ assert reason == expected_reason


@pytest.mark.parametrize(

text_quality/classifier/pipeline.py: 25 changes (19 additions, 6 deletions)

@@ -1,6 +1,8 @@
"""Classification pipeline."""

import logging
+ from enum import Enum
+ from enum import auto
from pathlib import Path
from typing import List
from typing import TypedDict
@@ -24,6 +26,14 @@
"""Container class for the scores returned by the classifier."""


+ class Reason(Enum):
+ """Reasons for the classification result."""
+
+ CLASSIFIER = auto()
+ SHORT_COLUMNS = auto()
+ EMPTY = auto()


def default_scores_dict(default_value, **fields) -> ClassifierScores:
"""Generate a ClassifierScores dict with default values.
@@ -93,17 +103,18 @@ def _classify_pagexml(self, pagexml: Page) -> int:

def classify_with_scores(
self, page: Union[Page, str]
- ) -> tuple[int, ClassifierScores]:
+ ) -> tuple[int, ClassifierScores, Reason]:
"""Single instance classification with scores."""

if isinstance(page, Page):
- quality, scores = self._classify_pagexml_with_scores(page)
+ quality, scores, reason = self._classify_pagexml_with_scores(page)
elif self._is_short(page):
logging.debug(
"Skipping short text: '%s' (%d characters).", page, len(page.strip())
)
quality = EMPTY_PAGE_OUTPUT
scores = default_scores_dict(0, confidence=1.0, n_characters=len(page))
+ reason = Reason.EMPTY
else:
features, tokens = self._featurizer.featurize(page)
features_df: pd.DataFrame = Featurizer.as_dataframe(features)
@@ -116,12 +127,13 @@ def classify_with_scores(
n_tokens=len(tokens),
**features,
)
+ reason = Reason.CLASSIFIER

- return quality, scores
+ return quality, scores, reason

def _classify_pagexml_with_scores(
self, pagexml: Page
- ) -> tuple[int, ClassifierScores]:
+ ) -> tuple[int, ClassifierScores, Reason]:
"""Classify a Page object with scores."""

if all(len(line) < SHORT_COLUMN_WIDTH for line in pagexml.lines()):
@@ -131,10 +143,11 @@ def _classify_pagexml_with_scores(
scores = default_scores_dict(
0, confidence=1.0, n_characters=len(pagexml.get_text())
)
+ reason = Reason.SHORT_COLUMNS
else:
- quality, scores = self.classify_with_scores(pagexml.get_text())
+ quality, scores, reason = self.classify_with_scores(pagexml.get_text())

- return quality, scores
+ return quality, scores, reason

@staticmethod
def _is_short(text: str):
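
Taken together, classify_with_scores() now reports which code path produced the result: Reason.EMPTY when the input text is too short to featurize, Reason.SHORT_COLUMNS when every line of a PageXML page is shorter than SHORT_COLUMN_WIDTH, and Reason.CLASSIFIER when the scikit-learn model was actually consulted. A hypothetical caller distinguishing the heuristic shortcuts from a genuine model prediction might look like the sketch below (the pipeline and page objects are assumed to be set up elsewhere):

from text_quality.classifier.pipeline import Reason

quality, scores, reason = pipeline.classify_with_scores(page)

if reason is Reason.CLASSIFIER:
    # The model produced the class, so the confidence score reflects a real prediction.
    print(f"Model prediction: {quality} (confidence {scores['confidence']:.2f})")
else:
    # Reason.EMPTY or Reason.SHORT_COLUMNS: heuristic shortcut with default scores.
    print(f"Heuristic result ({reason.name}): quality class {quality}")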
