FIX: Check current task without try/except

IndicoDataSolutions · Jul 17, 2019 · 9b2f98b · 9b2f98b
1 parent e9eb9d3
commit 9b2f98b
Show file tree

Hide file tree

Showing 8 changed files with 59 additions and 35 deletions.
diff --git a/finetune/base.py b/finetune/base.py
@@ -7,6 +7,7 @@
 import math
 from abc import ABCMeta, abstractmethod
 from copy import deepcopy
+from enum import Enum
 import tempfile
 import time
 import sys
@@ -716,26 +717,25 @@ def finetune_grid_search_cv(cls, Xs, Y, *, n_splits, test_size, eval_fn=None, pr
 
         return max(aggregated_results, key=lambda x: x[1])[0]
 
-    def process_long_sequence(self, X, task, probas=False):
-        tasks = ['sequence_labeling', 'classification', 'regression']
-        assert task in tasks, 'invalid task for processing long sequences'
+    class Task(Enum):  # task identifiers nested on the model class; only one member is declared
+        SEQUENCE_LABELING = 'sequence_labeling'  # NOTE(review): no use of Task is visible in this diff - confirm it is referenced
+
+
+    def process_long_sequence(self, X, probas=False):
         chunk_size = self.config.max_length - 2
         step_size = chunk_size // 3
-        if task == 'sequence_labeling':
-            arr_encoded = list(itertools.chain.from_iterable(self.input_pipeline._text_to_ids([x]) for x in X))
-        else:
-            arr_encoded = list(itertools.chain.from_iterable(self.input_pipeline._text_to_ids(x) for x in X))
-
+        arr_encoded = list(itertools.chain.from_iterable(self.input_pipeline._text_to_ids(x) 
+            for x in self.input_pipeline._format_for_inference(X)))
+
         labels, batch_probas = [], []
         pred_keys = [PredictMode.NORMAL]
-        if task != 'regression':
+        if probas:
             pred_keys.append(PredictMode.PROBAS)
         for pred in self._inference(X, predict_keys=[PredictMode.PROBAS, PredictMode.NORMAL], n_examples=len(arr_encoded)):
-            try:
-                labels.append(self.input_pipeline.label_encoder.inverse_transform([pred[PredictMode.NORMAL]]))
-            except ValueError:
-                labels.append(self.input_pipeline.label_encoder.inverse_transform(pred[PredictMode.NORMAL]))
-            if task != 'regression':
+            labels.append(self.input_pipeline.label_encoder.inverse_transform(
+                pred[PredictMode.NORMAL] if hasattr(self,'multi_label') else [pred[PredictMode.NORMAL]] # only wrap in list if not sequence labeling
+            ))
+            if probas:
                 batch_probas.append(pred[PredictMode.PROBAS])
 
         if not batch_probas:

diff --git a/finetune/config.py b/finetune/config.py
@@ -226,7 +226,7 @@ def get_default_config():
     """
     settings = Settings(
         # General Settings
-        low_memory_mode=False,
+        low_memory_mode=True,
         interpolate_pos_embed=False,
         save_adam_vars=True,
         shuffle_buffer_size=100,
@@ -302,7 +302,7 @@ def get_default_config():
         subtoken_predictions=False,
         multi_label_sequences=False,
         multi_label_threshold=0.5,
-        chunk_long_sequences=False,
+        chunk_long_sequences=True,
 
         # Regression Params
         regression_loss="L2",

diff --git a/finetune/target_models/classifier.py b/finetune/target_models/classifier.py
@@ -33,15 +33,8 @@ class Classifier(BaseModel):
     :param \**kwargs: key-value pairs of config items to override.
     """
 
-    defaults = {
-        "low_memory_mode": True,
-        "chunk_long_sequences": True
-    }
-
     def __init__(self, **kwargs):
-        d = copy.deepcopy(Classifier.defaults)
-        d.update(kwargs)
-        super().__init__(**d)
+        super().__init__(**kwargs)  # the class-level defaults dict is removed above, so config overrides are forwarded unchanged
 
     def _get_input_pipeline(self):
         return ClassificationPipeline(self.config)
@@ -55,7 +48,7 @@ def featurize(self, X):
         """
         return super().featurize(X)
 
-    def predict(self, X, probas=False):
+    def predict(self, X, probas=True):
         """
         Produces a list of most likely class labels as determined by the fine-tuned model.
 
@@ -69,14 +62,12 @@ def predict(self, X, probas=False):
         all_labels = []
         all_probs = []
 
-        for _, start_of_doc, end_of_doc, label, proba in self.process_long_sequence(X, task='classification'):
+        for _, start_of_doc, end_of_doc, _, proba in self.process_long_sequence(X, probas):
             start, end = 0, None
             if start_of_doc:
                 # if this is the first chunk in a document, start accumulating from scratch
-                doc_labels = []
                 doc_probs = []
 
-            doc_labels.append(label)
             doc_probs.append(proba)
 
             if end_of_doc:
@@ -101,7 +92,7 @@ def predict_proba(self, X):
         :param X: list or array of text to embed.
         :returns: list of dictionaries.  Each dictionary maps from a class label to its assigned class probability.
         """
-        return self.predict(X, probas=True)
+        return self.predict(X)
 
     def finetune(self, X, Y=None, batch_size=None):
         """

diff --git a/finetune/target_models/comparison.py b/finetune/target_models/comparison.py
@@ -40,6 +40,17 @@ class Comparison(Classifier):
     :param \**kwargs: key-value pairs of config items to override.
     """
 
+    defaults = {
+        "chunk_long_sequences": False
+    }
+
+    def __init__(self, **kwargs):  # kwargs: config overrides, applied on top of Comparison.defaults
+        d = copy.deepcopy(Comparison.defaults)  # copy so the shared class-level dict is never mutated
+        d.update(kwargs)
+        super().__init__(**d)
+        if self.config.chunk_long_sequences:  # defaults set this to False, so presumably only an explicit override gets here
+            raise FinetuneError("Comparison model is incompatible with chunk_long_sequences = True in config.")
+
     def _get_input_pipeline(self):
         return ComparisonPipeline(self.config)
 

diff --git a/finetune/target_models/multifield.py b/finetune/target_models/multifield.py
@@ -21,6 +21,17 @@ class MultiFieldClassifier(Classifier):
     :param \**kwargs: key-value pairs of config items to override.
     """
 
+    defaults = {
+        "chunk_long_sequences": False
+    }
+
+    def __init__(self, **kwargs):  # kwargs: config overrides, applied on top of MultiFieldClassifier.defaults
+        d = copy.deepcopy(MultiFieldClassifier.defaults)  # must match the declared class name; the old spelling raised NameError
+        d.update(kwargs)
+        super().__init__(**d)
+        if self.config.chunk_long_sequences:  # defaults set this to False, so presumably only an explicit override gets here
+            raise FinetuneError("Multifield model is incompatible with chunk_long_sequences = True in config.")
+
     def _get_input_pipeline(self):
         return MultiFieldClassificationPipeline(self.config)
 

diff --git a/finetune/target_models/ordinal_regressor.py b/finetune/target_models/ordinal_regressor.py
@@ -50,7 +50,20 @@ def predict(self, X):
         :param X: list or array of text to embed.
         :returns: list of class labels.
         """
-        return super().predict(X).tolist()
+        all_labels=[]
+        for _, start_of_doc, end_of_doc, label, _ in self.process_long_sequence(X):
+            if start_of_doc:
+                # if this is the first chunk in a document, start accumulating from scratch
+                doc_labels = []
+
+            doc_labels.append(label)
+
+            if end_of_doc:
+                # last chunk in a document
+                means = np.mean(doc_labels, axis=0)
+                label = self.input_pipeline.label_encoder.inverse_transform([means])
+                all_labels.append(list(label))
+        return all_labels
 
     def predict_proba(self, X):
         """

diff --git a/finetune/target_models/regressor.py b/finetune/target_models/regressor.py
@@ -42,7 +42,7 @@ def predict(self, X):
         :returns: list of class labels.
         """
         all_labels=[]
-        for _, start_of_doc, end_of_doc, label, _ in self.process_long_sequence(X, task='regression'):
+        for _, start_of_doc, end_of_doc, label, _ in self.process_long_sequence(X):
             if start_of_doc:
                 # if this is the first chunk in a document, start accumulating from scratch
                 doc_labels = []

diff --git a/finetune/target_models/sequence_labeling.py b/finetune/target_models/sequence_labeling.py
@@ -147,9 +147,7 @@ class SequenceLabeler(BaseModel):
 
     defaults = {
         "n_epochs": 5,
-        "lr_warmup": 0.1,
-        "low_memory_mode": True,
-        "chunk_long_sequences": True
+        "lr_warmup": 0.1
     }
 
     def __init__(self, **kwargs):
@@ -203,7 +201,7 @@ def predict(self, X, per_token=False):
         chunk_size = self.config.max_length - 2
         step_size = chunk_size // 3
         doc_idx = -1
-        for position_seq, start_of_doc, end_of_doc, label_seq, proba_seq in self.process_long_sequence(X, task='sequence_labeling'):
+        for position_seq, start_of_doc, end_of_doc, label_seq, proba_seq in self.process_long_sequence(X, probas=True):
             start, end = 0, None
             if start_of_doc:
                 # if this is the first chunk in a document, start accumulating from scratch