FIX: spacy tokens for deployment
madisonmay committed May 23, 2019
1 parent 858643c commit 0bd32d8
Showing 5 changed files with 86 additions and 25 deletions.
5 changes: 3 additions & 2 deletions finetune/datasets/reuters.py
@@ -83,5 +83,6 @@ def download(self):
     )
     model = SequenceLabeler(batch_size=2, val_size=0., chunk_long_sequences=True)
     model.fit(trainX, trainY)
-    predictions = model.predict(testX)
-    annotation_report(testY, predictions)
+    model.save('Reuters.jl')
+    # predictions = model.predict(testX)
+    # annotation_report(testY, predictions)
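
For context, a minimal deployment sketch of the model file saved above (illustrative only; it assumes the 'Reuters.jl' artifact written by this script and finetune's standard SequenceLabeler.load / predict API):

from finetune import SequenceLabeler

model = SequenceLabeler.load('Reuters.jl')  # reload the artifact saved by the training script above
annotations = model.predict(["Queen Elizabeth said she was pleased."])  # illustrative input
# annotations[0] is a list of labeled spans for the first document,
# each typically a dict with 'start', 'end', 'label' and 'text' fields.
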
18 changes: 10 additions & 8 deletions finetune/encoding/sequence_encoder.py
@@ -1,6 +1,7 @@
 import warnings
 
 from finetune.util.logging import truncate_text
+from finetune.encoding.input_encoder import NLP
 
 
 def assign_associations(labels, associations, none_value):
@@ -75,10 +76,10 @@ def finetune_to_indico_sequence(raw_texts, subseqs, labels, encoder=None, probs=
     loop_vals = zip(raw_texts, subseqs, labels, probs or [None] * len(raw_texts), assoc_cleaned)
     for doc_idx, (raw_text, doc_seq, label_seq, prob_seq, associations_seq) in enumerate(loop_vals):
         tokens = encoded_docs.tokens[doc_idx]
-        token_ends = encoded_docs.char_locs[doc_idx]
-        token_lengths = [encoder._token_length(token) for token in tokens]
-        token_starts = [end - length for end, length in zip(token_ends, token_lengths)]
-        n_tokens = len(tokens)
+        spacy_tokens = NLP(raw_text)
+        spacy_token_starts = [token.idx for token in spacy_tokens]
+        spacy_token_ends = [token.idx + len(token.text) for token in spacy_tokens]
+        n_spacy_tokens = len(spacy_tokens)
 
         doc_annotations = []
         annotation_ranges = set()
@@ -119,14 +120,15 @@ def finetune_to_indico_sequence(raw_texts, subseqs, labels, encoder=None, probs=
             if multi_label:
                 start_idx = 0
                 end_idx = 0
+
             if label != none_value:
                 # round to nearest token
-                while start_idx < n_tokens and annotation_start >= token_starts[start_idx]:
+                while start_idx < n_spacy_tokens and annotation_start >= spacy_token_starts[start_idx]:
                     start_idx += 1
-                annotation_start = token_starts[start_idx - 1]
-                while end_idx < (n_tokens - 1) and annotation_end > token_ends[end_idx]:
+                annotation_start = spacy_token_starts[start_idx - 1]
+                while end_idx < (n_spacy_tokens - 1) and annotation_end > spacy_token_ends[end_idx]:
                     end_idx += 1
-                annotation_end = token_ends[end_idx]
+                annotation_end = spacy_token_ends[end_idx]
 
             text = raw_text[annotation_start:annotation_end]
             if label != none_value:
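
The rounding above now snaps each predicted character span outward to spaCy token boundaries rather than GPT subtoken boundaries, so returned annotations line up with whole spaCy tokens. A standalone sketch of the same idea (illustrative only; nlp here is any spaCy pipeline rather than finetune's shared NLP object):

import spacy

nlp = spacy.blank("en")  # assumption: a bare tokenizer is enough for this illustration

def snap_to_spacy_tokens(raw_text, annotation_start, annotation_end):
    # Character-level start/end offsets of every spaCy token.
    spacy_tokens = nlp(raw_text)
    starts = [token.idx for token in spacy_tokens]
    ends = [token.idx + len(token.text) for token in spacy_tokens]
    n = len(spacy_tokens)

    # Round the annotation start down to the start of the token containing it...
    start_idx = 0
    while start_idx < n and annotation_start >= starts[start_idx]:
        start_idx += 1
    annotation_start = starts[start_idx - 1]

    # ...and the annotation end up to the end of the token containing it.
    end_idx = 0
    while end_idx < (n - 1) and annotation_end > ends[end_idx]:
        end_idx += 1
    annotation_end = ends[end_idx]

    return annotation_start, annotation_end

# snap_to_spacy_tokens("New York City", 1, 7) -> (0, 8), i.e. the span "New York"
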
86 changes: 73 additions & 13 deletions finetune/target_models/sequence_labeling.py
@@ -9,7 +9,7 @@
 from finetune.nn.target_blocks import sequence_labeler
 from finetune.nn.crf import sequence_decode
 from finetune.encoding.sequence_encoder import indico_to_finetune_sequence, finetune_to_indico_sequence
-
+from finetune.encoding.input_encoder import NLP
 from finetune.input_pipeline import BasePipeline
 
 
@@ -67,6 +67,69 @@ def _target_encoder(self):
         return SequenceLabelingEncoder()
 
 
+def _combine_and_format(subtokens, start, end, raw_text):
+    """
+    Combine predictions on many subtokens into a single token prediction.
+    Currently only valid for GPT.
+    """
+    result = {
+        'start': start,
+        'end': end
+    }
+    result['text'] = raw_text[result['start']:result['end']]
+    probabilities = {}
+    keys = subtokens[0]['probabilities'].keys()
+    for k in keys:
+        probabilities[k] = np.mean([token['probabilities'][k] for token in subtokens])
+    result['probabilities'] = probabilities
+    max_response = max(probabilities.items(), key=lambda x: x[1])
+    result['label'] = max_response[0]
+    result['confidence'] = max_response[1]
+    return result
+
+
+def _spacy_token_predictions(raw_text, tokens, probas, positions):
+    """
+    Go from GPT subtoken level predictions, to spacy token predictions
+    """
+    to_combine = []
+    spacy_attn = []
+
+    starts, ends = zip(*[(token.idx, token.idx + len(token.text)) for token in NLP(raw_text)])
+    spacy_token_idx = 0
+
+    spacy_token_starts = []
+    spacy_token_ends = []
+    spacy_results = []
+
+    for token, prob, (start, end) in zip(tokens, probas, positions):
+        to_combine.append({
+            'start': start,
+            'end': end,
+            'token': token,
+            'probabilities': prob
+        })
+
+        try:
+            end_match = ends.index(end, spacy_token_idx)
+            start, end = starts[end_match], end
+            spacy_token_idx = end_match
+        except ValueError:
+            continue
+
+        spacy_results.append(
+            _combine_and_format(
+                to_combine,
+                start=start,
+                end=end,
+                raw_text=raw_text
+            )
+        )
+        to_combine = []
+
+    return spacy_results
+
+
 class SequenceLabeler(BaseModel):
     """
     Labels each token in a sequence as belonging to 1 of N token classes.
@@ -209,18 +272,15 @@ def predict(self, X, per_token=False):
 
         if per_token:
             return [
-                [
-                    {
-                        'text': token,
-                        'label': label,
-                        'start': position[0],
-                        'end': position[1],
-                        'probabilities': proba,
-                        'confidence': max(proba.values())
-                    }
-                    for token, label, proba, position in zip(tokens, labels, probas, positions)
-                ]
-                for tokens, labels, probas, positions in zip(all_subseqs, all_labels, all_probs, all_positions)
+                _spacy_token_predictions(
+                    raw_text=raw_text,
+                    tokens=tokens,
+                    probas=probas,
+                    positions=positions
+                )
+                for raw_text, tokens, labels, probas, positions in zip(
+                    X, all_subseqs, all_labels, all_probs, all_positions
+                )
             ]
         else:
             _, doc_annotations = finetune_to_indico_sequence(
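
With this change, predict(..., per_token=True) reports one prediction per spaCy token instead of one per GPT subtoken, averaging the subtoken probabilities via _combine_and_format. A hedged usage sketch, assuming model is an already fitted SequenceLabeler (as in the Reuters script above) and illustrative label names:

docs = ["Queen Elizabeth visited Ottawa."]
per_token = model.predict(docs, per_token=True)
# per_token[0] holds one dict per spaCy token; keys follow _combine_and_format, e.g.
# {'start': 0, 'end': 5, 'text': 'Queen', 'label': 'Person',
#  'confidence': 0.92, 'probabilities': {'<PAD>': 0.08, 'Person': 0.92}}
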
1 change: 0 additions & 1 deletion requirements.txt
@@ -5,7 +5,6 @@ scipy>=1.1.0
 scikit-learn>=0.20.2
 ftfy>=4.4.0
 spacy>=2.0.0
-msgpack-numpy>=0.4.1
 pytest>=3.6.3
 h5py>=2.8.0
 joblib>=0.12.0
1 change: 0 additions & 1 deletion setup.py
@@ -14,7 +14,6 @@
         "scikit-learn>=0.18.0",
         "ftfy>=4.4.0",
         "spacy>=2.0.0",
-        "msgpack-numpy==0.4.1",
         "h5py>=2.8.0",
         "joblib>=0.12.0",
         "bs4>=0.0.1",
