FIX: spacy tokens for deployment
madisonmay committed May 23, 2019
1 parent 858643c commit 0bd32d8
Showing 5 changed files with 86 additions and 25 deletions.
5 changes: 3 additions & 2 deletions finetune/datasets/reuters.py
@@ -83,5 +83,6 @@ def download(self):
     )
     model = SequenceLabeler(batch_size=2, val_size=0., chunk_long_sequences=True)
     model.fit(trainX, trainY)
-    predictions = model.predict(testX)
-    annotation_report(testY, predictions)
+    model.save('Reuters.jl')
+    # predictions = model.predict(testX)
+    # annotation_report(testY, predictions)
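
For context, a minimal deployment sketch of the model file saved above (illustrative only; it assumes the 'Reuters.jl' artifact written by this script and finetune's standard SequenceLabeler.load / predict API):

from finetune import SequenceLabeler

model = SequenceLabeler.load('Reuters.jl')  # reload the artifact saved by the training script above
annotations = model.predict(["Queen Elizabeth said she was pleased."])  # illustrative input
# annotations[0] is a list of labeled spans for the first document,
# each typically a dict with 'start', 'end', 'label' and 'text' fields.
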
18 changes: 10 additions & 8 deletions finetune/encoding/sequence_encoder.py
@@ -1,6 +1,7 @@
 import warnings
 
 from finetune.util.logging import truncate_text
+from finetune.encoding.input_encoder import NLP
 
 
 def assign_associations(labels, associations, none_value):
@@ -75,10 +76,10 @@ def finetune_to_indico_sequence(raw_texts, subseqs, labels, encoder=None, probs=
     loop_vals = zip(raw_texts, subseqs, labels, probs or [None] * len(raw_texts), assoc_cleaned)
     for doc_idx, (raw_text, doc_seq, label_seq, prob_seq, associations_seq) in enumerate(loop_vals):
         tokens = encoded_docs.tokens[doc_idx]
-        token_ends = encoded_docs.char_locs[doc_idx]
-        token_lengths = [encoder._token_length(token) for token in tokens]
-        token_starts = [end - length for end, length in zip(token_ends, token_lengths)]
-        n_tokens = len(tokens)
+        spacy_tokens = NLP(raw_text)
+        spacy_token_starts = [token.idx for token in spacy_tokens]
+        spacy_token_ends = [token.idx + len(token.text) for token in spacy_tokens]
+        n_spacy_tokens = len(spacy_tokens)
 
         doc_annotations = []
         annotation_ranges = set()
@@ -119,14 +120,15 @@ def finetune_to_indico_sequence(raw_texts, subseqs, labels, encoder=None, probs=
             if multi_label:
                 start_idx = 0
                 end_idx = 0
+
             if label != none_value:
                 # round to nearest token
-                while start_idx < n_tokens and annotation_start >= token_starts[start_idx]:
+                while start_idx < n_spacy_tokens and annotation_start >= spacy_token_starts[start_idx]:
                     start_idx += 1
-                annotation_start = token_starts[start_idx - 1]
-                while end_idx < (n_tokens - 1) and annotation_end > token_ends[end_idx]:
+                annotation_start = spacy_token_starts[start_idx - 1]
+                while end_idx < (n_spacy_tokens - 1) and annotation_end > spacy_token_ends[end_idx]:
                     end_idx += 1
-                annotation_end = token_ends[end_idx]
+                annotation_end = spacy_token_ends[end_idx]
 
             text = raw_text[annotation_start:annotation_end]
             if label != none_value:
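
The rounding above now snaps each predicted character span outward to spaCy token boundaries rather than GPT subtoken boundaries, so returned annotations line up with whole spaCy tokens. A standalone sketch of the same idea (illustrative only; nlp here is any spaCy pipeline rather than finetune's shared NLP object):

import spacy

nlp = spacy.blank("en")  # assumption: a bare tokenizer is enough for this illustration

def snap_to_spacy_tokens(raw_text, annotation_start, annotation_end):
    # Character-level start/end offsets of every spaCy token.
    spacy_tokens = nlp(raw_text)
    starts = [token.idx for token in spacy_tokens]
    ends = [token.idx + len(token.text) for token in spacy_tokens]
    n = len(spacy_tokens)

    # Round the annotation start down to the start of the token containing it...
    start_idx = 0
    while start_idx < n and annotation_start >= starts[start_idx]:
        start_idx += 1
    annotation_start = starts[start_idx - 1]

    # ...and the annotation end up to the end of the token containing it.
    end_idx = 0
    while end_idx < (n - 1) and annotation_end > ends[end_idx]:
        end_idx += 1
    annotation_end = ends[end_idx]

    return annotation_start, annotation_end

# snap_to_spacy_tokens("New York City", 1, 7) -> (0, 8), i.e. the span "New York"
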
86 changes: 73 additions & 13 deletions finetune/target_models/sequence_labeling.py
@@ -9,7 +9,7 @@
 from finetune.nn.target_blocks import sequence_labeler
 from finetune.nn.crf import sequence_decode
 from finetune.encoding.sequence_encoder import indico_to_finetune_sequence, finetune_to_indico_sequence
-
+from finetune.encoding.input_encoder import NLP
 from finetune.input_pipeline import BasePipeline
 
 
@@ -67,6 +67,69 @@ def _target_encoder(self):
         return SequenceLabelingEncoder()
 
 
+def _combine_and_format(subtokens, start, end, raw_text):
+    """
+    Combine predictions on many subtokens into a single token prediction.
+    Currently only valid for GPT.
+    """
+    result = {
+        'start': start,
+        'end': end
+    }
+    result['text'] = raw_text[result['start']:result['end']]
+    probabilities = {}
+    keys = subtokens[0]['probabilities'].keys()
+    for k in keys:
+        probabilities[k] = np.mean([token['probabilities'][k] for token in subtokens])
+    result['probabilities'] = probabilities
+    max_response = max(probabilities.items(), key=lambda x: x[1])
+    result['label'] = max_response[0]
+    result['confidence'] = max_response[1]
+    return result
+
+
+def _spacy_token_predictions(raw_text, tokens, probas, positions):
+    """
+    Go from GPT subtoken level predictions, to spacy token predictions
+    """
+    to_combine = []
+    spacy_attn = []
+
+    starts, ends = zip(*[(token.idx, token.idx + len(token.text)) for token in NLP(raw_text)])
+    spacy_token_idx = 0
+
+    spacy_token_starts = []
+    spacy_token_ends = []
+    spacy_results = []
+
+    for token, prob, (start, end) in zip(tokens, probas, positions):
+        to_combine.append({
+            'start': start,
+            'end': end,
+            'token': token,
+            'probabilities': prob
+        })
+
+        try:
+            end_match = ends.index(end, spacy_token_idx)
+            start, end = starts[end_match], end
+            spacy_token_idx = end_match
+        except ValueError:
+            continue
+
+        spacy_results.append(
+            _combine_and_format(
+                to_combine,
+                start=start,
+                end=end,
+                raw_text=raw_text
+            )
+        )
+        to_combine = []
+
+    return spacy_results
+
+
 class SequenceLabeler(BaseModel):
     """
     Labels each token in a sequence as belonging to 1 of N token classes.
@@ -209,18 +272,15 @@ def predict(self, X, per_token=False):
 
         if per_token:
             return [
-                [
-                    {
-                        'text': token,
-                        'label': label,
-                        'start': position[0],
-                        'end': position[1],
-                        'probabilities': proba,
-                        'confidence': max(proba.values())
-                    }
-                    for token, label, proba, position in zip(tokens, labels, probas, positions)
-                ]
-                for tokens, labels, probas, positions in zip(all_subseqs, all_labels, all_probs, all_positions)
+                _spacy_token_predictions(
+                    raw_text=raw_text,
+                    tokens=tokens,
+                    probas=probas,
+                    positions=positions
+                )
+                for raw_text, tokens, labels, probas, positions in zip(
+                    X, all_subseqs, all_labels, all_probs, all_positions
+                )
             ]
         else:
             _, doc_annotations = finetune_to_indico_sequence(
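
With this change, predict(..., per_token=True) reports one prediction per spaCy token instead of one per GPT subtoken, averaging the subtoken probabilities via _combine_and_format. A hedged usage sketch, assuming model is an already fitted SequenceLabeler (as in the Reuters script above) and illustrative label names:

docs = ["Queen Elizabeth visited Ottawa."]
per_token = model.predict(docs, per_token=True)
# per_token[0] holds one dict per spaCy token; keys follow _combine_and_format, e.g.
# {'start': 0, 'end': 5, 'text': 'Queen', 'label': 'Person',
#  'confidence': 0.92, 'probabilities': {'<PAD>': 0.08, 'Person': 0.92}}
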
1 change: 0 additions & 1 deletion requirements.txt
@@ -5,7 +5,6 @@ scipy>=1.1.0
 scikit-learn>=0.20.2
 ftfy>=4.4.0
 spacy>=2.0.0
-msgpack-numpy>=0.4.1
 pytest>=3.6.3
 h5py>=2.8.0
 joblib>=0.12.0
1 change: 0 additions & 1 deletion setup.py
@@ -14,7 +14,6 @@
         "scikit-learn>=0.18.0",
         "ftfy>=4.4.0",
         "spacy>=2.0.0",
-        "msgpack-numpy==0.4.1",
         "h5py>=2.8.0",
         "joblib>=0.12.0",
         "bs4>=0.0.1",
