Skip to content

Commit

Permalink
FIX: Check current task without try/except
Browse files Browse the repository at this point in the history
  • Loading branch information
matthewbayer authored and madisonmay committed Jul 17, 2019
1 parent e9eb9d3 commit 9b2f98b
Show file tree
Hide file tree
Showing 8 changed files with 59 additions and 35 deletions.
28 changes: 14 additions & 14 deletions finetune/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
import math
from abc import ABCMeta, abstractmethod
from copy import deepcopy
from enum import Enum
import tempfile
import time
import sys
Expand Down Expand Up @@ -716,26 +717,25 @@ def finetune_grid_search_cv(cls, Xs, Y, *, n_splits, test_size, eval_fn=None, pr

return max(aggregated_results, key=lambda x: x[1])[0]

def process_long_sequence(self, X, task, probas=False):
tasks = ['sequence_labeling', 'classification', 'regression']
assert task in tasks, 'invalid task for processing long sequences'
class Task(Enum):
SEQUENCE_LABELING = 'sequence_labeling'


def process_long_sequence(self, X, probas=False):
chunk_size = self.config.max_length - 2
step_size = chunk_size // 3
if task == 'sequence_labeling':
arr_encoded = list(itertools.chain.from_iterable(self.input_pipeline._text_to_ids([x]) for x in X))
else:
arr_encoded = list(itertools.chain.from_iterable(self.input_pipeline._text_to_ids(x) for x in X))

arr_encoded = list(itertools.chain.from_iterable(self.input_pipeline._text_to_ids(x)
for x in self.input_pipeline._format_for_inference(X)))

labels, batch_probas = [], []
pred_keys = [PredictMode.NORMAL]
if task != 'regression':
if probas:
pred_keys.append(PredictMode.PROBAS)
for pred in self._inference(X, predict_keys=[PredictMode.PROBAS, PredictMode.NORMAL], n_examples=len(arr_encoded)):
try:
labels.append(self.input_pipeline.label_encoder.inverse_transform([pred[PredictMode.NORMAL]]))
except ValueError:
labels.append(self.input_pipeline.label_encoder.inverse_transform(pred[PredictMode.NORMAL]))
if task != 'regression':
labels.append(self.input_pipeline.label_encoder.inverse_transform(
pred[PredictMode.NORMAL] if hasattr(self,'multi_label') else [pred[PredictMode.NORMAL]] # only wrap in list if not sequence labeling
))
if probas:
batch_probas.append(pred[PredictMode.PROBAS])

if not batch_probas:
Expand Down
4 changes: 2 additions & 2 deletions finetune/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -226,7 +226,7 @@ def get_default_config():
"""
settings = Settings(
# General Settings
low_memory_mode=False,
low_memory_mode=True,
interpolate_pos_embed=False,
save_adam_vars=True,
shuffle_buffer_size=100,
Expand Down Expand Up @@ -302,7 +302,7 @@ def get_default_config():
subtoken_predictions=False,
multi_label_sequences=False,
multi_label_threshold=0.5,
chunk_long_sequences=False,
chunk_long_sequences=True,

# Regression Params
regression_loss="L2",
Expand Down
17 changes: 4 additions & 13 deletions finetune/target_models/classifier.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,15 +33,8 @@ class Classifier(BaseModel):
:param \**kwargs: key-value pairs of config items to override.
"""

defaults = {
"low_memory_mode": True,
"chunk_long_sequences": True
}

def __init__(self, **kwargs):
d = copy.deepcopy(Classifier.defaults)
d.update(kwargs)
super().__init__(**d)
super().__init__(**kwargs)

def _get_input_pipeline(self):
return ClassificationPipeline(self.config)
Expand All @@ -55,7 +48,7 @@ def featurize(self, X):
"""
return super().featurize(X)

def predict(self, X, probas=False):
def predict(self, X, probas=True):
"""
Produces a list of most likely class labels as determined by the fine-tuned model.
Expand All @@ -69,14 +62,12 @@ def predict(self, X, probas=False):
all_labels = []
all_probs = []

for _, start_of_doc, end_of_doc, label, proba in self.process_long_sequence(X, task='classification'):
for _, start_of_doc, end_of_doc, _, proba in self.process_long_sequence(X, probas):
start, end = 0, None
if start_of_doc:
# if this is the first chunk in a document, start accumulating from scratch
doc_labels = []
doc_probs = []

doc_labels.append(label)
doc_probs.append(proba)

if end_of_doc:
Expand All @@ -101,7 +92,7 @@ def predict_proba(self, X):
:param X: list or array of text to embed.
:returns: list of dictionaries. Each dictionary maps from a class label to its assigned class probability.
"""
return self.predict(X, probas=True)
return self.predict(X)

def finetune(self, X, Y=None, batch_size=None):
"""
Expand Down
11 changes: 11 additions & 0 deletions finetune/target_models/comparison.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,17 @@ class Comparison(Classifier):
:param \**kwargs: key-value pairs of config items to override.
"""

defaults = {
"chunk_long_sequences": False
}

def __init__(self, **kwargs):
    """
    Build a Comparison model with long-sequence chunking disabled by default.

    The class-level ``defaults`` are applied first and then overridden by any
    caller-supplied config items.

    :param kwargs: key-value pairs of config items to override.
    :raises FinetuneError: if the resulting config has ``chunk_long_sequences`` enabled.
    """
    # ``defaults`` only holds immutable values, so a shallow merge yields the
    # same settings as deep-copying and updating, without relying on ``copy``.
    settings = {**Comparison.defaults, **kwargs}
    super().__init__(**settings)
    # Checked after the parent constructor so that the final, resolved config
    # is what gets validated (not just the kwargs passed in here).
    # NOTE(review): FinetuneError must be importable in this module -- confirm.
    if self.config.chunk_long_sequences:
        raise FinetuneError("Comparison model is incompatible with chunk_long_sequences = True in config.")

def _get_input_pipeline(self):
return ComparisonPipeline(self.config)

Expand Down
11 changes: 11 additions & 0 deletions finetune/target_models/multifield.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,17 @@ class MultiFieldClassifier(Classifier):
:param \**kwargs: key-value pairs of config items to override.
"""

defaults = {
"chunk_long_sequences": False
}

def __init__(self, **kwargs):
    """
    Build a MultiFieldClassifier with long-sequence chunking disabled by default.

    The class-level ``defaults`` are applied first and then overridden by any
    caller-supplied config items.

    :param kwargs: key-value pairs of config items to override.
    :raises FinetuneError: if the resulting config has ``chunk_long_sequences`` enabled.
    """
    # The class is spelled ``MultiFieldClassifier``; the previous reference to
    # ``MultifieldClassifier`` raised NameError on every instantiation.
    # ``defaults`` only holds immutable values, so a shallow merge is sufficient.
    settings = {**MultiFieldClassifier.defaults, **kwargs}
    super().__init__(**settings)
    # Checked after the parent constructor so that the final, resolved config
    # is what gets validated (not just the kwargs passed in here).
    # NOTE(review): FinetuneError must be importable in this module -- confirm.
    if self.config.chunk_long_sequences:
        raise FinetuneError("Multifield model is incompatible with chunk_long_sequences = True in config.")

def _get_input_pipeline(self):
return MultiFieldClassificationPipeline(self.config)

Expand Down
15 changes: 14 additions & 1 deletion finetune/target_models/ordinal_regressor.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,20 @@ def predict(self, X):
:param X: list or array of text to embed.
:returns: list of class labels.
"""
return super().predict(X).tolist()
all_labels=[]
for _, start_of_doc, end_of_doc, label, _ in self.process_long_sequence(X):
if start_of_doc:
# if this is the first chunk in a document, start accumulating from scratch
doc_labels = []

doc_labels.append(label)

if end_of_doc:
# last chunk in a document
means = np.mean(doc_labels, axis=0)
label = self.input_pipeline.label_encoder.inverse_transform([means])
all_labels.append(list(label))
return all_labels

def predict_proba(self, X):
"""
Expand Down
2 changes: 1 addition & 1 deletion finetune/target_models/regressor.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,7 @@ def predict(self, X):
:returns: list of class labels.
"""
all_labels=[]
for _, start_of_doc, end_of_doc, label, _ in self.process_long_sequence(X, task='regression'):
for _, start_of_doc, end_of_doc, label, _ in self.process_long_sequence(X):
if start_of_doc:
# if this is the first chunk in a document, start accumulating from scratch
doc_labels = []
Expand Down
6 changes: 2 additions & 4 deletions finetune/target_models/sequence_labeling.py
Original file line number Diff line number Diff line change
Expand Up @@ -147,9 +147,7 @@ class SequenceLabeler(BaseModel):

defaults = {
"n_epochs": 5,
"lr_warmup": 0.1,
"low_memory_mode": True,
"chunk_long_sequences": True
"lr_warmup": 0.1
}

def __init__(self, **kwargs):
Expand Down Expand Up @@ -203,7 +201,7 @@ def predict(self, X, per_token=False):
chunk_size = self.config.max_length - 2
step_size = chunk_size // 3
doc_idx = -1
for position_seq, start_of_doc, end_of_doc, label_seq, proba_seq in self.process_long_sequence(X, task='sequence_labeling'):
for position_seq, start_of_doc, end_of_doc, label_seq, proba_seq in self.process_long_sequence(X, probas=True):
start, end = 0, None
if start_of_doc:
# if this is the first chunk in a document, start accumulating from scratch
Expand Down

0 comments on commit 9b2f98b

Please sign in to comment.