Adding Control for Annotators with One Column (#12997)

* SPARKNLP-632 Adding control for annotators with only one expected column on the Python side
* SPARKNLP-632 Adding inputAnnotatorTypes and optionalInputAnnotatorTypes in annotators for Python
* SPARKNLP-632 Removing useless prints to terminal
* SPARKNLP-632 Adding AnnotatorType initialization in Python
* SPARKNLP-632 Removing wrong imports
JohnSnowLabs · Nov 10, 2022 · 902b07d · 902b07d
1 parent 288afaf
commit 902b07d
Show file tree

Hide file tree

Showing 97 changed files with 520 additions and 10 deletions.
diff --git a/python/sparknlp/annotator/audio/wav2vec2_for_ctc.py b/python/sparknlp/annotator/audio/wav2vec2_for_ctc.py
@@ -15,6 +15,7 @@
 """Contains classes concerning Wav2Vec2ForCTC."""
 
 from sparknlp.common import *
+from sparknlp.common.annotator_type import AnnotatorType
 
 
 class Wav2Vec2ForCTC(AnnotatorModel,
@@ -86,6 +87,8 @@ class Wav2Vec2ForCTC(AnnotatorModel,
     """
     name = "Wav2Vec2ForCTC"
 
+    inputAnnotatorTypes = [AnnotatorType.AUDIO]
+
     configProtoBytes = Param(Params._dummy(),
                              "configProtoBytes",
                              "ConfigProto from tensorflow, serialized into byte array. Get with "

diff --git a/python/sparknlp/annotator/chunker.py b/python/sparknlp/annotator/chunker.py
@@ -13,6 +13,7 @@
 #  limitations under the License.
 """Contains classes for the Chunker."""
 from sparknlp.common import *
+from sparknlp.common.annotator_type import AnnotatorType
 
 
 class Chunker(AnnotatorModel):
@@ -100,6 +101,7 @@ class Chunker(AnnotatorModel):
     --------
     PerceptronModel : for Part-Of-Speech tagging
     """
+    inputAnnotatorTypes = [AnnotatorType.DOCUMENT, AnnotatorType.POS]
 
     regexParsers = Param(Params._dummy(),
                          "regexParsers",

diff --git a/python/sparknlp/annotator/classifier_dl/albert_for_question_answering.py b/python/sparknlp/annotator/classifier_dl/albert_for_question_answering.py
@@ -13,6 +13,7 @@
 #  limitations under the License.
 
 from sparknlp.common import *
+from sparknlp.common.annotator_type import AnnotatorType
 
 
 class AlbertForQuestionAnswering(AnnotatorModel,
@@ -87,6 +88,8 @@ class AlbertForQuestionAnswering(AnnotatorModel,
     """
     name = "AlbertForQuestionAnswering"
 
+    inputAnnotatorTypes = [AnnotatorType.DOCUMENT, AnnotatorType.DOCUMENT]
+
     maxSentenceLength = Param(Params._dummy(),
                               "maxSentenceLength",
                               "Max sentence length to process",

diff --git a/python/sparknlp/annotator/classifier_dl/albert_for_sequence_classification.py b/python/sparknlp/annotator/classifier_dl/albert_for_sequence_classification.py
@@ -14,6 +14,7 @@
 """Contains classes concerning AlbertForSequenceClassification."""
 
 from sparknlp.common import *
+from sparknlp.common.annotator_type import AnnotatorType
 
 
 class AlbertForSequenceClassification(AnnotatorModel,
@@ -101,6 +102,8 @@ class AlbertForSequenceClassification(AnnotatorModel,
     """
     name = "AlbertForSequenceClassification"
 
+    inputAnnotatorTypes = [AnnotatorType.DOCUMENT, AnnotatorType.TOKEN]
+
     maxSentenceLength = Param(Params._dummy(),
                               "maxSentenceLength",
                               "Max sentence length to process",

diff --git a/python/sparknlp/annotator/classifier_dl/albert_for_token_classification.py b/python/sparknlp/annotator/classifier_dl/albert_for_token_classification.py
@@ -14,6 +14,7 @@
 """Contains classes for AlbertForTokenClassification."""
 
 from sparknlp.common import *
+from sparknlp.common.annotator_type import AnnotatorType
 
 
 class AlbertForTokenClassification(AnnotatorModel,
@@ -96,6 +97,8 @@ class AlbertForTokenClassification(AnnotatorModel,
 
     name = "AlbertForTokenClassification"
 
+    inputAnnotatorTypes = [AnnotatorType.DOCUMENT, AnnotatorType.TOKEN]
+
     maxSentenceLength = Param(Params._dummy(),
                               "maxSentenceLength",
                               "Max sentence length to process",

diff --git a/python/sparknlp/annotator/classifier_dl/bert_for_question_answering.py b/python/sparknlp/annotator/classifier_dl/bert_for_question_answering.py
@@ -13,6 +13,7 @@
 #  limitations under the License.
 
 from sparknlp.common import *
+from sparknlp.common.annotator_type import AnnotatorType
 
 
 class BertForQuestionAnswering(AnnotatorModel,
@@ -87,6 +88,8 @@ class BertForQuestionAnswering(AnnotatorModel,
     """
     name = "BertForQuestionAnswering"
 
+    inputAnnotatorTypes = [AnnotatorType.DOCUMENT, AnnotatorType.DOCUMENT]
+
     maxSentenceLength = Param(Params._dummy(),
                               "maxSentenceLength",
                               "Max sentence length to process",

diff --git a/python/sparknlp/annotator/classifier_dl/bert_for_sequence_classification.py b/python/sparknlp/annotator/classifier_dl/bert_for_sequence_classification.py
@@ -14,6 +14,7 @@
 """Contains classes for BertForSequenceClassification."""
 
 from sparknlp.common import *
+from sparknlp.common.annotator_type import AnnotatorType
 
 
 class BertForSequenceClassification(AnnotatorModel,
@@ -101,6 +102,8 @@ class per document by averaging probabilities in all sentences, by
     """
     name = "BertForSequenceClassification"
 
+    inputAnnotatorTypes = [AnnotatorType.DOCUMENT, AnnotatorType.TOKEN]
+
     maxSentenceLength = Param(Params._dummy(),
                               "maxSentenceLength",
                               "Max sentence length to process",

diff --git a/python/sparknlp/annotator/classifier_dl/bert_for_token_classification.py b/python/sparknlp/annotator/classifier_dl/bert_for_token_classification.py
@@ -14,6 +14,7 @@
 """Contains classes for BertForTokenClassification."""
 
 from sparknlp.common import *
+from sparknlp.common.annotator_type import AnnotatorType
 
 
 class BertForTokenClassification(AnnotatorModel,
@@ -94,6 +95,8 @@ class BertForTokenClassification(AnnotatorModel,
     """
     name = "BertForTokenClassification"
 
+    inputAnnotatorTypes = [AnnotatorType.DOCUMENT, AnnotatorType.TOKEN]
+
     maxSentenceLength = Param(Params._dummy(),
                               "maxSentenceLength",
                               "Max sentence length to process",

diff --git a/python/sparknlp/annotator/classifier_dl/camembert_for_token_classification.py b/python/sparknlp/annotator/classifier_dl/camembert_for_token_classification.py
@@ -12,7 +12,6 @@
 #  See the License for the specific language governing permissions and
 #  limitations under the License.
 """Contains classes for CamemBertForTokenClassification."""
-
 from sparknlp.common import *
 
 
@@ -91,6 +90,8 @@ class CamemBertForTokenClassification(AnnotatorModel,
     """
     name = "CamemBertForTokenClassification"
 
+    inputAnnotatorTypes = [AnnotatorType.DOCUMENT, AnnotatorType.TOKEN]
+
     maxSentenceLength = Param(Params._dummy(),
                               "maxSentenceLength",
                               "Max sentence length to process",

diff --git a/python/sparknlp/annotator/classifier_dl/classifier_dl.py b/python/sparknlp/annotator/classifier_dl/classifier_dl.py
@@ -16,6 +16,7 @@
 from sparknlp.annotator.param import EvaluationDLParams, ClassifierEncoder
 from sparknlp.base import DocumentAssembler
 from sparknlp.common import *
+from sparknlp.common.annotator_type import AnnotatorType
 
 
 class ClassifierDLApproach(AnnotatorApproach, EvaluationDLParams, ClassifierEncoder):
@@ -114,6 +115,7 @@ class ClassifierDLApproach(AnnotatorApproach, EvaluationDLParams, ClassifierEnco
     MultiClassifierDLApproach : for multi-class classification
     SentimentDLApproach : for sentiment analysis
     """
+    inputAnnotatorTypes = [AnnotatorType.SENTENCE_EMBEDDINGS]
 
     dropout = Param(Params._dummy(), "dropout", "Dropout coefficient", TypeConverters.toFloat)
 
@@ -235,6 +237,8 @@ class ClassifierDLModel(AnnotatorModel, HasStorageRef, HasEngine):
 
     name = "ClassifierDLModel"
 
+    inputAnnotatorTypes = [AnnotatorType.SENTENCE_EMBEDDINGS]
+
     def __init__(self, classname="com.johnsnowlabs.nlp.annotators.classifier.dl.ClassifierDLModel", java_model=None):
         super(ClassifierDLModel, self).__init__(
             classname=classname,

diff --git a/python/sparknlp/annotator/classifier_dl/deberta_for_question_answering.py b/python/sparknlp/annotator/classifier_dl/deberta_for_question_answering.py
@@ -13,6 +13,7 @@
 #  limitations under the License.
 
 from sparknlp.common import *
+from sparknlp.common.annotator_type import AnnotatorType
 
 
 class DeBertaForQuestionAnswering(AnnotatorModel,
@@ -87,6 +88,8 @@ class DeBertaForQuestionAnswering(AnnotatorModel,
     """
     name = "DeBertaForQuestionAnswering"
 
+    inputAnnotatorTypes = [AnnotatorType.DOCUMENT, AnnotatorType.DOCUMENT]
+
     maxSentenceLength = Param(Params._dummy(),
                               "maxSentenceLength",
                               "Max sentence length to process",

diff --git a/python/sparknlp/annotator/classifier_dl/deberta_for_sequence_classification.py b/python/sparknlp/annotator/classifier_dl/deberta_for_sequence_classification.py
@@ -12,7 +12,6 @@
 #  See the License for the specific language governing permissions and
 #  limitations under the License.
 """Contains classes for DeBertaForSequenceClassification."""
-
 from sparknlp.common import *
 
 
@@ -98,6 +97,8 @@ class DeBertaForSequenceClassification(AnnotatorModel,
     """
     name = "DeBertaForSequenceClassification"
 
+    inputAnnotatorTypes = [AnnotatorType.DOCUMENT, AnnotatorType.TOKEN]
+
     maxSentenceLength = Param(Params._dummy(),
                               "maxSentenceLength",
                               "Max sentence length to process",

diff --git a/python/sparknlp/annotator/classifier_dl/deberta_for_token_classification.py b/python/sparknlp/annotator/classifier_dl/deberta_for_token_classification.py
@@ -14,6 +14,7 @@
 """Contains classes for DeBertaForTokenClassification."""
 
 from sparknlp.common import *
+from sparknlp.common.annotator_type import AnnotatorType
 
 
 class DeBertaForTokenClassification(AnnotatorModel,
@@ -94,6 +95,8 @@ class DeBertaForTokenClassification(AnnotatorModel,
     """
     name = "DeBertaForTokenClassification"
 
+    inputAnnotatorTypes = [AnnotatorType.DOCUMENT, AnnotatorType.TOKEN]
+
     maxSentenceLength = Param(Params._dummy(),
                               "maxSentenceLength",
                               "Max sentence length to process",

diff --git a/python/sparknlp/annotator/classifier_dl/distil_bert_for_question_answering.py b/python/sparknlp/annotator/classifier_dl/distil_bert_for_question_answering.py
@@ -13,6 +13,7 @@
 #  limitations under the License.
 
 from sparknlp.common import *
+from sparknlp.common.annotator_type import AnnotatorType
 
 
 class DistilBertForQuestionAnswering(AnnotatorModel,
@@ -87,6 +88,8 @@ class DistilBertForQuestionAnswering(AnnotatorModel,
     """
     name = "DistilBertForQuestionAnswering"
 
+    inputAnnotatorTypes = [AnnotatorType.DOCUMENT, AnnotatorType.DOCUMENT]
+
     maxSentenceLength = Param(Params._dummy(),
                               "maxSentenceLength",
                               "Max sentence length to process",

diff --git a/python/sparknlp/annotator/classifier_dl/distil_bert_for_sequence_classification.py b/python/sparknlp/annotator/classifier_dl/distil_bert_for_sequence_classification.py
@@ -14,6 +14,7 @@
 """Contains classes for DistilBertForSequenceClassification."""
 
 from sparknlp.common import *
+from sparknlp.common.annotator_type import AnnotatorType
 
 
 class DistilBertForSequenceClassification(AnnotatorModel,
@@ -101,6 +102,8 @@ class DistilBertForSequenceClassification(AnnotatorModel,
     """
     name = "DistilBertForSequenceClassification"
 
+    inputAnnotatorTypes = [AnnotatorType.DOCUMENT, AnnotatorType.TOKEN]
+
     maxSentenceLength = Param(Params._dummy(),
                               "maxSentenceLength",
                               "Max sentence length to process",

diff --git a/python/sparknlp/annotator/classifier_dl/distil_bert_for_token_classification.py b/python/sparknlp/annotator/classifier_dl/distil_bert_for_token_classification.py
@@ -14,6 +14,7 @@
 """Contains classes for DistilBertForTokenClassification."""
 
 from sparknlp.common import *
+from sparknlp.common.annotator_type import AnnotatorType
 
 
 class DistilBertForTokenClassification(AnnotatorModel,
@@ -92,6 +93,8 @@ class DistilBertForTokenClassification(AnnotatorModel,
     """
     name = "DistilBertForTokenClassification"
 
+    inputAnnotatorTypes = [AnnotatorType.DOCUMENT, AnnotatorType.TOKEN]
+
     maxSentenceLength = Param(Params._dummy(),
                               "maxSentenceLength",
                               "Max sentence length to process",

diff --git a/python/sparknlp/annotator/classifier_dl/longformer_for_question_answering.py b/python/sparknlp/annotator/classifier_dl/longformer_for_question_answering.py
@@ -13,6 +13,7 @@
 #  limitations under the License.
 
 from sparknlp.common import *
+from sparknlp.common.annotator_type import AnnotatorType
 
 
 class LongformerForQuestionAnswering(AnnotatorModel,
@@ -87,6 +88,8 @@ class LongformerForQuestionAnswering(AnnotatorModel,
     """
     name = "LongformerForQuestionAnswering"
 
+    inputAnnotatorTypes = [AnnotatorType.DOCUMENT, AnnotatorType.DOCUMENT]
+
     maxSentenceLength = Param(Params._dummy(),
                               "maxSentenceLength",
                               "Max sentence length to process",

diff --git a/python/sparknlp/annotator/classifier_dl/longformer_for_sequence_classification.py b/python/sparknlp/annotator/classifier_dl/longformer_for_sequence_classification.py
@@ -14,6 +14,7 @@
 """Contains classes for LongformerForSequenceClassification."""
 
 from sparknlp.common import *
+from sparknlp.common.annotator_type import AnnotatorType
 
 
 class LongformerForSequenceClassification(AnnotatorModel,
@@ -101,6 +102,8 @@ class LongformerForSequenceClassification(AnnotatorModel,
     """
     name = "LongformerForSequenceClassification"
 
+    inputAnnotatorTypes = [AnnotatorType.DOCUMENT, AnnotatorType.TOKEN]
+
     maxSentenceLength = Param(Params._dummy(),
                               "maxSentenceLength",
                               "Max sentence length to process",

diff --git a/python/sparknlp/annotator/classifier_dl/longformer_for_token_classification.py b/python/sparknlp/annotator/classifier_dl/longformer_for_token_classification.py
@@ -14,6 +14,7 @@
 """Contains classes for LongformerForTokenClassification."""
 
 from sparknlp.common import *
+from sparknlp.common.annotator_type import AnnotatorType
 
 
 class LongformerForTokenClassification(AnnotatorModel,
@@ -93,6 +94,8 @@ class LongformerForTokenClassification(AnnotatorModel,
 
     name = "LongformerForTokenClassification"
 
+    inputAnnotatorTypes = [AnnotatorType.DOCUMENT, AnnotatorType.TOKEN]
+
     maxSentenceLength = Param(Params._dummy(),
                               "maxSentenceLength",
                               "Max sentence length to process",

diff --git a/python/sparknlp/annotator/classifier_dl/multi_classifier_dl.py b/python/sparknlp/annotator/classifier_dl/multi_classifier_dl.py
@@ -16,6 +16,7 @@
 from sparknlp.annotator.param import EvaluationDLParams, ClassifierEncoder
 from sparknlp.annotator.classifier_dl import ClassifierDLModel
 from sparknlp.common import *
+from sparknlp.common.annotator_type import AnnotatorType
 
 
 class MultiClassifierDLApproach(AnnotatorApproach, EvaluationDLParams, ClassifierEncoder):
@@ -143,6 +144,7 @@ class MultiClassifierDLApproach(AnnotatorApproach, EvaluationDLParams, Classifie
     ClassifierDLApproach : for single-class classification
     SentimentDLApproach : for sentiment analysis
     """
+    inputAnnotatorTypes = [AnnotatorType.SENTENCE_EMBEDDINGS]
 
     shufflePerEpoch = Param(Params._dummy(), "shufflePerEpoch", "whether to shuffle the training data on each Epoch",
                             TypeConverters.toBoolean)
@@ -290,6 +292,8 @@ class MultiClassifierDLModel(AnnotatorModel, HasStorageRef, HasEngine):
     """
     name = "MultiClassifierDLModel"
 
+    inputAnnotatorTypes = [AnnotatorType.SENTENCE_EMBEDDINGS]
+
     def __init__(self, classname="com.johnsnowlabs.nlp.annotators.classifier.dl.MultiClassifierDLModel",
                  java_model=None):
         super(MultiClassifierDLModel, self).__init__(

diff --git a/python/sparknlp/annotator/classifier_dl/roberta_for_question_answering.py b/python/sparknlp/annotator/classifier_dl/roberta_for_question_answering.py
@@ -13,6 +13,7 @@
 #  limitations under the License.
 
 from sparknlp.common import *
+from sparknlp.common.annotator_type import AnnotatorType
 
 
 class RoBertaForQuestionAnswering(AnnotatorModel,
@@ -87,6 +88,8 @@ class RoBertaForQuestionAnswering(AnnotatorModel,
     """
     name = "RoBertaForQuestionAnswering"
 
+    inputAnnotatorTypes = [AnnotatorType.DOCUMENT, AnnotatorType.DOCUMENT]
+
     maxSentenceLength = Param(Params._dummy(),
                               "maxSentenceLength",
                               "Max sentence length to process",

diff --git a/python/sparknlp/annotator/classifier_dl/roberta_for_sequence_classification.py b/python/sparknlp/annotator/classifier_dl/roberta_for_sequence_classification.py
@@ -14,6 +14,7 @@
 """Contains classes for RoBertaForSequenceClassification."""
 
 from sparknlp.common import *
+from sparknlp.common.annotator_type import AnnotatorType
 
 
 class RoBertaForSequenceClassification(AnnotatorModel,
@@ -101,6 +102,8 @@ class RoBertaForSequenceClassification(AnnotatorModel,
     """
     name = "RoBertaForSequenceClassification"
 
+    inputAnnotatorTypes = [AnnotatorType.DOCUMENT, AnnotatorType.TOKEN]
+
     maxSentenceLength = Param(Params._dummy(),
                               "maxSentenceLength",
                               "Max sentence length to process",