diff --git a/python/sparknlp/annotator/audio/wav2vec2_for_ctc.py b/python/sparknlp/annotator/audio/wav2vec2_for_ctc.py
index 848555745970..65ab191408d4 100644
--- a/python/sparknlp/annotator/audio/wav2vec2_for_ctc.py
+++ b/python/sparknlp/annotator/audio/wav2vec2_for_ctc.py
@@ -15,6 +15,7 @@
 """Contains classes concerning Wav2Vec2ForCTC."""
 
 from sparknlp.common import *
+from sparknlp.common.annotator_type import AnnotatorType
 
 
 class Wav2Vec2ForCTC(AnnotatorModel,
@@ -86,6 +87,8 @@ class Wav2Vec2ForCTC(AnnotatorModel,
     """
     name = "Wav2Vec2ForCTC"
 
+    inputAnnotatorTypes = [AnnotatorType.AUDIO]
+
     configProtoBytes = Param(Params._dummy(),
                              "configProtoBytes",
                              "ConfigProto from tensorflow, serialized into byte array. Get with "
diff --git a/python/sparknlp/annotator/chunker.py b/python/sparknlp/annotator/chunker.py
index 1c44a36a44cd..4dfe53ed17ec 100755
--- a/python/sparknlp/annotator/chunker.py
+++ b/python/sparknlp/annotator/chunker.py
@@ -13,6 +13,7 @@
 # limitations under the License.
 """Contains classes for the Chunker."""
 from sparknlp.common import *
+from sparknlp.common.annotator_type import AnnotatorType
 
 
 class Chunker(AnnotatorModel):
@@ -100,6 +101,7 @@ class Chunker(AnnotatorModel):
     --------
     PerceptronModel : for Part-Of-Speech tagging
     """
+    inputAnnotatorTypes = [AnnotatorType.DOCUMENT, AnnotatorType.POS]
 
     regexParsers = Param(Params._dummy(),
                          "regexParsers",
diff --git a/python/sparknlp/annotator/classifier_dl/albert_for_question_answering.py b/python/sparknlp/annotator/classifier_dl/albert_for_question_answering.py
index c611612ccf82..e2a21323f3f2 100755
--- a/python/sparknlp/annotator/classifier_dl/albert_for_question_answering.py
+++ b/python/sparknlp/annotator/classifier_dl/albert_for_question_answering.py
@@ -13,6 +13,7 @@
 # limitations under the License.
 
 from sparknlp.common import *
+from sparknlp.common.annotator_type import AnnotatorType
 
 
 class AlbertForQuestionAnswering(AnnotatorModel,
@@ -87,6 +88,8 @@ class AlbertForQuestionAnswering(AnnotatorModel,
     """
     name = "AlbertForQuestionAnswering"
 
+    inputAnnotatorTypes = [AnnotatorType.DOCUMENT, AnnotatorType.DOCUMENT]
+
     maxSentenceLength = Param(Params._dummy(),
                               "maxSentenceLength",
                               "Max sentence length to process",
diff --git a/python/sparknlp/annotator/classifier_dl/albert_for_sequence_classification.py b/python/sparknlp/annotator/classifier_dl/albert_for_sequence_classification.py
index 012fd3274045..2d79ab96e453 100755
--- a/python/sparknlp/annotator/classifier_dl/albert_for_sequence_classification.py
+++ b/python/sparknlp/annotator/classifier_dl/albert_for_sequence_classification.py
@@ -14,6 +14,7 @@
 """Contains classes concerning AlbertForSequenceClassification."""
 
 from sparknlp.common import *
+from sparknlp.common.annotator_type import AnnotatorType
 
 
 class AlbertForSequenceClassification(AnnotatorModel,
@@ -101,6 +102,8 @@ class AlbertForSequenceClassification(AnnotatorModel,
     """
     name = "AlbertForSequenceClassification"
 
+    inputAnnotatorTypes = [AnnotatorType.DOCUMENT, AnnotatorType.TOKEN]
+
     maxSentenceLength = Param(Params._dummy(),
                               "maxSentenceLength",
                               "Max sentence length to process",
diff --git a/python/sparknlp/annotator/classifier_dl/albert_for_token_classification.py b/python/sparknlp/annotator/classifier_dl/albert_for_token_classification.py
index f9670bd1cdb3..7d668a49e53a 100755
--- a/python/sparknlp/annotator/classifier_dl/albert_for_token_classification.py
+++ b/python/sparknlp/annotator/classifier_dl/albert_for_token_classification.py
@@ -14,6 +14,7 @@
 """Contains classes for AlbertForTokenClassification."""
 
 from sparknlp.common import *
+from sparknlp.common.annotator_type import AnnotatorType
 
 
 class AlbertForTokenClassification(AnnotatorModel,
@@ -96,6 +97,8 @@ class AlbertForTokenClassification(AnnotatorModel,
 
     name = "AlbertForTokenClassification"
 
+    inputAnnotatorTypes = [AnnotatorType.DOCUMENT, AnnotatorType.TOKEN]
+
     maxSentenceLength = Param(Params._dummy(),
                               "maxSentenceLength",
                               "Max sentence length to process",
diff --git a/python/sparknlp/annotator/classifier_dl/bert_for_question_answering.py b/python/sparknlp/annotator/classifier_dl/bert_for_question_answering.py
index 0eeb5d135eb6..0a770b2fde5a 100755
--- a/python/sparknlp/annotator/classifier_dl/bert_for_question_answering.py
+++ b/python/sparknlp/annotator/classifier_dl/bert_for_question_answering.py
@@ -13,6 +13,7 @@
 # limitations under the License.
 
 from sparknlp.common import *
+from sparknlp.common.annotator_type import AnnotatorType
 
 
 class BertForQuestionAnswering(AnnotatorModel,
@@ -87,6 +88,8 @@ class BertForQuestionAnswering(AnnotatorModel,
     """
     name = "BertForQuestionAnswering"
 
+    inputAnnotatorTypes = [AnnotatorType.DOCUMENT, AnnotatorType.DOCUMENT]
+
     maxSentenceLength = Param(Params._dummy(),
                               "maxSentenceLength",
                               "Max sentence length to process",
diff --git a/python/sparknlp/annotator/classifier_dl/bert_for_sequence_classification.py b/python/sparknlp/annotator/classifier_dl/bert_for_sequence_classification.py
index 7e16107927ea..cc8443a6dffa 100755
--- a/python/sparknlp/annotator/classifier_dl/bert_for_sequence_classification.py
+++ b/python/sparknlp/annotator/classifier_dl/bert_for_sequence_classification.py
@@ -14,6 +14,7 @@
 """Contains classes for BertForSequenceClassification."""
 
 from sparknlp.common import *
+from sparknlp.common.annotator_type import AnnotatorType
 
 
 class BertForSequenceClassification(AnnotatorModel,
@@ -101,6 +102,8 @@ class per document by averaging probabilities in all sentences, by
     """
     name = "BertForSequenceClassification"
 
+    inputAnnotatorTypes = [AnnotatorType.DOCUMENT, AnnotatorType.TOKEN]
+
     maxSentenceLength = Param(Params._dummy(),
                               "maxSentenceLength",
                               "Max sentence length to process",
diff --git a/python/sparknlp/annotator/classifier_dl/bert_for_token_classification.py b/python/sparknlp/annotator/classifier_dl/bert_for_token_classification.py
index f31f18dc933d..18f91c9c13e7 100755
--- a/python/sparknlp/annotator/classifier_dl/bert_for_token_classification.py
+++ b/python/sparknlp/annotator/classifier_dl/bert_for_token_classification.py
@@ -14,6 +14,7 @@
 """Contains classes for BertForTokenClassification."""
 
 from sparknlp.common import *
+from sparknlp.common.annotator_type import AnnotatorType
 
 
 class BertForTokenClassification(AnnotatorModel,
@@ -94,6 +95,8 @@ class BertForTokenClassification(AnnotatorModel,
     """
     name = "BertForTokenClassification"
 
+    inputAnnotatorTypes = [AnnotatorType.DOCUMENT, AnnotatorType.TOKEN]
+
     maxSentenceLength = Param(Params._dummy(),
                               "maxSentenceLength",
                               "Max sentence length to process",
diff --git a/python/sparknlp/annotator/classifier_dl/camembert_for_token_classification.py b/python/sparknlp/annotator/classifier_dl/camembert_for_token_classification.py
index c753ad4ef9c8..4115e05a96e8 100755
--- a/python/sparknlp/annotator/classifier_dl/camembert_for_token_classification.py
+++ b/python/sparknlp/annotator/classifier_dl/camembert_for_token_classification.py
@@ -12,7 +12,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
"""Contains classes for CamemBertForTokenClassification.""" - from sparknlp.common import * @@ -91,6 +90,8 @@ class CamemBertForTokenClassification(AnnotatorModel, """ name = "CamemBertForTokenClassification" + inputAnnotatorTypes = [AnnotatorType.DOCUMENT, AnnotatorType.TOKEN] + maxSentenceLength = Param(Params._dummy(), "maxSentenceLength", "Max sentence length to process", diff --git a/python/sparknlp/annotator/classifier_dl/classifier_dl.py b/python/sparknlp/annotator/classifier_dl/classifier_dl.py index 3566458867dc..53fde0609eee 100755 --- a/python/sparknlp/annotator/classifier_dl/classifier_dl.py +++ b/python/sparknlp/annotator/classifier_dl/classifier_dl.py @@ -16,6 +16,7 @@ from sparknlp.annotator.param import EvaluationDLParams, ClassifierEncoder from sparknlp.base import DocumentAssembler from sparknlp.common import * +from sparknlp.common.annotator_type import AnnotatorType class ClassifierDLApproach(AnnotatorApproach, EvaluationDLParams, ClassifierEncoder): @@ -114,6 +115,7 @@ class ClassifierDLApproach(AnnotatorApproach, EvaluationDLParams, ClassifierEnco MultiClassifierDLApproach : for multi-class classification SentimentDLApproach : for sentiment analysis """ + inputAnnotatorTypes = [AnnotatorType.SENTENCE_EMBEDDINGS] dropout = Param(Params._dummy(), "dropout", "Dropout coefficient", TypeConverters.toFloat) @@ -235,6 +237,8 @@ class ClassifierDLModel(AnnotatorModel, HasStorageRef, HasEngine): name = "ClassifierDLModel" + inputAnnotatorTypes = [AnnotatorType.SENTENCE_EMBEDDINGS] + def __init__(self, classname="com.johnsnowlabs.nlp.annotators.classifier.dl.ClassifierDLModel", java_model=None): super(ClassifierDLModel, self).__init__( classname=classname, diff --git a/python/sparknlp/annotator/classifier_dl/deberta_for_question_answering.py b/python/sparknlp/annotator/classifier_dl/deberta_for_question_answering.py index fcd4d2d83a40..805a41ba2c9f 100755 --- a/python/sparknlp/annotator/classifier_dl/deberta_for_question_answering.py +++ b/python/sparknlp/annotator/classifier_dl/deberta_for_question_answering.py @@ -13,6 +13,7 @@ # limitations under the License. from sparknlp.common import * +from sparknlp.common.annotator_type import AnnotatorType class DeBertaForQuestionAnswering(AnnotatorModel, @@ -87,6 +88,8 @@ class DeBertaForQuestionAnswering(AnnotatorModel, """ name = "DeBertaForQuestionAnswering" + inputAnnotatorTypes = [AnnotatorType.DOCUMENT, AnnotatorType.DOCUMENT] + maxSentenceLength = Param(Params._dummy(), "maxSentenceLength", "Max sentence length to process", diff --git a/python/sparknlp/annotator/classifier_dl/deberta_for_sequence_classification.py b/python/sparknlp/annotator/classifier_dl/deberta_for_sequence_classification.py index 358559293f6f..361de3536d1d 100755 --- a/python/sparknlp/annotator/classifier_dl/deberta_for_sequence_classification.py +++ b/python/sparknlp/annotator/classifier_dl/deberta_for_sequence_classification.py @@ -12,7 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. 
"""Contains classes for DeBertaForSequenceClassification.""" - from sparknlp.common import * @@ -98,6 +97,8 @@ class DeBertaForSequenceClassification(AnnotatorModel, """ name = "DeBertaForSequenceClassification" + inputAnnotatorTypes = [AnnotatorType.DOCUMENT, AnnotatorType.TOKEN] + maxSentenceLength = Param(Params._dummy(), "maxSentenceLength", "Max sentence length to process", diff --git a/python/sparknlp/annotator/classifier_dl/deberta_for_token_classification.py b/python/sparknlp/annotator/classifier_dl/deberta_for_token_classification.py index 3252b77554d3..65ca19b23531 100755 --- a/python/sparknlp/annotator/classifier_dl/deberta_for_token_classification.py +++ b/python/sparknlp/annotator/classifier_dl/deberta_for_token_classification.py @@ -14,6 +14,7 @@ """Contains classes for DeBertaForTokenClassification.""" from sparknlp.common import * +from sparknlp.common.annotator_type import AnnotatorType class DeBertaForTokenClassification(AnnotatorModel, @@ -94,6 +95,8 @@ class DeBertaForTokenClassification(AnnotatorModel, """ name = "DeBertaForTokenClassification" + inputAnnotatorTypes = [AnnotatorType.DOCUMENT, AnnotatorType.TOKEN] + maxSentenceLength = Param(Params._dummy(), "maxSentenceLength", "Max sentence length to process", diff --git a/python/sparknlp/annotator/classifier_dl/distil_bert_for_question_answering.py b/python/sparknlp/annotator/classifier_dl/distil_bert_for_question_answering.py index 3f21a233d326..92aeea159626 100755 --- a/python/sparknlp/annotator/classifier_dl/distil_bert_for_question_answering.py +++ b/python/sparknlp/annotator/classifier_dl/distil_bert_for_question_answering.py @@ -13,6 +13,7 @@ # limitations under the License. from sparknlp.common import * +from sparknlp.common.annotator_type import AnnotatorType class DistilBertForQuestionAnswering(AnnotatorModel, @@ -87,6 +88,8 @@ class DistilBertForQuestionAnswering(AnnotatorModel, """ name = "DistilBertForQuestionAnswering" + inputAnnotatorTypes = [AnnotatorType.DOCUMENT, AnnotatorType.DOCUMENT] + maxSentenceLength = Param(Params._dummy(), "maxSentenceLength", "Max sentence length to process", diff --git a/python/sparknlp/annotator/classifier_dl/distil_bert_for_sequence_classification.py b/python/sparknlp/annotator/classifier_dl/distil_bert_for_sequence_classification.py index 1ef2d685c73e..08293dbb81b6 100755 --- a/python/sparknlp/annotator/classifier_dl/distil_bert_for_sequence_classification.py +++ b/python/sparknlp/annotator/classifier_dl/distil_bert_for_sequence_classification.py @@ -14,6 +14,7 @@ """Contains classes for DistilBertForSequenceClassification.""" from sparknlp.common import * +from sparknlp.common.annotator_type import AnnotatorType class DistilBertForSequenceClassification(AnnotatorModel, @@ -101,6 +102,8 @@ class DistilBertForSequenceClassification(AnnotatorModel, """ name = "DistilBertForSequenceClassification" + inputAnnotatorTypes = [AnnotatorType.DOCUMENT, AnnotatorType.TOKEN] + maxSentenceLength = Param(Params._dummy(), "maxSentenceLength", "Max sentence length to process", diff --git a/python/sparknlp/annotator/classifier_dl/distil_bert_for_token_classification.py b/python/sparknlp/annotator/classifier_dl/distil_bert_for_token_classification.py index 02b2834a6c2d..6e07553dc164 100755 --- a/python/sparknlp/annotator/classifier_dl/distil_bert_for_token_classification.py +++ b/python/sparknlp/annotator/classifier_dl/distil_bert_for_token_classification.py @@ -14,6 +14,7 @@ """Contains classes for DistilBertForTokenClassification.""" from sparknlp.common import * +from 
sparknlp.common.annotator_type import AnnotatorType class DistilBertForTokenClassification(AnnotatorModel, @@ -92,6 +93,8 @@ class DistilBertForTokenClassification(AnnotatorModel, """ name = "DistilBertForTokenClassification" + inputAnnotatorTypes = [AnnotatorType.DOCUMENT, AnnotatorType.TOKEN] + maxSentenceLength = Param(Params._dummy(), "maxSentenceLength", "Max sentence length to process", diff --git a/python/sparknlp/annotator/classifier_dl/longformer_for_question_answering.py b/python/sparknlp/annotator/classifier_dl/longformer_for_question_answering.py index 800440530450..570d1dea1b8a 100755 --- a/python/sparknlp/annotator/classifier_dl/longformer_for_question_answering.py +++ b/python/sparknlp/annotator/classifier_dl/longformer_for_question_answering.py @@ -13,6 +13,7 @@ # limitations under the License. from sparknlp.common import * +from sparknlp.common.annotator_type import AnnotatorType class LongformerForQuestionAnswering(AnnotatorModel, @@ -87,6 +88,8 @@ class LongformerForQuestionAnswering(AnnotatorModel, """ name = "LongformerForQuestionAnswering" + inputAnnotatorTypes = [AnnotatorType.DOCUMENT, AnnotatorType.DOCUMENT] + maxSentenceLength = Param(Params._dummy(), "maxSentenceLength", "Max sentence length to process", diff --git a/python/sparknlp/annotator/classifier_dl/longformer_for_sequence_classification.py b/python/sparknlp/annotator/classifier_dl/longformer_for_sequence_classification.py index af3fd1a50490..e58f4c7862aa 100755 --- a/python/sparknlp/annotator/classifier_dl/longformer_for_sequence_classification.py +++ b/python/sparknlp/annotator/classifier_dl/longformer_for_sequence_classification.py @@ -14,6 +14,7 @@ """Contains classes for LongformerForSequenceClassification.""" from sparknlp.common import * +from sparknlp.common.annotator_type import AnnotatorType class LongformerForSequenceClassification(AnnotatorModel, @@ -101,6 +102,8 @@ class LongformerForSequenceClassification(AnnotatorModel, """ name = "LongformerForSequenceClassification" + inputAnnotatorTypes = [AnnotatorType.DOCUMENT, AnnotatorType.TOKEN] + maxSentenceLength = Param(Params._dummy(), "maxSentenceLength", "Max sentence length to process", diff --git a/python/sparknlp/annotator/classifier_dl/longformer_for_token_classification.py b/python/sparknlp/annotator/classifier_dl/longformer_for_token_classification.py index d0ec68bc4588..0f6812613861 100755 --- a/python/sparknlp/annotator/classifier_dl/longformer_for_token_classification.py +++ b/python/sparknlp/annotator/classifier_dl/longformer_for_token_classification.py @@ -14,6 +14,7 @@ """Contains classes for LongformerForTokenClassification.""" from sparknlp.common import * +from sparknlp.common.annotator_type import AnnotatorType class LongformerForTokenClassification(AnnotatorModel, @@ -93,6 +94,8 @@ class LongformerForTokenClassification(AnnotatorModel, name = "LongformerForTokenClassification" + inputAnnotatorTypes = [AnnotatorType.DOCUMENT, AnnotatorType.TOKEN] + maxSentenceLength = Param(Params._dummy(), "maxSentenceLength", "Max sentence length to process", diff --git a/python/sparknlp/annotator/classifier_dl/multi_classifier_dl.py b/python/sparknlp/annotator/classifier_dl/multi_classifier_dl.py index 0239f0825fe6..96fa2718cad1 100755 --- a/python/sparknlp/annotator/classifier_dl/multi_classifier_dl.py +++ b/python/sparknlp/annotator/classifier_dl/multi_classifier_dl.py @@ -16,6 +16,7 @@ from sparknlp.annotator.param import EvaluationDLParams, ClassifierEncoder from sparknlp.annotator.classifier_dl import ClassifierDLModel from 
sparknlp.common import * +from sparknlp.common.annotator_type import AnnotatorType class MultiClassifierDLApproach(AnnotatorApproach, EvaluationDLParams, ClassifierEncoder): @@ -143,6 +144,7 @@ class MultiClassifierDLApproach(AnnotatorApproach, EvaluationDLParams, Classifie ClassifierDLApproach : for single-class classification SentimentDLApproach : for sentiment analysis """ + inputAnnotatorTypes = [AnnotatorType.SENTENCE_EMBEDDINGS] shufflePerEpoch = Param(Params._dummy(), "shufflePerEpoch", "whether to shuffle the training data on each Epoch", TypeConverters.toBoolean) @@ -290,6 +292,8 @@ class MultiClassifierDLModel(AnnotatorModel, HasStorageRef, HasEngine): """ name = "MultiClassifierDLModel" + inputAnnotatorTypes = [AnnotatorType.SENTENCE_EMBEDDINGS] + def __init__(self, classname="com.johnsnowlabs.nlp.annotators.classifier.dl.MultiClassifierDLModel", java_model=None): super(MultiClassifierDLModel, self).__init__( diff --git a/python/sparknlp/annotator/classifier_dl/roberta_for_question_answering.py b/python/sparknlp/annotator/classifier_dl/roberta_for_question_answering.py index bd2a2dd3d824..7afc5bdc8381 100755 --- a/python/sparknlp/annotator/classifier_dl/roberta_for_question_answering.py +++ b/python/sparknlp/annotator/classifier_dl/roberta_for_question_answering.py @@ -13,6 +13,7 @@ # limitations under the License. from sparknlp.common import * +from sparknlp.common.annotator_type import AnnotatorType class RoBertaForQuestionAnswering(AnnotatorModel, @@ -87,6 +88,8 @@ class RoBertaForQuestionAnswering(AnnotatorModel, """ name = "RoBertaForQuestionAnswering" + inputAnnotatorTypes = [AnnotatorType.DOCUMENT, AnnotatorType.DOCUMENT] + maxSentenceLength = Param(Params._dummy(), "maxSentenceLength", "Max sentence length to process", diff --git a/python/sparknlp/annotator/classifier_dl/roberta_for_sequence_classification.py b/python/sparknlp/annotator/classifier_dl/roberta_for_sequence_classification.py index d40ed0f92351..04473961744b 100755 --- a/python/sparknlp/annotator/classifier_dl/roberta_for_sequence_classification.py +++ b/python/sparknlp/annotator/classifier_dl/roberta_for_sequence_classification.py @@ -14,6 +14,7 @@ """Contains classes for RoBertaForSequenceClassification.""" from sparknlp.common import * +from sparknlp.common.annotator_type import AnnotatorType class RoBertaForSequenceClassification(AnnotatorModel, @@ -101,6 +102,8 @@ class RoBertaForSequenceClassification(AnnotatorModel, """ name = "RoBertaForSequenceClassification" + inputAnnotatorTypes = [AnnotatorType.DOCUMENT, AnnotatorType.TOKEN] + maxSentenceLength = Param(Params._dummy(), "maxSentenceLength", "Max sentence length to process", diff --git a/python/sparknlp/annotator/classifier_dl/roberta_for_token_classification.py b/python/sparknlp/annotator/classifier_dl/roberta_for_token_classification.py index 5a17dece9661..9b57f80fba74 100755 --- a/python/sparknlp/annotator/classifier_dl/roberta_for_token_classification.py +++ b/python/sparknlp/annotator/classifier_dl/roberta_for_token_classification.py @@ -14,6 +14,7 @@ """Contains classes for RoBertaForTokenClassification.""" from sparknlp.common import * +from sparknlp.common.annotator_type import AnnotatorType class RoBertaForTokenClassification(AnnotatorModel, @@ -92,6 +93,8 @@ class RoBertaForTokenClassification(AnnotatorModel, """ name = "RoBertaForTokenClassification" + inputAnnotatorTypes = [AnnotatorType.DOCUMENT, AnnotatorType.TOKEN] + maxSentenceLength = Param(Params._dummy(), "maxSentenceLength", "Max sentence length to process", diff --git 
a/python/sparknlp/annotator/classifier_dl/sentiment_dl.py b/python/sparknlp/annotator/classifier_dl/sentiment_dl.py index fa52481d7678..4c9e8f3225d1 100755 --- a/python/sparknlp/annotator/classifier_dl/sentiment_dl.py +++ b/python/sparknlp/annotator/classifier_dl/sentiment_dl.py @@ -15,6 +15,7 @@ from sparknlp.annotator.param import EvaluationDLParams, ClassifierEncoder from sparknlp.common import * +from sparknlp.common.annotator_type import AnnotatorType class SentimentDLApproach(AnnotatorApproach, EvaluationDLParams, ClassifierEncoder): @@ -114,6 +115,8 @@ class SentimentDLApproach(AnnotatorApproach, EvaluationDLParams, ClassifierEncod >>> pipelineModel = pipeline.fit(smallCorpus) """ + inputAnnotatorTypes = [AnnotatorType.SENTENCE_EMBEDDINGS] + dropout = Param(Params._dummy(), "dropout", "Dropout coefficient", TypeConverters.toFloat) threshold = Param(Params._dummy(), "threshold", @@ -258,6 +261,8 @@ class SentimentDLModel(AnnotatorModel, HasStorageRef, HasEngine): """ name = "SentimentDLModel" + inputAnnotatorTypes = [AnnotatorType.SENTENCE_EMBEDDINGS] + def __init__(self, classname="com.johnsnowlabs.nlp.annotators.classifier.dl.SentimentDLModel", java_model=None): super(SentimentDLModel, self).__init__( classname=classname, diff --git a/python/sparknlp/annotator/classifier_dl/tapas_for_question_answering.py b/python/sparknlp/annotator/classifier_dl/tapas_for_question_answering.py index 783de9d78531..f81f2e4ebcc8 100644 --- a/python/sparknlp/annotator/classifier_dl/tapas_for_question_answering.py +++ b/python/sparknlp/annotator/classifier_dl/tapas_for_question_answering.py @@ -14,6 +14,7 @@ from sparknlp.common import * from sparknlp.annotator.classifier_dl import BertForQuestionAnswering +from sparknlp.common.annotator_type import AnnotatorType class TapasForQuestionAnswering(BertForQuestionAnswering): @@ -111,6 +112,8 @@ class TapasForQuestionAnswering(BertForQuestionAnswering): name = "TapasForQuestionAnswering" + inputAnnotatorTypes = [AnnotatorType.TABLE, AnnotatorType.DOCUMENT] + @keyword_only def __init__(self, classname="com.johnsnowlabs.nlp.annotators.classifier.dl.TapasForQuestionAnswering", java_model=None): diff --git a/python/sparknlp/annotator/classifier_dl/xlm_roberta_for_question_answering.py b/python/sparknlp/annotator/classifier_dl/xlm_roberta_for_question_answering.py index 3bb04a2e3823..ddf16e098c9f 100755 --- a/python/sparknlp/annotator/classifier_dl/xlm_roberta_for_question_answering.py +++ b/python/sparknlp/annotator/classifier_dl/xlm_roberta_for_question_answering.py @@ -13,6 +13,7 @@ # limitations under the License. 
 
 from sparknlp.common import *
+from sparknlp.common.annotator_type import AnnotatorType
 
 
 class XlmRoBertaForQuestionAnswering(AnnotatorModel,
@@ -87,6 +88,8 @@ class XlmRoBertaForQuestionAnswering(AnnotatorModel,
     """
     name = "XlmRoBertaForQuestionAnswering"
 
+    inputAnnotatorTypes = [AnnotatorType.DOCUMENT, AnnotatorType.DOCUMENT]
+
     maxSentenceLength = Param(Params._dummy(),
                               "maxSentenceLength",
                               "Max sentence length to process",
diff --git a/python/sparknlp/annotator/classifier_dl/xlm_roberta_for_sequence_classification.py b/python/sparknlp/annotator/classifier_dl/xlm_roberta_for_sequence_classification.py
index 0b5a6befbc61..aa2bccfb92ba 100755
--- a/python/sparknlp/annotator/classifier_dl/xlm_roberta_for_sequence_classification.py
+++ b/python/sparknlp/annotator/classifier_dl/xlm_roberta_for_sequence_classification.py
@@ -14,6 +14,7 @@
 """Contains classes for XlmRoBertaForSequenceClassification."""
 
 from sparknlp.common import *
+from sparknlp.common.annotator_type import AnnotatorType
 
 
 class XlmRoBertaForSequenceClassification(AnnotatorModel,
@@ -101,6 +102,8 @@ class XlmRoBertaForSequenceClassification(AnnotatorModel,
     """
     name = "XlmRoBertaForSequenceClassification"
 
+    inputAnnotatorTypes = [AnnotatorType.DOCUMENT, AnnotatorType.TOKEN]
+
     maxSentenceLength = Param(Params._dummy(),
                               "maxSentenceLength",
                               "Max sentence length to process",
diff --git a/python/sparknlp/annotator/classifier_dl/xlm_roberta_for_token_classification.py b/python/sparknlp/annotator/classifier_dl/xlm_roberta_for_token_classification.py
index 7ce30fd9595e..1879c812e699 100755
--- a/python/sparknlp/annotator/classifier_dl/xlm_roberta_for_token_classification.py
+++ b/python/sparknlp/annotator/classifier_dl/xlm_roberta_for_token_classification.py
@@ -14,6 +14,7 @@
 """Contains classes for XlmRoBertaForTokenClassification."""
 
 from sparknlp.common import *
+from sparknlp.common.annotator_type import AnnotatorType
 
 
 class XlmRoBertaForTokenClassification(AnnotatorModel,
@@ -90,6 +91,8 @@ class XlmRoBertaForTokenClassification(AnnotatorModel,
     """
     name = "XlmRoBertaForTokenClassification"
 
+    inputAnnotatorTypes = [AnnotatorType.DOCUMENT, AnnotatorType.TOKEN]
+
     maxSentenceLength = Param(Params._dummy(),
                               "maxSentenceLength",
                               "Max sentence length to process",
diff --git a/python/sparknlp/annotator/classifier_dl/xlnet_for_sequence_classification.py b/python/sparknlp/annotator/classifier_dl/xlnet_for_sequence_classification.py
index 017920850516..975c22b61b59 100755
--- a/python/sparknlp/annotator/classifier_dl/xlnet_for_sequence_classification.py
+++ b/python/sparknlp/annotator/classifier_dl/xlnet_for_sequence_classification.py
@@ -14,6 +14,7 @@
 """Contains classes for XlnetForSequenceClassification."""
 
 from sparknlp.common import *
+from sparknlp.common.annotator_type import AnnotatorType
 
 
 class XlnetForSequenceClassification(AnnotatorModel,
@@ -101,6 +102,8 @@ class XlnetForSequenceClassification(AnnotatorModel,
     """
     name = "XlnetForSequenceClassification"
 
+    inputAnnotatorTypes = [AnnotatorType.DOCUMENT, AnnotatorType.TOKEN]
+
     maxSentenceLength = Param(Params._dummy(),
                               "maxSentenceLength",
                               "Max sentence length to process",
diff --git a/python/sparknlp/annotator/classifier_dl/xlnet_for_token_classification.py b/python/sparknlp/annotator/classifier_dl/xlnet_for_token_classification.py
index 5a8322f6b6b8..604a07f80a28 100755
--- a/python/sparknlp/annotator/classifier_dl/xlnet_for_token_classification.py
+++ b/python/sparknlp/annotator/classifier_dl/xlnet_for_token_classification.py
@@ -14,6 +14,7 @@
 """Contains classes for XlnetForTokenClassification."""
 
 from sparknlp.common import *
+from sparknlp.common.annotator_type import AnnotatorType
 
 
 class XlnetForTokenClassification(AnnotatorModel,
@@ -93,6 +94,8 @@ class XlnetForTokenClassification(AnnotatorModel,
 
     name = "XlnetForTokenClassification"
 
+    inputAnnotatorTypes = [AnnotatorType.DOCUMENT, AnnotatorType.TOKEN]
+
     maxSentenceLength = Param(Params._dummy(),
                               "maxSentenceLength",
                               "Max sentence length to process",
diff --git a/python/sparknlp/annotator/coref/spanbert_coref.py b/python/sparknlp/annotator/coref/spanbert_coref.py
index f01112ea3f26..6b65404e9c08 100644
--- a/python/sparknlp/annotator/coref/spanbert_coref.py
+++ b/python/sparknlp/annotator/coref/spanbert_coref.py
@@ -14,6 +14,7 @@
 """Contains classes for the SpanBertCorefModel."""
 
 from sparknlp.common import *
+from sparknlp.common.annotator_type import AnnotatorType
 
 
 class SpanBertCorefModel(AnnotatorModel,
@@ -110,6 +111,8 @@ class SpanBertCorefModel(AnnotatorModel,
 
     name = "SpanBertCorefModel"
 
+    inputAnnotatorTypes = [AnnotatorType.DOCUMENT, AnnotatorType.TOKEN]
+
     maxSentenceLength = Param(Params._dummy(),
                               "maxSentenceLength",
                               "Max sentence length to process",
diff --git a/python/sparknlp/annotator/cv/vit_for_image_classification.py b/python/sparknlp/annotator/cv/vit_for_image_classification.py
index 5aff2a020801..43d67d7021b3 100644
--- a/python/sparknlp/annotator/cv/vit_for_image_classification.py
+++ b/python/sparknlp/annotator/cv/vit_for_image_classification.py
@@ -15,6 +15,7 @@
 """Contains classes concerning ViTForImageClassification."""
 
 from sparknlp.common import *
+from sparknlp.common.annotator_type import AnnotatorType
 
 
 class ViTForImageClassification(AnnotatorModel,
@@ -104,6 +105,8 @@ class ViTForImageClassification(AnnotatorModel,
     """
     name = "ViTForImageClassification"
 
+    inputAnnotatorTypes = [AnnotatorType.IMAGE]
+
     configProtoBytes = Param(Params._dummy(),
                              "configProtoBytes",
                              "ConfigProto from tensorflow, serialized into byte array. Get with "
diff --git a/python/sparknlp/annotator/dependency/dependency_parser.py b/python/sparknlp/annotator/dependency/dependency_parser.py
index e92343a5a2b3..cb7f146d660b 100755
--- a/python/sparknlp/annotator/dependency/dependency_parser.py
+++ b/python/sparknlp/annotator/dependency/dependency_parser.py
@@ -14,6 +14,7 @@
 """Contains classes for the DependencyParser."""
 
 from sparknlp.common import *
+from sparknlp.common.annotator_type import AnnotatorType
 
 
 class DependencyParserApproach(AnnotatorApproach):
@@ -94,6 +95,9 @@ class DependencyParserApproach(AnnotatorApproach):
     --------
     TypedDependencyParserApproach : to extract labels for the dependencies
     """
+
+    inputAnnotatorTypes = [AnnotatorType.DOCUMENT, AnnotatorType.POS, AnnotatorType.TOKEN]
+
     dependencyTreeBank = Param(Params._dummy(),
                                "dependencyTreeBank",
                                "Dependency treebank source files",
@@ -159,6 +163,7 @@ def setConllU(self, path, read_as=ReadAs.TEXT, options={"key": "value"}):
     def _create_model(self, java_model):
         return DependencyParserModel(java_model=java_model)
 
+
 class DependencyParserModel(AnnotatorModel):
     """Unlabeled parser that finds a grammatical relation between two words in a
     sentence.
@@ -250,6 +255,8 @@ class DependencyParserModel(AnnotatorModel):
     """
     name = "DependencyParserModel"
 
+    inputAnnotatorTypes = [AnnotatorType.DOCUMENT, AnnotatorType.POS, AnnotatorType.TOKEN]
+
     perceptron = Param(Params._dummy(),
                        "perceptron",
                        "Dependency parsing perceptron features",
diff --git a/python/sparknlp/annotator/dependency/typed_dependency_parser.py b/python/sparknlp/annotator/dependency/typed_dependency_parser.py
index fa7857753010..54a67507720e 100755
--- a/python/sparknlp/annotator/dependency/typed_dependency_parser.py
+++ b/python/sparknlp/annotator/dependency/typed_dependency_parser.py
@@ -15,6 +15,7 @@
 
 from sparknlp.common import *
+from sparknlp.common.annotator_type import AnnotatorType
 
 
 class TypedDependencyParserApproach(AnnotatorApproach):
@@ -98,6 +99,9 @@ class TypedDependencyParserApproach(AnnotatorApproach):
    	>>> emptyDataSet = spark.createDataFrame([[""]]).toDF("text")
     >>> pipelineModel = pipeline.fit(emptyDataSet)
     """
+
+    inputAnnotatorTypes = [AnnotatorType.DOCUMENT, AnnotatorType.POS, AnnotatorType.DEPENDENCY]
+
     conll2009 = Param(Params._dummy(),
                       "conll2009",
                       "Path to file with CoNLL 2009 format",
@@ -166,6 +170,7 @@ def setNumberOfIterations(self, value):
     def _create_model(self, java_model):
         return TypedDependencyParserModel(java_model=java_model)
 
+
 class TypedDependencyParserModel(AnnotatorModel):
     """Labeled parser that finds a grammatical relation between two words in a
     sentence. Its input is either a CoNLL2009 or ConllU dataset.
@@ -258,6 +263,8 @@
 
     name = "TypedDependencyParserModel"
 
+    inputAnnotatorTypes = [AnnotatorType.DOCUMENT, AnnotatorType.POS, AnnotatorType.DEPENDENCY]
+
     trainOptions = Param(Params._dummy(),
                          "trainOptions",
                          "Training Options",
diff --git a/python/sparknlp/annotator/document_normalizer.py b/python/sparknlp/annotator/document_normalizer.py
index 25ee012177b6..bdfce0f47bee 100755
--- a/python/sparknlp/annotator/document_normalizer.py
+++ b/python/sparknlp/annotator/document_normalizer.py
@@ -13,6 +13,7 @@
 # limitations under the License.
 """Contains classes for the DocumentNormalizer"""
 from sparknlp.common import *
+from sparknlp.common.annotator_type import AnnotatorType
 
 
 class DocumentNormalizer(AnnotatorModel):
@@ -87,6 +88,7 @@ class DocumentNormalizer(AnnotatorModel):
    |[ the world's largest web developer site the world's largest web developer site lorem ipsum is simply dummy text of the printing and typesetting industry. lorem ipsum has been the industry's standard dummy text ever since the 1500s, when an unknown printer took a galley of type and scrambled it to make a type specimen book. it has survived not only five centuries, but also the leap into electronic typesetting, remaining essentially unchanged. it was popularised in the 1960s with the release of letraset sheets containing lorem ipsum passages, and more recently with desktop publishing software like aldus pagemaker including versions of lorem ipsum..]|
    +--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
     """
+    inputAnnotatorTypes = [AnnotatorType.DOCUMENT]
 
     action = Param(Params._dummy(),
                    "action",
diff --git a/python/sparknlp/annotator/embeddings/albert_embeddings.py b/python/sparknlp/annotator/embeddings/albert_embeddings.py
index 7875fe09ed8a..13f47b32f02b 100755
--- a/python/sparknlp/annotator/embeddings/albert_embeddings.py
+++ b/python/sparknlp/annotator/embeddings/albert_embeddings.py
@@ -14,6 +14,7 @@
 """Contains classes concerning AlbertEmbeddings."""
 
 from sparknlp.common import *
+from sparknlp.common.annotator_type import AnnotatorType
 
 
 class AlbertEmbeddings(AnnotatorModel,
@@ -154,6 +155,8 @@ class AlbertEmbeddings(AnnotatorModel,
 
     name = "AlbertEmbeddings"
 
+    inputAnnotatorTypes = [AnnotatorType.DOCUMENT, AnnotatorType.TOKEN]
+
     configProtoBytes = Param(Params._dummy(),
                              "configProtoBytes",
                              "ConfigProto from tensorflow, serialized into byte array. Get with config_proto.SerializeToString()",
diff --git a/python/sparknlp/annotator/embeddings/bert_embeddings.py b/python/sparknlp/annotator/embeddings/bert_embeddings.py
index 221cfe168888..9636ad4524bf 100755
--- a/python/sparknlp/annotator/embeddings/bert_embeddings.py
+++ b/python/sparknlp/annotator/embeddings/bert_embeddings.py
@@ -14,6 +14,7 @@
 """Contains classes for BertEmbeddings."""
 
 from sparknlp.common import *
+from sparknlp.common.annotator_type import AnnotatorType
 
 
 class BertEmbeddings(AnnotatorModel,
@@ -130,6 +131,8 @@ class BertEmbeddings(AnnotatorModel,
 
     name = "BertEmbeddings"
 
+    inputAnnotatorTypes = [AnnotatorType.DOCUMENT, AnnotatorType.TOKEN]
+
     maxSentenceLength = Param(Params._dummy(),
                               "maxSentenceLength",
                               "Max sentence length to process",
diff --git a/python/sparknlp/annotator/embeddings/bert_sentence_embeddings.py b/python/sparknlp/annotator/embeddings/bert_sentence_embeddings.py
index 7785092f42cd..d240d79f0330 100755
--- a/python/sparknlp/annotator/embeddings/bert_sentence_embeddings.py
+++ b/python/sparknlp/annotator/embeddings/bert_sentence_embeddings.py
@@ -14,6 +14,7 @@
 """Contains classes for BertSentenceEmbeddings."""
 
 from sparknlp.common import *
+from sparknlp.common.annotator_type import AnnotatorType
 
 
 class BertSentenceEmbeddings(AnnotatorModel,
@@ -129,6 +130,8 @@ class BertSentenceEmbeddings(AnnotatorModel,
 
     name = "BertSentenceEmbeddings"
 
+    inputAnnotatorTypes = [AnnotatorType.DOCUMENT]
+
     maxSentenceLength = Param(Params._dummy(),
                               "maxSentenceLength",
                               "Max sentence length to process",
diff --git a/python/sparknlp/annotator/embeddings/camembert_embeddings.py b/python/sparknlp/annotator/embeddings/camembert_embeddings.py
index 30c3244be603..986074749e99 100755
--- a/python/sparknlp/annotator/embeddings/camembert_embeddings.py
+++ b/python/sparknlp/annotator/embeddings/camembert_embeddings.py
@@ -14,6 +14,7 @@
 """Contains classes for CamemBertEmbeddings."""
 
 from sparknlp.common import *
+from sparknlp.common.annotator_type import AnnotatorType
 
 
 class CamemBertEmbeddings(AnnotatorModel,
@@ -132,6 +133,8 @@ class CamemBertEmbeddings(AnnotatorModel,
 
     name = "CamemBertEmbeddings"
 
+    inputAnnotatorTypes = [AnnotatorType.DOCUMENT, AnnotatorType.TOKEN]
+
     configProtoBytes = Param(
         Params._dummy(),
         "configProtoBytes",
diff --git a/python/sparknlp/annotator/embeddings/chunk_embeddings.py b/python/sparknlp/annotator/embeddings/chunk_embeddings.py
index df0bd3a09b11..203880127817 100755
--- a/python/sparknlp/annotator/embeddings/chunk_embeddings.py
+++ b/python/sparknlp/annotator/embeddings/chunk_embeddings.py
@@ -14,6 +14,7 @@
 """Contains classes for ChunkEmbeddings"""
 
 from sparknlp.common import *
+from sparknlp.common.annotator_type import AnnotatorType
 
 
 class ChunkEmbeddings(AnnotatorModel):
@@ -98,6 +99,8 @@ class ChunkEmbeddings(AnnotatorModel):
 
     name = "ChunkEmbeddings"
 
+    inputAnnotatorTypes = [AnnotatorType.CHUNK, AnnotatorType.WORD_EMBEDDINGS]
+
     @keyword_only
     def __init__(self):
         super(ChunkEmbeddings, self).__init__(classname="com.johnsnowlabs.nlp.embeddings.ChunkEmbeddings")
diff --git a/python/sparknlp/annotator/embeddings/deberta_embeddings.py b/python/sparknlp/annotator/embeddings/deberta_embeddings.py
index 70a8b168a639..d72328ddde18 100755
--- a/python/sparknlp/annotator/embeddings/deberta_embeddings.py
+++ b/python/sparknlp/annotator/embeddings/deberta_embeddings.py
@@ -12,7 +12,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 """Contains classes for DeBertaEmbeddings."""
-
 from sparknlp.common import *
@@ -133,6 +132,8 @@ class DeBertaEmbeddings(AnnotatorModel,
 
     name = "DeBertaEmbeddings"
 
+    inputAnnotatorTypes = [AnnotatorType.DOCUMENT, AnnotatorType.TOKEN]
+
     configProtoBytes = Param(Params._dummy(),
                              "configProtoBytes",
                              "ConfigProto from tensorflow, serialized into byte array. Get with config_proto.SerializeToString()",
diff --git a/python/sparknlp/annotator/embeddings/distil_bert_embeddings.py b/python/sparknlp/annotator/embeddings/distil_bert_embeddings.py
index 13abd80a2c43..fe0cb8c26a44 100755
--- a/python/sparknlp/annotator/embeddings/distil_bert_embeddings.py
+++ b/python/sparknlp/annotator/embeddings/distil_bert_embeddings.py
@@ -14,6 +14,7 @@
 """Contains classes for DistilBertEmbeddings."""
 
 from sparknlp.common import *
+from sparknlp.common.annotator_type import AnnotatorType
 
 
 class DistilBertEmbeddings(AnnotatorModel,
@@ -145,6 +146,8 @@ class DistilBertEmbeddings(AnnotatorModel,
 
     name = "DistilBertEmbeddings"
 
+    inputAnnotatorTypes = [AnnotatorType.DOCUMENT, AnnotatorType.TOKEN]
+
     maxSentenceLength = Param(Params._dummy(),
                               "maxSentenceLength",
                               "Max sentence length to process",
diff --git a/python/sparknlp/annotator/embeddings/doc2vec.py b/python/sparknlp/annotator/embeddings/doc2vec.py
index 61f5e549eb37..fa94e686ccd4 100755
--- a/python/sparknlp/annotator/embeddings/doc2vec.py
+++ b/python/sparknlp/annotator/embeddings/doc2vec.py
@@ -14,6 +14,7 @@
 """Contains classes for Doc2Vec."""
 
 from sparknlp.common import *
+from sparknlp.common.annotator_type import AnnotatorType
 
 
 class Doc2VecApproach(AnnotatorApproach, HasStorageRef, HasEnableCachingProperties):
@@ -99,6 +100,7 @@ class Doc2VecApproach(AnnotatorApproach, HasStorageRef, HasEnableCachingProperti
     >>> dataset = spark.read.text(path).toDF("text")
     >>> pipelineModel = pipeline.fit(dataset)
     """
+    inputAnnotatorTypes = [AnnotatorType.TOKEN]
 
     vectorSize = Param(Params._dummy(),
                        "vectorSize",
@@ -212,6 +214,7 @@ def __init__(self):
     def _create_model(self, java_model):
         return Doc2VecModel(java_model=java_model)
 
+
 class Doc2VecModel(AnnotatorModel, HasStorageRef, HasEmbeddingsProperties):
     """Word2Vec model that creates vector representations of words in a text
     corpus.
@@ -294,6 +297,8 @@ class Doc2VecModel(AnnotatorModel, HasStorageRef, HasEmbeddingsProperties):
     """
     name = "Doc2VecModel"
 
+    inputAnnotatorTypes = [AnnotatorType.TOKEN]
+
     vectorSize = Param(Params._dummy(),
                        "vectorSize",
                        "the dimension of codes after transforming from words (> 0)",
diff --git a/python/sparknlp/annotator/embeddings/elmo_embeddings.py b/python/sparknlp/annotator/embeddings/elmo_embeddings.py
index fb614984e8da..905e0bc15309 100755
--- a/python/sparknlp/annotator/embeddings/elmo_embeddings.py
+++ b/python/sparknlp/annotator/embeddings/elmo_embeddings.py
@@ -14,6 +14,7 @@
 """Contains classes for ElmoEmbeddings."""
 
 from sparknlp.common import *
+from sparknlp.common.annotator_type import AnnotatorType
 
 
 class ElmoEmbeddings(AnnotatorModel,
@@ -139,6 +140,8 @@ class ElmoEmbeddings(AnnotatorModel,
 
     name = "ElmoEmbeddings"
 
+    inputAnnotatorTypes = [AnnotatorType.DOCUMENT, AnnotatorType.TOKEN]
+
     batchSize = Param(Params._dummy(),
                       "batchSize",
                       "Batch size. Large values allows faster processing but requires more memory.",
diff --git a/python/sparknlp/annotator/embeddings/longformer_embeddings.py b/python/sparknlp/annotator/embeddings/longformer_embeddings.py
index c45d52e14bac..eff0d88105d9 100755
--- a/python/sparknlp/annotator/embeddings/longformer_embeddings.py
+++ b/python/sparknlp/annotator/embeddings/longformer_embeddings.py
@@ -14,6 +14,7 @@
 """Contains classes for LongformerEmbeddings."""
 
 from sparknlp.common import *
+from sparknlp.common.annotator_type import AnnotatorType
 
 
 class LongformerEmbeddings(AnnotatorModel,
@@ -135,6 +136,8 @@ class LongformerEmbeddings(AnnotatorModel,
     """
     name = "LongformerEmbeddings"
 
+    inputAnnotatorTypes = [AnnotatorType.DOCUMENT, AnnotatorType.TOKEN]
+
     maxSentenceLength = Param(Params._dummy(),
                               "maxSentenceLength",
                               "Max sentence length to process",
diff --git a/python/sparknlp/annotator/embeddings/roberta_embeddings.py b/python/sparknlp/annotator/embeddings/roberta_embeddings.py
index 803f9022f6ca..ddceeea8bb57 100755
--- a/python/sparknlp/annotator/embeddings/roberta_embeddings.py
+++ b/python/sparknlp/annotator/embeddings/roberta_embeddings.py
@@ -14,6 +14,7 @@
 """Contains classes for RoBertaEmbeddings."""
 
 from sparknlp.common import *
+from sparknlp.common.annotator_type import AnnotatorType
 
 
 class RoBertaEmbeddings(AnnotatorModel,
@@ -147,6 +148,8 @@ class RoBertaEmbeddings(AnnotatorModel,
 
     name = "RoBertaEmbeddings"
 
+    inputAnnotatorTypes = [AnnotatorType.DOCUMENT, AnnotatorType.TOKEN]
+
     maxSentenceLength = Param(Params._dummy(),
                               "maxSentenceLength",
                               "Max sentence length to process",
diff --git a/python/sparknlp/annotator/embeddings/roberta_sentence_embeddings.py b/python/sparknlp/annotator/embeddings/roberta_sentence_embeddings.py
index 7086b787d89f..1de8741111ff 100755
--- a/python/sparknlp/annotator/embeddings/roberta_sentence_embeddings.py
+++ b/python/sparknlp/annotator/embeddings/roberta_sentence_embeddings.py
@@ -14,6 +14,7 @@
 """Contains classes for RoBertaSentenceEmbeddings."""
 
 from sparknlp.common import *
+from sparknlp.common.annotator_type import AnnotatorType
 
 
 class RoBertaSentenceEmbeddings(AnnotatorModel,
@@ -115,6 +116,8 @@ class RoBertaSentenceEmbeddings(AnnotatorModel,
 
     name = "RoBertaSentenceEmbeddings"
 
+    inputAnnotatorTypes = [AnnotatorType.DOCUMENT]
+
     maxSentenceLength = Param(Params._dummy(),
                               "maxSentenceLength",
                               "Max sentence length to process",
diff --git a/python/sparknlp/annotator/embeddings/sentence_embeddings.py b/python/sparknlp/annotator/embeddings/sentence_embeddings.py
index 612f8346244e..0592da083d45 100755
--- a/python/sparknlp/annotator/embeddings/sentence_embeddings.py
+++ b/python/sparknlp/annotator/embeddings/sentence_embeddings.py
@@ -14,6 +14,7 @@
 """Contains classes for SentenceEmbeddings."""
 
 from sparknlp.common import *
+from sparknlp.common.annotator_type import AnnotatorType
 
 
 class SentenceEmbeddings(AnnotatorModel, HasEmbeddingsProperties, HasStorageRef):
@@ -94,6 +95,8 @@ class SentenceEmbeddings(AnnotatorModel, HasEmbeddingsProperties, HasStorageRef)
 
     name = "SentenceEmbeddings"
 
+    inputAnnotatorTypes = [AnnotatorType.DOCUMENT, AnnotatorType.WORD_EMBEDDINGS]
+
     @keyword_only
     def __init__(self):
         super(SentenceEmbeddings, self).__init__(classname="com.johnsnowlabs.nlp.embeddings.SentenceEmbeddings")
diff --git a/python/sparknlp/annotator/embeddings/universal_sentence_encoder.py b/python/sparknlp/annotator/embeddings/universal_sentence_encoder.py
index a7fd45e5b041..608aebf58de9 100755
--- a/python/sparknlp/annotator/embeddings/universal_sentence_encoder.py
+++ b/python/sparknlp/annotator/embeddings/universal_sentence_encoder.py
@@ -14,6 +14,7 @@
 """Contains classes for the UniversalSentenceEncoder."""
 
 from sparknlp.common import *
+from sparknlp.common.annotator_type import AnnotatorType
 
 
 class UniversalSentenceEncoder(AnnotatorModel,
@@ -120,6 +121,8 @@ class UniversalSentenceEncoder(AnnotatorModel,
 
     name = "UniversalSentenceEncoder"
 
+    inputAnnotatorTypes = [AnnotatorType.DOCUMENT]
+
     loadSP = Param(Params._dummy(), "loadSP",
                    "Whether to load SentencePiece ops file which is required only by multi-lingual models. "
                    "This is not changeable after it's set with a pretrained model nor it is compatible with Windows.",
diff --git a/python/sparknlp/annotator/embeddings/word2vec.py b/python/sparknlp/annotator/embeddings/word2vec.py
index 7b71d2f03770..bf34f7267a66 100755
--- a/python/sparknlp/annotator/embeddings/word2vec.py
+++ b/python/sparknlp/annotator/embeddings/word2vec.py
@@ -15,6 +15,7 @@
 
 from sparknlp.common import *
+from sparknlp.common.annotator_type import AnnotatorType
 
 
 class Word2VecApproach(AnnotatorApproach, HasStorageRef, HasEnableCachingProperties):
@@ -100,6 +101,7 @@ class Word2VecApproach(AnnotatorApproach, HasStorageRef, HasEnableCachingPropert
     >>> dataset = spark.read.text(path).toDF("text")
     >>> pipelineModel = pipeline.fit(dataset)
     """
+    inputAnnotatorTypes = [AnnotatorType.TOKEN]
 
     vectorSize = Param(Params._dummy(),
                        "vectorSize",
@@ -213,6 +215,7 @@ def __init__(self):
     def _create_model(self, java_model):
         return Word2VecModel(java_model=java_model)
 
+
 class Word2VecModel(AnnotatorModel, HasStorageRef, HasEmbeddingsProperties):
     """Word2Vec model that creates vector representations of words in a text
     corpus.
@@ -295,6 +298,8 @@ class Word2VecModel(AnnotatorModel, HasStorageRef, HasEmbeddingsProperties):
     """
     name = "Word2VecModel"
 
+    inputAnnotatorTypes = [AnnotatorType.TOKEN]
+
     vectorSize = Param(Params._dummy(),
                        "vectorSize",
                        "the dimension of codes after transforming from words (> 0)",
diff --git a/python/sparknlp/annotator/embeddings/word_embeddings.py b/python/sparknlp/annotator/embeddings/word_embeddings.py
index f0da319caa08..90eb006fc3d7 100755
--- a/python/sparknlp/annotator/embeddings/word_embeddings.py
+++ b/python/sparknlp/annotator/embeddings/word_embeddings.py
@@ -15,6 +15,7 @@
 
 from sparknlp.common import *
+from sparknlp.common.annotator_type import AnnotatorType
 
 
 class WordEmbeddings(AnnotatorApproach, HasEmbeddingsProperties, HasStorage):
@@ -114,6 +115,8 @@ class WordEmbeddings(AnnotatorApproach, HasEmbeddingsProperties, HasStorage):
 
     name = "WordEmbeddings"
 
+    inputAnnotatorTypes = [AnnotatorType.DOCUMENT, AnnotatorType.TOKEN]
+
     writeBufferSize = Param(Params._dummy(),
                             "writeBufferSize",
                             "buffer size limit before dumping to disk storage while writing",
@@ -249,8 +252,11 @@ class WordEmbeddingsModel(AnnotatorModel, HasEmbeddingsProperties, HasStorageMod
     """
 
     name = "WordEmbeddingsModel"
+    databases = ['EMBEDDINGS']
+    inputAnnotatorTypes = [AnnotatorType.DOCUMENT, AnnotatorType.TOKEN]
+
     readCacheSize = Param(Params._dummy(),
                           "readCacheSize",
                           "cache size for items retrieved from storage. Increase for performance but higher memory consumption",
diff --git a/python/sparknlp/annotator/embeddings/xlm_roberta_embeddings.py b/python/sparknlp/annotator/embeddings/xlm_roberta_embeddings.py
index e6b7b23f1536..5fb5fb8bf019 100755
--- a/python/sparknlp/annotator/embeddings/xlm_roberta_embeddings.py
+++ b/python/sparknlp/annotator/embeddings/xlm_roberta_embeddings.py
@@ -14,6 +14,7 @@
 """Contains classes for XlmRoBertaEmbeddings."""
 
 from sparknlp.common import *
+from sparknlp.common.annotator_type import AnnotatorType
 
 
 class XlmRoBertaEmbeddings(AnnotatorModel,
@@ -147,6 +148,8 @@ class XlmRoBertaEmbeddings(AnnotatorModel,
 
     name = "XlmRoBertaEmbeddings"
 
+    inputAnnotatorTypes = [AnnotatorType.DOCUMENT, AnnotatorType.TOKEN]
+
     maxSentenceLength = Param(Params._dummy(),
                               "maxSentenceLength",
                               "Max sentence length to process",
diff --git a/python/sparknlp/annotator/embeddings/xlm_roberta_sentence_embeddings.py b/python/sparknlp/annotator/embeddings/xlm_roberta_sentence_embeddings.py
index ef97939219f4..62a04f16da47 100755
--- a/python/sparknlp/annotator/embeddings/xlm_roberta_sentence_embeddings.py
+++ b/python/sparknlp/annotator/embeddings/xlm_roberta_sentence_embeddings.py
@@ -14,6 +14,7 @@
 """Contains classes for XlmRoBertaSentenceEmbeddings."""
 
 from sparknlp.common import *
+from sparknlp.common.annotator_type import AnnotatorType
 
 
 class XlmRoBertaSentenceEmbeddings(AnnotatorModel,
@@ -118,6 +119,8 @@ class XlmRoBertaSentenceEmbeddings(AnnotatorModel,
 
     name = "XlmRoBertaSentenceEmbeddings"
 
+    inputAnnotatorTypes = [AnnotatorType.DOCUMENT]
+
     maxSentenceLength = Param(Params._dummy(),
                               "maxSentenceLength",
                               "Max sentence length to process",
diff --git a/python/sparknlp/annotator/embeddings/xlnet_embeddings.py b/python/sparknlp/annotator/embeddings/xlnet_embeddings.py
index 4c640d20d4da..2db38d33ab11 100755
--- a/python/sparknlp/annotator/embeddings/xlnet_embeddings.py
+++ b/python/sparknlp/annotator/embeddings/xlnet_embeddings.py
@@ -14,6 +14,7 @@
 """Contains classes for XlnetEmbeddings."""
 
 from sparknlp.common import *
+from sparknlp.common.annotator_type import AnnotatorType
 
 
 class XlnetEmbeddings(AnnotatorModel,
@@ -151,6 +152,8 @@ class XlnetEmbeddings(AnnotatorModel,
 
     name = "XlnetEmbeddings"
 
+    inputAnnotatorTypes = [AnnotatorType.DOCUMENT, AnnotatorType.TOKEN]
+
     configProtoBytes = Param(Params._dummy(),
                              "configProtoBytes",
                              "ConfigProto from tensorflow, serialized into byte array. Get with config_proto.SerializeToString()",
diff --git a/python/sparknlp/annotator/er/entity_ruler.py b/python/sparknlp/annotator/er/entity_ruler.py
index 2280aac8764d..ecd0bc1937dd 100755
--- a/python/sparknlp/annotator/er/entity_ruler.py
+++ b/python/sparknlp/annotator/er/entity_ruler.py
@@ -14,6 +14,7 @@
 """Contains classes for the EntityRuler."""
 
 from sparknlp.common import *
+from sparknlp.common.annotator_type import AnnotatorType
 
 
 class EntityRulerApproach(AnnotatorApproach, HasStorage):
@@ -125,6 +126,10 @@ class EntityRulerApproach(AnnotatorApproach, HasStorage):
     """
     name = "EntityRulerApproach"
 
+    inputAnnotatorTypes = [AnnotatorType.DOCUMENT]
+
+    optionalInputAnnotatorTypes = [AnnotatorType.TOKEN]
+
     patternsResource = Param(Params._dummy(),
                              "patternsResource",
                              "Resource in JSON or CSV format to map entities to patterns",
@@ -225,6 +230,8 @@
     """
     name = "EntityRulerModel"
     database = ['ENTITY_PATTERNS']
+    inputAnnotatorTypes = [AnnotatorType.DOCUMENT]
+    optionalInputAnnotatorTypes = [AnnotatorType.TOKEN]
 
     def __init__(self, classname="com.johnsnowlabs.nlp.annotators.er.EntityRulerModel", java_model=None):
         super(EntityRulerModel, self).__init__(
diff --git a/python/sparknlp/annotator/graph_extraction.py b/python/sparknlp/annotator/graph_extraction.py
index 0d59c3dadc5a..99cd4dd4afb6 100755
--- a/python/sparknlp/annotator/graph_extraction.py
+++ b/python/sparknlp/annotator/graph_extraction.py
@@ -13,6 +13,7 @@
 # limitations under the License.
 """Contains classes for GraphExtraction."""
 from sparknlp.common import *
+from sparknlp.common.annotator_type import AnnotatorType
 
 
 class GraphExtraction(AnnotatorModel):
@@ -142,6 +143,10 @@
     """
     name = "GraphExtraction"
 
+    inputAnnotatorTypes = [AnnotatorType.DOCUMENT, AnnotatorType.TOKEN, AnnotatorType.NAMED_ENTITY]
+
+    optionalInputAnnotatorTypes = [AnnotatorType.DEPENDENCY, AnnotatorType.LABELED_DEPENDENCY]
+
     relationshipTypes = Param(Params._dummy(),
                               "relationshipTypes",
                               "Find paths between a pair of token and entity",
diff --git a/python/sparknlp/annotator/keyword_extraction/yake_keyword_extraction.py b/python/sparknlp/annotator/keyword_extraction/yake_keyword_extraction.py
index dcdaa38999ca..bce92fd5c124 100755
--- a/python/sparknlp/annotator/keyword_extraction/yake_keyword_extraction.py
+++ b/python/sparknlp/annotator/keyword_extraction/yake_keyword_extraction.py
@@ -15,6 +15,7 @@
 
 from sparknlp.common import *
+from sparknlp.common.annotator_type import AnnotatorType
 
 
 class YakeKeywordExtraction(AnnotatorModel):
@@ -152,6 +153,8 @@
     """
     name = "YakeKeywordExtraction"
 
+    inputAnnotatorTypes = [AnnotatorType.TOKEN]
+
     @keyword_only
     def __init__(self):
         super(YakeKeywordExtraction, self).__init__(
diff --git a/python/sparknlp/annotator/ld_dl/language_detector_dl.py b/python/sparknlp/annotator/ld_dl/language_detector_dl.py
index 8a999f8816f3..c3a8d88dad00 100755
--- a/python/sparknlp/annotator/ld_dl/language_detector_dl.py
+++ b/python/sparknlp/annotator/ld_dl/language_detector_dl.py
@@ -14,6 +14,7 @@
 """Contains classes for LanguageDetectorDL."""
 
 from sparknlp.common import *
+from sparknlp.common.annotator_type import AnnotatorType
 
 
 class LanguageDetectorDL(AnnotatorModel, HasStorageRef, HasEngine):
@@ -97,6 +98,8 @@ class LanguageDetectorDL(AnnotatorModel, HasStorageRef, HasEngine):
     """
     name = "LanguageDetectorDL"
 
+    inputAnnotatorTypes = [AnnotatorType.DOCUMENT]
+
     def __init__(self, classname="com.johnsnowlabs.nlp.annotators.ld.dl.LanguageDetectorDL", java_model=None):
         super(LanguageDetectorDL, self).__init__(
             classname=classname,
diff --git a/python/sparknlp/annotator/lemmatizer.py b/python/sparknlp/annotator/lemmatizer.py
index cbd3a1bf551b..0f6fb05c5975 100755
--- a/python/sparknlp/annotator/lemmatizer.py
+++ b/python/sparknlp/annotator/lemmatizer.py
@@ -13,6 +13,7 @@
 # limitations under the License.
 """Contains classes for the Lemmatizer."""
 from sparknlp.common import *
+from sparknlp.common.annotator_type import AnnotatorType
 
 
 class Lemmatizer(AnnotatorApproach):
@@ -86,6 +87,8 @@ class Lemmatizer(AnnotatorApproach):
     |[Peter, Pipers, employees, are, pick, peck, of, pickle, pepper, .]|
     +------------------------------------------------------------------+
     """
+    inputAnnotatorTypes = [AnnotatorType.TOKEN]
+
     dictionary = Param(Params._dummy(),
                        "dictionary",
                        "lemmatizer external dictionary." +
@@ -176,6 +179,7 @@ def setDictionary(self, path, key_delimiter, value_delimiter, read_as=ReadAs.TEX
             opts["valueDelimiter"] = value_delimiter
         return self._set(dictionary=ExternalResource(path, read_as, opts))
 
+
 class LemmatizerModel(AnnotatorModel):
     """Instantiated Model of the Lemmatizer.
@@ -212,6 +216,8 @@
     """
     name = "LemmatizerModel"
 
+    inputAnnotatorTypes = [AnnotatorType.TOKEN]
+
     def __init__(self, classname="com.johnsnowlabs.nlp.annotators.LemmatizerModel", java_model=None):
         super(LemmatizerModel, self).__init__(
             classname=classname,
diff --git a/python/sparknlp/annotator/matcher/big_text_matcher.py b/python/sparknlp/annotator/matcher/big_text_matcher.py
index 706efb11923b..bc2246b96a79 100755
--- a/python/sparknlp/annotator/matcher/big_text_matcher.py
+++ b/python/sparknlp/annotator/matcher/big_text_matcher.py
@@ -15,6 +15,7 @@
 
 from sparknlp.common import *
 from sparknlp.annotator.matcher.text_matcher import TextMatcherModel
+from sparknlp.common.annotator_type import AnnotatorType
 
 
 class BigTextMatcher(AnnotatorApproach, HasStorage):
@@ -82,6 +83,8 @@
     +--------------------------------------------------------------------+
     """
 
+    inputAnnotatorTypes = [AnnotatorType.DOCUMENT, AnnotatorType.TOKEN]
+
     entities = Param(Params._dummy(),
                      "entities",
                      "ExternalResource for entities",
@@ -182,6 +185,7 @@
     """
     name = "BigTextMatcherModel"
     databases = ['TMVOCAB', 'TMEDGES', 'TMNODES']
+    inputAnnotatorTypes = [AnnotatorType.TOKEN]
 
     caseSensitive = Param(Params._dummy(),
                           "caseSensitive",
diff --git a/python/sparknlp/annotator/matcher/date_matcher.py b/python/sparknlp/annotator/matcher/date_matcher.py
index c6aab2be3286..6992b6910343 100755
--- a/python/sparknlp/annotator/matcher/date_matcher.py
+++ b/python/sparknlp/annotator/matcher/date_matcher.py
@@ -14,6 +14,7 @@
 """Contains classes for the DateMatcher."""
 
 from sparknlp.common import *
+from sparknlp.common.annotator_type import AnnotatorType
 
 
 class DateMatcherUtils(Params):
@@ -159,6 +160,7 @@ def setAnchorDateDay(self, value):
         """
         return self._set(anchorDateDay=value)
 
+
 class DateMatcher(AnnotatorModel, DateMatcherUtils):
     """Matches standard date formats into a provided format
     Reads from different forms of date and time expressions and converts them
@@ -251,6 +253,8 @@
 
     name = "DateMatcher"
 
+    inputAnnotatorTypes = [AnnotatorType.DOCUMENT]
+
     @keyword_only
     def __init__(self):
         super(DateMatcher, self).__init__(classname="com.johnsnowlabs.nlp.annotators.DateMatcher")
diff --git a/python/sparknlp/annotator/matcher/multi_date_matcher.py b/python/sparknlp/annotator/matcher/multi_date_matcher.py
index e4025242fbd7..2599cf0b0a90 100755
--- a/python/sparknlp/annotator/matcher/multi_date_matcher.py
+++ b/python/sparknlp/annotator/matcher/multi_date_matcher.py
@@ -15,6 +15,7 @@
 
 from sparknlp.common import *
 from sparknlp.annotator.matcher.date_matcher import DateMatcherUtils
+from sparknlp.common.annotator_type import AnnotatorType
 
 
 class MultiDateMatcher(AnnotatorModel, DateMatcherUtils):
@@ -94,6 +95,8 @@ class MultiDateMatcher(AnnotatorModel, DateMatcherUtils):
 
     name = "MultiDateMatcher"
 
+    inputAnnotatorTypes = [AnnotatorType.DOCUMENT]
+
     @keyword_only
     def __init__(self):
         super(MultiDateMatcher, self).__init__(classname="com.johnsnowlabs.nlp.annotators.MultiDateMatcher")
diff --git a/python/sparknlp/annotator/matcher/regex_matcher.py b/python/sparknlp/annotator/matcher/regex_matcher.py
index dce628d458b7..2925e1bcc00f 100755
--- a/python/sparknlp/annotator/matcher/regex_matcher.py
+++ b/python/sparknlp/annotator/matcher/regex_matcher.py
@@ -15,6 +15,7 @@
 
 from sparknlp.common import *
+from sparknlp.common.annotator_type import AnnotatorType
 
 
 class RegexMatcher(AnnotatorApproach):
@@ -80,6 +81,8 @@
     +--------------------------------------------------------------------------------------------+
     """
 
+    inputAnnotatorTypes = [AnnotatorType.DOCUMENT]
+
     strategy = Param(Params._dummy(),
                      "strategy",
                      "MATCH_FIRST|MATCH_ALL|MATCH_COMPLETE",
@@ -130,6 +133,7 @@ def setExternalRules(self, path, delimiter, read_as=ReadAs.TEXT, options={"forma
     def _create_model(self, java_model):
         return RegexMatcherModel(java_model=java_model)
 
+
 class RegexMatcherModel(AnnotatorModel):
     """Instantiated model of the RegexMatcher.
@@ -147,6 +151,8 @@
     None
     """
 
+    inputAnnotatorTypes = [AnnotatorType.DOCUMENT]
+
     def __init__(self, classname="com.johnsnowlabs.nlp.annotators.RegexMatcherModel", java_model=None):
         super(RegexMatcherModel, self).__init__(
             classname=classname,
diff --git a/python/sparknlp/annotator/matcher/text_matcher.py b/python/sparknlp/annotator/matcher/text_matcher.py
index 8926b879f010..880405b070b6 100755
--- a/python/sparknlp/annotator/matcher/text_matcher.py
+++ b/python/sparknlp/annotator/matcher/text_matcher.py
@@ -15,6 +15,7 @@
 
 from sparknlp.common import *
+from sparknlp.common.annotator_type import AnnotatorType
 
 
 class TextMatcher(AnnotatorApproach):
@@ -90,6 +91,8 @@
     BigTextMatcher : to match large amounts of text
     """
 
+    inputAnnotatorTypes = [AnnotatorType.DOCUMENT, AnnotatorType.TOKEN]
+
     entities = Param(Params._dummy(),
                      "entities",
                      "ExternalResource for entities",
diff --git a/python/sparknlp/annotator/n_gram_generator.py b/python/sparknlp/annotator/n_gram_generator.py
index 5bec6877882a..c92336c2b2bd 100755
--- a/python/sparknlp/annotator/n_gram_generator.py
+++ b/python/sparknlp/annotator/n_gram_generator.py
@@ -13,6 +13,7 @@
 # limitations under the License.
"""Contains classes for the NGramGenerator.""" from sparknlp.common import * +from sparknlp.common.annotator_type import AnnotatorType class NGramGenerator(AnnotatorModel): @@ -84,6 +85,8 @@ class NGramGenerator(AnnotatorModel): name = "NGramGenerator" + inputAnnotatorTypes = [AnnotatorType.TOKEN] + @keyword_only def __init__(self): super(NGramGenerator, self).__init__(classname="com.johnsnowlabs.nlp.annotators.NGramGenerator") diff --git a/python/sparknlp/annotator/ner/ner_crf.py b/python/sparknlp/annotator/ner/ner_crf.py index bac1ac901728..afa6155e5128 100755 --- a/python/sparknlp/annotator/ner/ner_crf.py +++ b/python/sparknlp/annotator/ner/ner_crf.py @@ -15,6 +15,7 @@ from sparknlp.common import * from sparknlp.annotator.ner.ner_approach import NerApproach +from sparknlp.common.annotator_type import AnnotatorType class NerCrfApproach(AnnotatorApproach, NerApproach): @@ -130,6 +131,8 @@ class NerCrfApproach(AnnotatorApproach, NerApproach): NerConverter : to further process the results """ + inputAnnotatorTypes = [AnnotatorType.DOCUMENT, AnnotatorType.TOKEN, AnnotatorType.POS, AnnotatorType.WORD_EMBEDDINGS] + l2 = Param(Params._dummy(), "l2", "L2 regularization coefficient", TypeConverters.toFloat) c0 = Param(Params._dummy(), "c0", "c0 params defining decay speed for gradient", TypeConverters.toInt) @@ -248,6 +251,7 @@ def __init__(self): includeConfidence=False ) + class NerCrfModel(AnnotatorModel): """Extracts Named Entities based on a CRF Model. @@ -344,6 +348,8 @@ class NerCrfModel(AnnotatorModel): """ name = "NerCrfModel" + inputAnnotatorTypes = [AnnotatorType.DOCUMENT, AnnotatorType.TOKEN, AnnotatorType.POS, AnnotatorType.WORD_EMBEDDINGS] + includeConfidence = Param(Params._dummy(), "includeConfidence", "external features is a delimited text. needs 'delimiter' in options", TypeConverters.toBoolean) diff --git a/python/sparknlp/annotator/ner/ner_dl.py b/python/sparknlp/annotator/ner/ner_dl.py index 5280bbdb1f89..c2454b8a4442 100755 --- a/python/sparknlp/annotator/ner/ner_dl.py +++ b/python/sparknlp/annotator/ner/ner_dl.py @@ -18,6 +18,7 @@ from sparknlp.annotator.param import EvaluationDLParams from sparknlp.common import * from sparknlp.annotator.ner.ner_approach import NerApproach +from sparknlp.common.annotator_type import AnnotatorType class NerDLApproach(AnnotatorApproach, NerApproach, EvaluationDLParams): @@ -158,6 +159,8 @@ class NerDLApproach(AnnotatorApproach, NerApproach, EvaluationDLParams): NerConverter : to further process the results """ + inputAnnotatorTypes = [AnnotatorType.DOCUMENT, AnnotatorType.TOKEN, AnnotatorType.WORD_EMBEDDINGS] + lr = Param(Params._dummy(), "lr", "Learning Rate", TypeConverters.toFloat) po = Param(Params._dummy(), "po", "Learning rate decay coefficient. 
Real Learning Rate = lr / (1 + po * epoch)", @@ -467,6 +470,8 @@ class NerDLModel(AnnotatorModel, HasStorageRef, HasBatchedAnnotate, HasEngine): """ name = "NerDLModel" + inputAnnotatorTypes = [AnnotatorType.DOCUMENT, AnnotatorType.TOKEN, AnnotatorType.WORD_EMBEDDINGS] + def __init__(self, classname="com.johnsnowlabs.nlp.annotators.ner.dl.NerDLModel", java_model=None): super(NerDLModel, self).__init__( classname=classname, diff --git a/python/sparknlp/annotator/ner/ner_overwriter.py b/python/sparknlp/annotator/ner/ner_overwriter.py index e75c70542ed0..ccb4bb581e96 100755 --- a/python/sparknlp/annotator/ner/ner_overwriter.py +++ b/python/sparknlp/annotator/ner/ner_overwriter.py @@ -14,6 +14,7 @@ """Contains classes for the NerOverwriter.""" from sparknlp.common import * +from sparknlp.common.annotator_type import AnnotatorType class NerOverwriter(AnnotatorModel): @@ -115,6 +116,8 @@ class NerOverwriter(AnnotatorModel): """ name = "NerOverwriter" + inputAnnotatorTypes = [AnnotatorType.NAMED_ENTITY] + @keyword_only def __init__(self): super(NerOverwriter, self).__init__(classname="com.johnsnowlabs.nlp.annotators.ner.NerOverwriter") diff --git a/python/sparknlp/annotator/normalizer.py b/python/sparknlp/annotator/normalizer.py index e419d5f1406d..94f80fa154b8 100755 --- a/python/sparknlp/annotator/normalizer.py +++ b/python/sparknlp/annotator/normalizer.py @@ -13,6 +13,7 @@ # limitations under the License. """Contains classes for the Normalizer.""" from sparknlp.common import * +from sparknlp.common.annotator_type import AnnotatorType class Normalizer(AnnotatorApproach): @@ -78,6 +79,7 @@ class Normalizer(AnnotatorApproach): |[john, and, peter, are, brothers, however, they, dont, support, each, other, that, much]| +----------------------------------------------------------------------------------------+ """ + inputAnnotatorTypes = [AnnotatorType.TOKEN] cleanupPatterns = Param(Params._dummy(), "cleanupPatterns", @@ -201,6 +203,7 @@ class NormalizerModel(AnnotatorModel): lowercase whether to convert strings to lowercase """ + inputAnnotatorTypes = [AnnotatorType.TOKEN] cleanupPatterns = Param(Params._dummy(), "cleanupPatterns", diff --git a/python/sparknlp/annotator/pos/perceptron.py b/python/sparknlp/annotator/pos/perceptron.py index 0029148f1f23..2be89b0ccf33 100755 --- a/python/sparknlp/annotator/pos/perceptron.py +++ b/python/sparknlp/annotator/pos/perceptron.py @@ -14,6 +14,7 @@ """Contains classes for the Perceptron Annotator.""" from sparknlp.common import * +from sparknlp.common.annotator_type import AnnotatorType class PerceptronApproach(AnnotatorApproach): @@ -99,6 +100,9 @@ class PerceptronApproach(AnnotatorApproach): |[NNP, NNP, CD, JJ, NNP, NNP, ,, MD, VB, DT, CD, .]| +--------------------------------------------------+ """ + + inputAnnotatorTypes = [AnnotatorType.TOKEN, AnnotatorType.DOCUMENT] + posCol = Param(Params._dummy(), "posCol", "column of Array of POS tags that match tokens", @@ -150,6 +154,7 @@ def getNIterations(self): def _create_model(self, java_model): return PerceptronModel(java_model=java_model) + class PerceptronModel(AnnotatorModel): """Averaged Perceptron model to tag words part-of-speech. Sets a POS tag to each word within a sentence. 
@@ -224,6 +229,8 @@ class PerceptronModel(AnnotatorModel): """ name = "PerceptronModel" + inputAnnotatorTypes = [AnnotatorType.TOKEN, AnnotatorType.DOCUMENT] + def __init__(self, classname="com.johnsnowlabs.nlp.annotators.pos.perceptron.PerceptronModel", java_model=None): super(PerceptronModel, self).__init__( classname=classname, diff --git a/python/sparknlp/annotator/sentence/sentence_detector.py b/python/sparknlp/annotator/sentence/sentence_detector.py index 55befabb0b7f..6bd62dc6b3a8 100755 --- a/python/sparknlp/annotator/sentence/sentence_detector.py +++ b/python/sparknlp/annotator/sentence/sentence_detector.py @@ -14,6 +14,7 @@ """Contains classes for the SentenceDetector.""" from sparknlp.common import * +from sparknlp.common.annotator_type import AnnotatorType class SentenceDetectorParams: @@ -162,6 +163,8 @@ class SentenceDetector(AnnotatorModel, SentenceDetectorParams): name = 'SentenceDetector' + inputAnnotatorTypes = [AnnotatorType.DOCUMENT] + # this one is exclusive to this detector detectLists = Param(Params._dummy(), "detectLists", @@ -284,3 +287,6 @@ def __init__(self): minLength=0, maxLength=99999 ) + + def getInputAnnotatorTypes(self): + return self.inputAnnotatorTypes \ No newline at end of file diff --git a/python/sparknlp/annotator/sentence/sentence_detector_dl.py b/python/sparknlp/annotator/sentence/sentence_detector_dl.py index b3316ec36bbf..bb1e0efc4ec2 100755 --- a/python/sparknlp/annotator/sentence/sentence_detector_dl.py +++ b/python/sparknlp/annotator/sentence/sentence_detector_dl.py @@ -14,6 +14,7 @@ """Contains classes for SentenceDetectorDl.""" from sparknlp.common import * +from sparknlp.common.annotator_type import AnnotatorType class SentenceDetectorDLApproach(AnnotatorApproach): @@ -94,6 +95,8 @@ class SentenceDetectorDLApproach(AnnotatorApproach): name = "SentenceDetectorDLApproach" + inputAnnotatorTypes = [AnnotatorType.DOCUMENT] + modelArchitecture = Param(Params._dummy(), "modelArchitecture", "Model architecture (CNN)", @@ -298,6 +301,8 @@ class SentenceDetectorDLModel(AnnotatorModel, HasEngine): """ name = "SentenceDetectorDLModel" + inputAnnotatorTypes = [AnnotatorType.DOCUMENT] + modelArchitecture = Param(Params._dummy(), "modelArchitecture", "Model architecture (CNN)", typeConverter=TypeConverters.toString) diff --git a/python/sparknlp/annotator/sentiment/sentiment_detector.py b/python/sparknlp/annotator/sentiment/sentiment_detector.py index 0d0171451042..a38a33319273 100755 --- a/python/sparknlp/annotator/sentiment/sentiment_detector.py +++ b/python/sparknlp/annotator/sentiment/sentiment_detector.py @@ -14,6 +14,7 @@ """Contains classes for the SentimentDetector.""" from sparknlp.common import * +from sparknlp.common.annotator_type import AnnotatorType class SentimentDetector(AnnotatorApproach): @@ -97,6 +98,9 @@ class SentimentDetector(AnnotatorApproach): -------- ViveknSentimentApproach : for an alternative approach to sentiment extraction """ + + inputAnnotatorTypes = [AnnotatorType.TOKEN, AnnotatorType.DOCUMENT] + dictionary = Param(Params._dummy(), "dictionary", "path for dictionary to sentiment analysis", @@ -160,6 +164,7 @@ def setDictionary(self, path, delimiter, read_as=ReadAs.TEXT, options={'format': def _create_model(self, java_model): return SentimentDetectorModel(java_model=java_model) + class SentimentDetectorModel(AnnotatorModel): """Rule based sentiment detector, which calculates a score based on predefined keywords. 
@@ -185,6 +190,8 @@ class SentimentDetectorModel(AnnotatorModel): """ name = "SentimentDetectorModel" + inputAnnotatorTypes = [AnnotatorType.TOKEN, AnnotatorType.DOCUMENT] + positiveMultiplier = Param(Params._dummy(), "positiveMultiplier", "multiplier for positive sentiments. Defaults 1.0", diff --git a/python/sparknlp/annotator/sentiment/vivekn_sentiment.py b/python/sparknlp/annotator/sentiment/vivekn_sentiment.py index 7178de018421..37aab25a2c11 100755 --- a/python/sparknlp/annotator/sentiment/vivekn_sentiment.py +++ b/python/sparknlp/annotator/sentiment/vivekn_sentiment.py @@ -15,6 +15,7 @@ from sparknlp.common import * +from sparknlp.common.annotator_type import AnnotatorType class ViveknSentimentApproach(AnnotatorApproach): @@ -96,6 +97,9 @@ class ViveknSentimentApproach(AnnotatorApproach): |[negative] | +---------------+ """ + + inputAnnotatorTypes = [AnnotatorType.TOKEN, AnnotatorType.DOCUMENT] + sentimentCol = Param(Params._dummy(), "sentimentCol", "column with the sentiment result of every row. Must be 'positive' or 'negative'", diff --git a/python/sparknlp/annotator/seq2seq/gpt2_transformer.py b/python/sparknlp/annotator/seq2seq/gpt2_transformer.py index 998b4443af67..77d323b606d6 100755 --- a/python/sparknlp/annotator/seq2seq/gpt2_transformer.py +++ b/python/sparknlp/annotator/seq2seq/gpt2_transformer.py @@ -14,6 +14,7 @@ """Contains classes for the GPT2Transformer.""" from sparknlp.common import * +from sparknlp.common.annotator_type import AnnotatorType class GPT2Transformer(AnnotatorModel, HasBatchedAnnotate, HasEngine): @@ -135,6 +136,8 @@ class GPT2Transformer(AnnotatorModel, HasBatchedAnnotate, HasEngine): name = "GPT2Transformer" + inputAnnotatorTypes = [AnnotatorType.DOCUMENT] + task = Param(Params._dummy(), "task", "Transformer's task, e.g. 'is it true that'>", typeConverter=TypeConverters.toString) diff --git a/python/sparknlp/annotator/seq2seq/marian_transformer.py b/python/sparknlp/annotator/seq2seq/marian_transformer.py index bc72feda848e..804ecb010bd1 100755 --- a/python/sparknlp/annotator/seq2seq/marian_transformer.py +++ b/python/sparknlp/annotator/seq2seq/marian_transformer.py @@ -14,6 +14,7 @@ """Contains classes for the MarianTransformer.""" from sparknlp.common import * +from sparknlp.common.annotator_type import AnnotatorType class MarianTransformer(AnnotatorModel, HasBatchedAnnotate, HasEngine): @@ -120,6 +121,8 @@ class MarianTransformer(AnnotatorModel, HasBatchedAnnotate, HasEngine): name = "MarianTransformer" + inputAnnotatorTypes = [AnnotatorType.DOCUMENT] + configProtoBytes = Param(Params._dummy(), "configProtoBytes", "ConfigProto from tensorflow, serialized into byte array. Get with config_proto.SerializeToString()", diff --git a/python/sparknlp/annotator/seq2seq/t5_transformer.py b/python/sparknlp/annotator/seq2seq/t5_transformer.py index 7175d425dfa4..0a5d55fbbddb 100755 --- a/python/sparknlp/annotator/seq2seq/t5_transformer.py +++ b/python/sparknlp/annotator/seq2seq/t5_transformer.py @@ -14,6 +14,7 @@ """Contains classes for the T5Transformer.""" from sparknlp.common import * +from sparknlp.common.annotator_type import AnnotatorType class T5Transformer(AnnotatorModel, HasBatchedAnnotate, HasEngine): @@ -148,6 +149,8 @@ class T5Transformer(AnnotatorModel, HasBatchedAnnotate, HasEngine): name = "T5Transformer" + inputAnnotatorTypes = [AnnotatorType.DOCUMENT] + configProtoBytes = Param(Params._dummy(), "configProtoBytes", "ConfigProto from tensorflow, serialized into byte array. 
Get with config_proto.SerializeToString()", diff --git a/python/sparknlp/annotator/spell_check/context_spell_checker.py b/python/sparknlp/annotator/spell_check/context_spell_checker.py index 2d8feb363949..82c0211bbadb 100755 --- a/python/sparknlp/annotator/spell_check/context_spell_checker.py +++ b/python/sparknlp/annotator/spell_check/context_spell_checker.py @@ -14,6 +14,7 @@ """Contains classes for the ContextSpellChecker.""" from sparknlp.common import * +from sparknlp.common.annotator_type import AnnotatorType class ContextSpellCheckerApproach(AnnotatorApproach): @@ -138,6 +139,8 @@ class ContextSpellCheckerApproach(AnnotatorApproach): name = "ContextSpellCheckerApproach" + inputAnnotatorTypes = [AnnotatorType.TOKEN] + languageModelClasses = Param(Params._dummy(), "languageModelClasses", "Number of classes to use during factorization of the softmax output in the LM.", @@ -570,6 +573,8 @@ class ContextSpellCheckerModel(AnnotatorModel, HasEngine): """ name = "ContextSpellCheckerModel" + inputAnnotatorTypes = [AnnotatorType.TOKEN] + wordMaxDistance = Param(Params._dummy(), "wordMaxDistance", "Maximum distance for the generated candidates for every word.", diff --git a/python/sparknlp/annotator/spell_check/norvig_sweeting.py b/python/sparknlp/annotator/spell_check/norvig_sweeting.py index c9047fc67530..b652ea2ec920 100755 --- a/python/sparknlp/annotator/spell_check/norvig_sweeting.py +++ b/python/sparknlp/annotator/spell_check/norvig_sweeting.py @@ -14,6 +14,7 @@ """Contains classes for the NorvigSweeting spell checker.""" from sparknlp.common import * +from sparknlp.common.annotator_type import AnnotatorType class NorvigSweetingApproach(AnnotatorApproach): @@ -114,6 +115,8 @@ class NorvigSweetingApproach(AnnotatorApproach): SymmetricDeleteApproach : for an alternative approach to spell checking ContextSpellCheckerApproach : for a DL based approach """ + inputAnnotatorTypes = [AnnotatorType.TOKEN] + dictionary = Param(Params._dummy(), "dictionary", "dictionary needs 'tokenPattern' regex in dictionary for separating words", @@ -242,6 +245,7 @@ def setFrequencyPriority(self, value): def _create_model(self, java_model): return NorvigSweetingModel(java_model=java_model) + class NorvigSweetingModel(AnnotatorModel): """This annotator retrieves tokens and makes corrections automatically if not found in an English dictionary. 
@@ -322,6 +326,8 @@ class NorvigSweetingModel(AnnotatorModel): """ name = "NorvigSweetingModel" + inputAnnotatorTypes = [AnnotatorType.TOKEN] + def __init__(self, classname="com.johnsnowlabs.nlp.annotators.spell.norvig.NorvigSweetingModel", java_model=None): super(NorvigSweetingModel, self).__init__( classname=classname, diff --git a/python/sparknlp/annotator/spell_check/symmetric_delete.py b/python/sparknlp/annotator/spell_check/symmetric_delete.py index dd7628ed2dec..8ec2d7408118 100755 --- a/python/sparknlp/annotator/spell_check/symmetric_delete.py +++ b/python/sparknlp/annotator/spell_check/symmetric_delete.py @@ -14,6 +14,7 @@ """Contains classes for SymmetricDelete.""" from sparknlp.common import * +from sparknlp.common.annotator_type import AnnotatorType class SymmetricDeleteApproach(AnnotatorApproach): @@ -92,6 +93,8 @@ class SymmetricDeleteApproach(AnnotatorApproach): NorvigSweetingApproach : for an alternative approach to spell checking ContextSpellCheckerApproach : for a DL based approach """ + inputAnnotatorTypes = [AnnotatorType.TOKEN] + corpus = Param(Params._dummy(), "corpus", "folder or file with text that teaches about the language", @@ -188,6 +191,7 @@ def setDeletesThreshold(self, v): def _create_model(self, java_model): return SymmetricDeleteModel(java_model=java_model) + class SymmetricDeleteModel(AnnotatorModel): """Symmetric Delete spelling correction algorithm. @@ -259,6 +263,8 @@ class SymmetricDeleteModel(AnnotatorModel): """ name = "SymmetricDeleteModel" + inputAnnotatorTypes = [AnnotatorType.TOKEN] + def __init__(self, classname="com.johnsnowlabs.nlp.annotators.spell.symmetric.SymmetricDeleteModel", java_model=None): super(SymmetricDeleteModel, self).__init__( diff --git a/python/sparknlp/annotator/stemmer.py b/python/sparknlp/annotator/stemmer.py index 1a27c8ae0fd6..1a8ad1a3d776 100755 --- a/python/sparknlp/annotator/stemmer.py +++ b/python/sparknlp/annotator/stemmer.py @@ -13,6 +13,7 @@ # limitations under the License. """Contains classes for the Stemmer.""" from sparknlp.common import * +from sparknlp.common.annotator_type import AnnotatorType class Stemmer(AnnotatorModel): @@ -63,6 +64,8 @@ class Stemmer(AnnotatorModel): +-------------------------------------------------------------+ """ + inputAnnotatorTypes = [AnnotatorType.TOKEN] + language = Param(Params._dummy(), "language", "stemmer algorithm", typeConverter=TypeConverters.toString) name = "Stemmer" diff --git a/python/sparknlp/annotator/stop_words_cleaner.py b/python/sparknlp/annotator/stop_words_cleaner.py index 595a0a89d5b3..eacdfbc9339d 100755 --- a/python/sparknlp/annotator/stop_words_cleaner.py +++ b/python/sparknlp/annotator/stop_words_cleaner.py @@ -13,6 +13,7 @@ # limitations under the License. 
"""Contains classes for the StopWordsCleaner.""" from sparknlp.common import * +from sparknlp.common.annotator_type import AnnotatorType class StopWordsCleaner(AnnotatorModel): @@ -96,6 +97,8 @@ class StopWordsCleaner(AnnotatorModel): name = "StopWordsCleaner" + inputAnnotatorTypes = [AnnotatorType.TOKEN] + @keyword_only def __init__(self, classname="com.johnsnowlabs.nlp.annotators.StopWordsCleaner", java_model=None): super(StopWordsCleaner, self).__init__( diff --git a/python/sparknlp/annotator/token/chunk_tokenizer.py b/python/sparknlp/annotator/token/chunk_tokenizer.py index 96de0fb48b69..58a9ace6faf1 100755 --- a/python/sparknlp/annotator/token/chunk_tokenizer.py +++ b/python/sparknlp/annotator/token/chunk_tokenizer.py @@ -15,6 +15,7 @@ from sparknlp.common import * from sparknlp.annotator.token.tokenizer import Tokenizer, TokenizerModel +from sparknlp.common.annotator_type import AnnotatorType class ChunkTokenizer(Tokenizer): @@ -79,6 +80,8 @@ class ChunkTokenizer(Tokenizer): """ name = 'ChunkTokenizer' + inputAnnotatorTypes = [AnnotatorType.CHUNK] + @keyword_only def __init__(self): super(Tokenizer, self).__init__(classname="com.johnsnowlabs.nlp.annotators.ChunkTokenizer") @@ -86,6 +89,7 @@ def __init__(self): def _create_model(self, java_model): return ChunkTokenizerModel(java_model=java_model) + class ChunkTokenizerModel(TokenizerModel): """Instantiated model of the ChunkTokenizer. @@ -104,6 +108,8 @@ class ChunkTokenizerModel(TokenizerModel): """ name = 'ChunkTokenizerModel' + inputAnnotatorTypes = [AnnotatorType.CHUNK] + @keyword_only def __init__(self, classname="com.johnsnowlabs.nlp.annotators.ChunkTokenizerModel", java_model=None): super(TokenizerModel, self).__init__( diff --git a/python/sparknlp/annotator/token/recursive_tokenizer.py b/python/sparknlp/annotator/token/recursive_tokenizer.py index f2ad15c069a7..1243782201b5 100755 --- a/python/sparknlp/annotator/token/recursive_tokenizer.py +++ b/python/sparknlp/annotator/token/recursive_tokenizer.py @@ -14,6 +14,7 @@ """Contains classes for the RecursiveTokenizer.""" from sparknlp.common import * +from sparknlp.common.annotator_type import AnnotatorType class RecursiveTokenizer(AnnotatorApproach): @@ -82,6 +83,8 @@ class RecursiveTokenizer(AnnotatorApproach): """ name = 'RecursiveTokenizer' + inputAnnotatorTypes = [AnnotatorType.DOCUMENT] + prefixes = Param(Params._dummy(), "prefixes", "strings to be considered independent tokens when found at the beginning of a word", @@ -171,6 +174,7 @@ def __init__(self, classname="com.johnsnowlabs.nlp.annotators.RecursiveTokenizer def _create_model(self, java_model): return RecursiveTokenizerModel(java_model=java_model) + class RecursiveTokenizerModel(AnnotatorModel): """Instantiated model of the RecursiveTokenizer. 
@@ -189,6 +193,8 @@ class RecursiveTokenizerModel(AnnotatorModel): """ name = 'RecursiveTokenizerModel' + inputAnnotatorTypes = [AnnotatorType.DOCUMENT] + def __init__(self, classname="com.johnsnowlabs.nlp.annotators.RecursiveTokenizerModel", java_model=None): super(RecursiveTokenizerModel, self).__init__( classname=classname, diff --git a/python/sparknlp/annotator/token/regex_tokenizer.py b/python/sparknlp/annotator/token/regex_tokenizer.py index 2d190336eb98..125d2478f533 100755 --- a/python/sparknlp/annotator/token/regex_tokenizer.py +++ b/python/sparknlp/annotator/token/regex_tokenizer.py @@ -15,6 +15,7 @@ from sparknlp.common import * +from sparknlp.common.annotator_type import AnnotatorType class RegexTokenizer(AnnotatorModel): @@ -81,6 +82,8 @@ class RegexTokenizer(AnnotatorModel): name = "RegexTokenizer" + inputAnnotatorTypes = [AnnotatorType.DOCUMENT] + @keyword_only def __init__(self): super(RegexTokenizer, self).__init__(classname="com.johnsnowlabs.nlp.annotators.RegexTokenizer") diff --git a/python/sparknlp/annotator/token/token2_chunk.py b/python/sparknlp/annotator/token/token2_chunk.py index f740ae6c992e..c3535f6f32bd 100755 --- a/python/sparknlp/annotator/token/token2_chunk.py +++ b/python/sparknlp/annotator/token/token2_chunk.py @@ -15,6 +15,7 @@ from sparknlp.common import * +from sparknlp.common.annotator_type import AnnotatorType class Token2Chunk(AnnotatorModel): @@ -67,6 +68,8 @@ class Token2Chunk(AnnotatorModel): """ name = "Token2Chunk" + inputAnnotatorTypes = [AnnotatorType.TOKEN] + def __init__(self): super(Token2Chunk, self).__init__(classname="com.johnsnowlabs.nlp.annotators.Token2Chunk") diff --git a/python/sparknlp/annotator/token/tokenizer.py b/python/sparknlp/annotator/token/tokenizer.py index 74c24e299b65..f584526457a6 100755 --- a/python/sparknlp/annotator/token/tokenizer.py +++ b/python/sparknlp/annotator/token/tokenizer.py @@ -15,6 +15,7 @@ from sparknlp.common import * +from sparknlp.common.annotator_type import AnnotatorType class Tokenizer(AnnotatorApproach): @@ -87,6 +88,10 @@ class Tokenizer(AnnotatorApproach): +-----------------------------------------------------------------------+ """ + name = 'Tokenizer' + + inputAnnotatorTypes = [AnnotatorType.DOCUMENT] + targetPattern = Param(Params._dummy(), "targetPattern", "pattern to grab from text as token candidates. Defaults \S+", @@ -147,8 +152,6 @@ class Tokenizer(AnnotatorApproach): "Set the maximum allowed length for each token", typeConverter=TypeConverters.toInt) - name = 'Tokenizer' - @keyword_only def __init__(self): super(Tokenizer, self).__init__(classname="com.johnsnowlabs.nlp.annotators.Tokenizer") @@ -429,6 +432,7 @@ def setMaxLength(self, value): def _create_model(self, java_model): return TokenizerModel(java_model=java_model) + class TokenizerModel(AnnotatorModel): """Tokenizes raw text into word pieces, tokens. Identifies tokens with tokenization open standards. 
A few rules will help customizing it if @@ -453,6 +457,8 @@ class TokenizerModel(AnnotatorModel): """ name = "TokenizerModel" + inputAnnotatorTypes = [AnnotatorType.DOCUMENT] + exceptions = Param(Params._dummy(), "exceptions", "Words that won't be affected by tokenization rules", diff --git a/python/sparknlp/annotator/ws/word_segmenter.py b/python/sparknlp/annotator/ws/word_segmenter.py index 939767af8c44..ed7d62c135f9 100755 --- a/python/sparknlp/annotator/ws/word_segmenter.py +++ b/python/sparknlp/annotator/ws/word_segmenter.py @@ -14,6 +14,7 @@ """Contains classes for the WordSegmenter.""" from sparknlp.common import * +from sparknlp.common.annotator_type import AnnotatorType class WordSegmenterApproach(AnnotatorApproach): @@ -99,6 +100,8 @@ class WordSegmenterApproach(AnnotatorApproach): """ name = "WordSegmenterApproach" + inputAnnotatorTypes = [AnnotatorType.DOCUMENT] + posCol = Param(Params._dummy(), "posCol", "column of Array of POS tags that match tokens", @@ -320,6 +323,8 @@ class WordSegmenterModel(AnnotatorModel): """ name = "WordSegmenterModel" + inputAnnotatorTypes = [AnnotatorType.DOCUMENT] + enableRegexTokenizer = Param(Params._dummy(), "enableRegexTokenizer", "Whether to use RegexTokenizer before segmentation. Useful for multilingual text", diff --git a/python/sparknlp/base/chunk2_doc.py b/python/sparknlp/base/chunk2_doc.py index 9b50c2c18311..a12758c20adf 100644 --- a/python/sparknlp/base/chunk2_doc.py +++ b/python/sparknlp/base/chunk2_doc.py @@ -14,6 +14,8 @@ """Contains classes for Chunk2Doc.""" from pyspark import keyword_only + +from sparknlp.common.annotator_type import AnnotatorType from sparknlp.internal import AnnotatorTransformer from sparknlp.common import AnnotatorProperties @@ -72,6 +74,8 @@ class Chunk2Doc(AnnotatorTransformer, AnnotatorProperties): name = "Chunk2Doc" + inputAnnotatorTypes = [AnnotatorType.CHUNK] + @keyword_only def __init__(self): super(Chunk2Doc, self).__init__(classname="com.johnsnowlabs.nlp.Chunk2Doc") diff --git a/python/sparknlp/base/doc2_chunk.py b/python/sparknlp/base/doc2_chunk.py index 576e67613cfb..fe6f07692c35 100644 --- a/python/sparknlp/base/doc2_chunk.py +++ b/python/sparknlp/base/doc2_chunk.py @@ -15,6 +15,8 @@ from pyspark import keyword_only from pyspark.ml.param import TypeConverters, Params, Param + +from sparknlp.common.annotator_type import AnnotatorType from sparknlp.internal import AnnotatorTransformer from sparknlp.common import AnnotatorProperties @@ -84,6 +86,7 @@ class Doc2Chunk(AnnotatorTransformer, AnnotatorProperties): -------- Chunk2Doc : for converting `CHUNK` annotations to `DOCUMENT` """ + inputAnnotatorTypes = [AnnotatorType.DOCUMENT] chunkCol = Param(Params._dummy(), "chunkCol", "column that contains string. Must be part of DOCUMENT", typeConverter=TypeConverters.toString) startCol = Param(Params._dummy(), "startCol", "column that has a reference of where chunk begins", typeConverter=TypeConverters.toString) diff --git a/python/sparknlp/base/table_assembler.py b/python/sparknlp/base/table_assembler.py index caf5469d3444..d04ce5529eac 100644 --- a/python/sparknlp/base/table_assembler.py +++ b/python/sparknlp/base/table_assembler.py @@ -13,6 +13,7 @@ # limitations under the License. 
"""Contains classes for the TableAssembler.""" from sparknlp.common import * +from sparknlp.common.annotator_type import AnnotatorType class TableAssembler(AnnotatorModel): @@ -80,6 +81,8 @@ class TableAssembler(AnnotatorModel): """ name = "TableAssembler" + inputAnnotatorTypes = [AnnotatorType.DOCUMENT] + inputFormat = Param( Params._dummy(), "inputFormat", diff --git a/python/sparknlp/base/token_assembler.py b/python/sparknlp/base/token_assembler.py index f9ae85118ee7..7b2db5187e00 100644 --- a/python/sparknlp/base/token_assembler.py +++ b/python/sparknlp/base/token_assembler.py @@ -15,6 +15,8 @@ from pyspark import keyword_only from pyspark.ml.param import TypeConverters, Params, Param + +from sparknlp.common.annotator_type import AnnotatorType from sparknlp.internal import AnnotatorTransformer from sparknlp.common import AnnotatorProperties @@ -94,6 +96,9 @@ class TokenAssembler(AnnotatorTransformer, AnnotatorProperties): """ name = "TokenAssembler" + + inputAnnotatorTypes = [AnnotatorType.DOCUMENT, AnnotatorType.TOKEN] + preservePosition = Param(Params._dummy(), "preservePosition", "whether to preserve the actual position of the tokens or reduce them to one space", typeConverter=TypeConverters.toBoolean) @keyword_only diff --git a/python/sparknlp/common/__init__.py b/python/sparknlp/common/__init__.py index 2a9d96975c1a..8a83c082e825 100644 --- a/python/sparknlp/common/__init__.py +++ b/python/sparknlp/common/__init__.py @@ -21,3 +21,4 @@ from sparknlp.common.recursive_annotator_approach import * from sparknlp.common.storage import * from sparknlp.common.utils import * +from sparknlp.common.annotator_type import * diff --git a/python/sparknlp/common/annotator_properties.py b/python/sparknlp/common/annotator_properties.py index 09e61f9866ee..ca9f715743c4 100644 --- a/python/sparknlp/common/annotator_properties.py +++ b/python/sparknlp/common/annotator_properties.py @@ -17,14 +17,20 @@ class AnnotatorProperties(Params): + + inputAnnotatorTypes = [] + optionalInputAnnotatorTypes = [] + inputCols = Param(Params._dummy(), "inputCols", "previous annotations columns, if renamed", typeConverter=TypeConverters.toListString) + outputCol = Param(Params._dummy(), "outputCol", "output annotation column. can be left default.", typeConverter=TypeConverters.toString) + lazyAnnotator = Param(Params._dummy(), "lazyAnnotator", "Whether this AnnotatorModel acts as lazy in RecursivePipelines", @@ -39,10 +45,35 @@ def setInputCols(self, *value): *value : str Input columns for the annotator """ - if len(value) == 1 and type(value[0]) == list: - return self._set(inputCols=value[0]) + if type(value[0]) == str or type(value[0]) == list: + self.inputColsValidation(value) + if len(value) == 1 and type(value[0]) == list: + return self._set(inputCols=value[0]) + else: + return self._set(inputCols=list(value)) else: - return self._set(inputCols=list(value)) + raise TypeError("InputCols datatype not supported. It must be either str or list") + + def inputColsValidation(self, value): + actual_columns = len(value) + if type(value[0]) == list: + actual_columns = len(value[0]) + + expected_columns = len(self.inputAnnotatorTypes) + + if len(self.optionalInputAnnotatorTypes) == 0: + if actual_columns != expected_columns: + raise TypeError( + f"setInputCols in {self.uid} expecting {expected_columns} columns. " + f"Provided column amount: {actual_columns}. 
" + f"Which should be columns from the following annotators: {self.inputAnnotatorTypes}") + else: + expected_columns = expected_columns + len(self.optionalInputAnnotatorTypes) + if not (actual_columns == len(self.inputAnnotatorTypes) or actual_columns == expected_columns): + raise TypeError( + f"setInputCols in {self.uid} expecting at least {len(self.inputAnnotatorTypes)} columns. " + f"Provided column amount: {actual_columns}. " + f"Which should be columns from at least the following annotators: {self.inputAnnotatorTypes}") def getInputCols(self): """Gets current column names of input annotations.""" @@ -79,4 +110,3 @@ def getLazyAnnotator(self): RecursivePipeline. """ return self.getOrDefault(self.lazyAnnotator) - diff --git a/python/sparknlp/common/annotator_type.py b/python/sparknlp/common/annotator_type.py new file mode 100644 index 000000000000..2d0eb1ed54c9 --- /dev/null +++ b/python/sparknlp/common/annotator_type.py @@ -0,0 +1,37 @@ +# Copyright 2017-2022 John Snow Labs +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +class AnnotatorType(object): + AUDIO = "audio" + DOCUMENT = "document" + IMAGE = "image" + TOKEN = "token" + WORDPIECE = "wordpiece" + WORD_EMBEDDINGS = "word_embeddings" + SENTENCE_EMBEDDINGS = "sentence_embeddings" + CATEGORY = "category" + DATE = "date" + ENTITY = "entity" + SENTIMENT = "sentiment" + POS = "pos" + CHUNK = "chunk" + NAMED_ENTITY = "named_entity" + NEGEX = "negex" + DEPENDENCY = "dependency" + LABELED_DEPENDENCY = "labeled_dependency" + LANGUAGE = "language" + NODE = "node" + TABLE = "table" + DUMMY = "dummy" diff --git a/python/test/annotator/er/entity_ruler_test.py b/python/test/annotator/er/entity_ruler_test.py index 20f2cf2b254d..f371f38d1075 100644 --- a/python/test/annotator/er/entity_ruler_test.py +++ b/python/test/annotator/er/entity_ruler_test.py @@ -40,5 +40,28 @@ def runTest(self): pipeline = Pipeline(stages=[document_assembler, tokenizer, entity_ruler]) model = pipeline.fit(self.data) result = model.transform(self.data) - assert result.select("entity").count() > 0 + self.assertTrue(result.select("entity").count() > 0) + + +@pytest.mark.fast +class EntityRulerOneColumnTestSpec(unittest.TestCase): + + def setUp(self): + self.data = SparkContextForTest.spark.createDataFrame([["John Snow lives in Winterfell"]]).toDF("text") + self.path = os.getcwd() + "/../src/test/resources/entity-ruler/keywords_only.json" + + def runTest(self): + document_assembler = DocumentAssembler().setInputCol("text").setOutputCol("document") + + entity_ruler = EntityRulerApproach() \ + .setInputCols("document") \ + .setOutputCol("entity") \ + .setPatternsResource(self.path) + + pipeline = Pipeline(stages=[document_assembler, entity_ruler]) + model = pipeline.fit(self.data) + result = model.transform(self.data) + self.assertTrue(result.select("entity").count() > 0) + + diff --git a/python/test/common/annotator_properties_test.py b/python/test/common/annotator_properties_test.py new file mode 100644 index 000000000000..d845bde0fe9c --- /dev/null +++ 
b/python/test/common/annotator_properties_test.py @@ -0,0 +1,77 @@ +import unittest + +import pytest + +from sparknlp.annotator import * +from sparknlp.base import * +from test.util import SparkContextForTest + + +@pytest.mark.fast +class AnnotatorPropertiesTest(unittest.TestCase): + + def setUp(self): + self.title = "Neopets - The gaming website for all your needs." + self.desc = "Peter Pipers employees are picking pecks of pickled peppers" + self.data = SparkContextForTest.spark \ + .createDataFrame([[self.title, self.desc]]).toDF("title", "description") + + def runTest(self): + + document_assembler_title = DocumentAssembler() \ + .setInputCol("title") \ + .setOutputCol("document_title") + + document_assembler_desc = DocumentAssembler() \ + .setInputCol("description") \ + .setOutputCol("document_desc") + + sentence_title = SentenceDetector() \ + .setInputCols(["document_title"]) \ + .setOutputCol("sentence_title") + + sentence_desc = SentenceDetector() \ + .setInputCols("document_desc") \ + .setOutputCol("sentence_desc") + + pipeline = Pipeline(stages=[ + document_assembler_title, + document_assembler_desc, + sentence_title, + sentence_desc, + ]) + + result_df = pipeline.fit(self.data).transform(self.data) + + sentence_title_output = result_df.select("sentence_title.result").collect()[0][0][0] + self.assertEqual(sentence_title_output, self.title) + + sentence_desc_output = result_df.select("sentence_desc.result").collect()[0][0][0] + self.assertEqual(sentence_desc_output, self.desc) + +@pytest.mark.fast +class AnnotatorPropertiesTestInvalidNumberOfColumns(unittest.TestCase): + + def setUp(self): + self.array_input = ["document_desc", "document_title"] + self.tuple_input = ("document_desc", "document_title") + + def runTest(self): + + self.assertRaises(TypeError, SentenceDetector().setInputCols, self.array_input) + + self.assertRaises(TypeError, SentenceDetector().setInputCols, self.tuple_input) + + +@pytest.mark.fast +class AnnotatorPropertiesTestInvalidInput(unittest.TestCase): + + def setUp(self): + self.dict_input = {"document_desc": "document_title"} + self.int_input = 4 + + def runTest(self): + + self.assertRaises(TypeError, SentenceDetector().setInputCols, self.dict_input) + + self.assertRaises(TypeError, SentenceDetector().setInputCols, self.int_input)
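
The behavioral core of this patch is AnnotatorProperties.inputColsValidation: setInputCols now rejects a column list whose length does not match the annotator's declared inputAnnotatorTypes, and rejects input datatypes other than str and list outright. Below is a minimal sketch of the resulting behavior, not part of the patch itself; it assumes the patched sparknlp is installed and uses illustrative column names.

import sparknlp
from sparknlp.annotator import SentenceDetector

sparknlp.start()  # annotators wrap JVM objects, so a live Spark session is required

detector = SentenceDetector()

# SentenceDetector declares inputAnnotatorTypes = [AnnotatorType.DOCUMENT],
# so exactly one input column passes validation:
detector.setInputCols("document")      # accepted
detector.setInputCols(["document"])    # accepted: single-element list

# A wrong column count now raises TypeError up front instead of failing later in the JVM:
# detector.setInputCols(["document_desc", "document_title"])

# Unsupported datatypes (tuple, dict, int, ...) also raise TypeError:
# detector.setInputCols(("document_desc", "document_title"))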
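
Annotators that declare optionalInputAnnotatorTypes (EntityRulerApproach, EntityRulerModel, GraphExtraction) accept either the required column count alone or the required plus all optional columns. The sketch below, under the same assumptions and with a hypothetical patterns file path, mirrors the two pipelines exercised in entity_ruler_test.py:

import sparknlp
from sparknlp.annotator import EntityRulerApproach

sparknlp.start()

# DOCUMENT is required and TOKEN is optional, so both forms validate:
ruler = EntityRulerApproach() \
    .setInputCols("document") \
    .setOutputCol("entity") \
    .setPatternsResource("entity-ruler/keywords_only.json")  # hypothetical path

ruler_with_tokens = EntityRulerApproach() \
    .setInputCols(["document", "token"]) \
    .setOutputCol("entity") \
    .setPatternsResource("entity-ruler/keywords_only.json")  # hypothetical path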
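
For GraphExtraction, which declares three required types (DOCUMENT, TOKEN, NAMED_ENTITY) and two optional ones (DEPENDENCY, LABELED_DEPENDENCY), the check accepts exactly three or exactly five columns; four columns raise TypeError, since only the two exact counts pass. A final sketch under the same assumptions, with made-up column names:

import sparknlp
from sparknlp.annotator import GraphExtraction

sparknlp.start()

graph = GraphExtraction() \
    .setInputCols(["document", "token", "entities"]) \
    .setOutputCol("graph")

# Required plus both optional dependency columns (3 + 2 = 5) also validates:
graph_full = GraphExtraction() \
    .setInputCols(["document", "token", "entities", "dependencies", "labeled_dependencies"]) \
    .setOutputCol("graph")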