JohnSnowLabs · C-K-Loan · Jul 13, 2024 · Feb 29, 2024 · May 18, 2024
diff --git a/examples/colab/component_examples/classifiers/NLU_MPNetForSequenceClassification.ipynb b/examples/colab/component_examples/classifiers/NLU_MPNetForSequenceClassification.ipynb
diff --git a/nlu/components/classifiers/seq_mpnet/__init__.py b/nlu/components/classifiers/seq_mpnet/__init__.py
diff --git a/nlu/components/classifiers/seq_mpnet/seq_mpnet.py b/nlu/components/classifiers/seq_mpnet/seq_mpnet.py
@@ -0,0 +1,25 @@
+from sparknlp.annotator import MPNetForSequenceClassification
+
+
+class SeqMPNetClassifier:
+    """Factory for Spark NLP ``MPNetForSequenceClassification`` annotators.
+
+    Both factories configure the annotator to read the ``token`` and
+    ``sentence`` columns and to write its output to the ``category`` column.
+    """
+
+    @staticmethod
+    def get_default_model():
+        """Return the annotator from ``pretrained()`` called with no arguments."""
+        model = MPNetForSequenceClassification.pretrained()
+        return model.setInputCols(["token", "sentence"]).setOutputCol("category")
+
+    @staticmethod
+    def get_pretrained_model(name, language, bucket=None):
+        """Return the pretrained annotator selected by the given arguments.
+
+        ``name``, ``language`` and ``bucket`` are forwarded unchanged to
+        ``MPNetForSequenceClassification.pretrained``.
+        """
+        model = MPNetForSequenceClassification.pretrained(name, language, bucket)
+        return model.setInputCols(["token", "sentence"]).setOutputCol("category")
diff --git a/nlu/spellbook.py b/nlu/spellbook.py
@@ -3560,6 +3560,7 @@ class Spellbook:
             'en.classify.ag_news.longformer': 'longformer_base_sequence_classifier_ag_news',
             'en.classify.ag_news.xlnet': 'xlnet_base_sequence_classifier_ag_news',
             'en.classify.albert.ag_news': 'albert_base_sequence_classifier_ag_news',
+            'en.classify.mpnet.ukr_message': 'mpnet_sequence_classifier_ukr_message',
             'en.classify.albert.imdb': 'albert_base_sequence_classifier_imdb',
             'en.classify.bbc.roberta.by_abhishek': 'roberta_classifier_autonlp_bbc_37249301',
             'en.classify.bert': 'bert_sequence_classifier_antisemitism',
@@ -16632,6 +16633,7 @@ class Spellbook:
                              'mpnet_retriever_squad2': 'MPNetEmbeddings',
                              'mpnet_snli': 'MPNetEmbeddings',
                              'mpnet_snli_negatives': 'MPNetEmbeddings',
+                             'mpnet_sequence_classifier_ukr_message': 'MPNetForSequenceClassification',
                              'multiclassifierdl_hoc': 'MultiClassifierDLModel',
                              'multiclassifierdl_use_e2e': 'MultiClassifierDLModel',
                              'multiclassifierdl_use_toxic': 'MultiClassifierDLModel',

diff --git a/nlu/universe/annotator_class_universe.py b/nlu/universe/annotator_class_universe.py
@@ -109,6 +109,7 @@ class AnnoClassRef:
         A_N.LONGFORMER_FOR_TOKEN_CLASSIFICATION: 'LongformerForTokenClassification',
         A_N.MARIAN_TRANSFORMER: 'MarianTransformer',
         A_N.MPNET_SENTENCE_EMBEDDINGS: 'MPNetEmbeddings',
+        A_N.MPNET_FOR_SEQUENCE_CLASSIFICATION: 'MPNetForSequenceClassification',
         A_N.ROBERTA_EMBEDDINGS: 'RoBertaEmbeddings',
         A_N.ROBERTA_FOR_TOKEN_CLASSIFICATION: 'RoBertaForTokenClassification',
         A_N.ROBERTA_SENTENCE_EMBEDDINGS: 'RoBertaSentenceEmbeddings',

diff --git a/nlu/universe/component_universes.py b/nlu/universe/component_universes.py
@@ -38,6 +38,7 @@
 from nlu.components.classifiers.seq_roberta.seq_roberta import SeqRobertaClassifier
 from nlu.components.classifiers.seq_xlm_roberta.seq_xlm_roberta import SeqXlmRobertaClassifier
 from nlu.components.classifiers.seq_xlnet.seq_xlnet import SeqXlnetClassifier
+from nlu.components.classifiers.seq_mpnet.seq_mpnet import SeqMPNetClassifier 
 from nlu.components.classifiers.span_bert.span_bert import SpanBertClassifier
 from nlu.components.classifiers.span_camembert.span_camembert import SpanCamemBert
 from nlu.components.classifiers.span_deberta.span_deberta import SpanDeBertaClassifier
@@ -2640,6 +2641,25 @@ class ComponentUniverse:
                                             has_storage_ref=True,
                                             is_storage_ref_producer=True,
                                             ),
+        A.MPNET_FOR_SEQUENCE_CLASSIFICATION: partial(NluComponent,
+                                            name=A.MPNET_FOR_SEQUENCE_CLASSIFICATION,
+                                            type=T.TRANSFORMER_SEQUENCE_CLASSIFIER,
+                                            get_default_model=SeqMPNetClassifier.get_default_model,
+                                            get_pretrained_model=SeqMPNetClassifier.get_pretrained_model,
+                                            pdf_extractor_methods={'default': default_classifier_dl_config,
+                                                                   'default_full': default_full_config, },
+                                            pdf_col_name_substitutor=substitute_seq_bert_classifier_cols,
+                                            output_level=L.INPUT_DEPENDENT_DOCUMENT_CLASSIFIER,
+                                            node=NLP_FEATURE_NODES.nodes[A.MPNET_FOR_SEQUENCE_CLASSIFICATION],
+                                            description='MPNetForSequenceClassification can load MPNet Models with sequence classification/regression head on top (a linear layer on top of the pooled output) e.g. for multi-class document classification tasks.',
+                                            provider=ComponentBackends.open_source,
+                                            license=Licenses.open_source,
+                                            computation_context=ComputeContexts.spark,
+                                            output_context=ComputeContexts.spark,
+                                            jsl_anno_class_id=A.MPNET_FOR_SEQUENCE_CLASSIFICATION,
+                                            jsl_anno_py_class=ACR.JSL_anno2_py_class[
+                                            A.MPNET_FOR_SEQUENCE_CLASSIFICATION],
+                                                     ),
         A.ROBERTA_EMBEDDINGS: partial(NluComponent,
                                       name=A.ROBERTA_EMBEDDINGS,
                                       type=T.TOKEN_EMBEDDING,
@@ -2660,7 +2680,6 @@ class ComponentUniverse:
                                       has_storage_ref=True,
                                       is_storage_ref_producer=True,
                                       ),
-
         A.ROBERTA_FOR_TOKEN_CLASSIFICATION: partial(NluComponent,
                                                     name=A.ROBERTA_FOR_TOKEN_CLASSIFICATION,
                                                     type=T.TRANSFORMER_SEQUENCE_CLASSIFIER,

diff --git a/nlu/universe/feature_node_ids.py b/nlu/universe/feature_node_ids.py
@@ -109,13 +109,8 @@ class NLP_NODE_IDS:
     SWIN_IMAGE_CLASSIFICATION = JslAnnoId("swin_image_classification")
     BART_TRANSFORMER = JslAnnoId("bart_transformer")
     INSTRUCTOR_SENTENCE_EMBEDDINGS = JslAnnoId('instructor_sentence_embeddings')
-
     MPNET_SENTENCE_EMBEDDINGS = JslAnnoId('mpnet_sentence_embeddings')
-
-
-
-
-
+    MPNET_FOR_SEQUENCE_CLASSIFICATION = JslAnnoId('mpnet_for_sequence_classification')
     DISTIL_BERT_FOR_ZERO_SHOT_CLASSIFICATION = JslAnnoId('distil_bert_zero_shot')
     BERT_FOR_ZERO_SHOT_CLASSIFICATION = JslAnnoId('bert_zero_shot')
     ROBERTA_FOR_ZERO_SHOT_CLASSIFICATION = JslAnnoId('roberta_zero_shot')

diff --git a/nlu/universe/feature_node_universes.py b/nlu/universe/feature_node_universes.py
@@ -78,6 +78,10 @@ class NLP_FEATURE_NODES:  # or Mode Node?
         A.E5_SENTENCE_EMBEDDINGS: NlpFeatureNode(A.E5_SENTENCE_EMBEDDINGS, [F.DOCUMENT],[F.SENTENCE_EMBEDDINGS]),
         A.MPNET_SENTENCE_EMBEDDINGS: NlpFeatureNode(A.MPNET_SENTENCE_EMBEDDINGS, [F.DOCUMENT], [F.SENTENCE_EMBEDDINGS]),
 
+        A.MPNET_FOR_SEQUENCE_CLASSIFICATION: NlpFeatureNode(A.MPNET_FOR_SEQUENCE_CLASSIFICATION, 
+                                                            [F.DOCUMENT, F.TOKEN],
+                                                            [F.SEQUENCE_CLASSIFICATION]),
+
         A.PARTIALLY_IMPLEMENTED: NlpFeatureNode(A.PARTIALLY_IMPLEMENTED, [F.UNKOWN], [F.UNKOWN]),
 
         A.COREF_SPAN_BERT: NlpFeatureNode(A.COREF_SPAN_BERT, [F.DOCUMENT, F.TOKEN], [F.COREF_TOKEN]),

diff --git a/tests/nlu_core_tests/component_tests/classifier_tests/mpnet_sequence_tests.py b/tests/nlu_core_tests/component_tests/classifier_tests/mpnet_sequence_tests.py
@@ -0,0 +1,34 @@
+import unittest
+
+import nlu
+
+
+class TestSeqMPNetClassifier(unittest.TestCase):
+    """Smoke test for the MPNet sequence classifier NLU reference."""
+
+    def test_mpnet_sequence_classifier(self):
+        """Load the MPNet classifier by its NLU reference and run a prediction."""
+        # The reference has to match the key registered in the spellbook
+        # ('en.classify.mpnet.ukr_message'); note there is no underscore after "mpnet.".
+        pipe = nlu.load("en.classify.mpnet.ukr_message")
+
+        # New data points to classify
+        data = [
+            "I love driving my car.",
+            "The next bus will arrive in 20 minutes.",
+            "Pineapple on pizza is the worst 🤮",
+        ]
+
+        # Predict the classification for each data point
+        df = pipe.predict(data, output_level="document")
+
+        # Print each column of the dataframe to inspect the prediction results
+        for c in df.columns:
+            print(df[c])
+
+        # Fail the test if the pipeline produced no predictions at all.
+        self.assertFalse(df.empty)
+
+
+if __name__ == "__main__":
+    unittest.main()