Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

MPNetForSequenceClassification Integration #250

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view

Large diffs are not rendered by default.

Empty file.
15 changes: 15 additions & 0 deletions nlu/components/classifiers/seq_mpnet/seq_mpnet.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
from sparknlp.annotator import MPNetForSequenceClassification


class SeqMPNetClassifier:
    """Factory helpers that build configured MPNetForSequenceClassification annotators."""

    @staticmethod
    def get_default_model():
        """Return the default pretrained annotator with its columns configured.

        The annotator is set to read the ``token`` and ``sentence`` columns
        and to write its result to the ``category`` column.
        """
        classifier = MPNetForSequenceClassification.pretrained()
        classifier = classifier.setInputCols(["token", "sentence"])
        return classifier.setOutputCol("category")

    @staticmethod
    def get_pretrained_model(name, language, bucket=None):
        """Return a specific pretrained annotator with its columns configured.

        ``name``, ``language`` and ``bucket`` are forwarded positionally to
        ``MPNetForSequenceClassification.pretrained``. The resulting annotator
        is set to read the ``token`` and ``sentence`` columns and to write its
        result to the ``category`` column.
        """
        classifier = MPNetForSequenceClassification.pretrained(name, language, bucket)
        classifier = classifier.setInputCols(["token", "sentence"])
        return classifier.setOutputCol("category")
2 changes: 2 additions & 0 deletions nlu/spellbook.py
Original file line number Diff line number Diff line change
Expand Up @@ -3560,6 +3560,7 @@ class Spellbook:
'en.classify.ag_news.longformer': 'longformer_base_sequence_classifier_ag_news',
'en.classify.ag_news.xlnet': 'xlnet_base_sequence_classifier_ag_news',
'en.classify.albert.ag_news': 'albert_base_sequence_classifier_ag_news',
'en.classify.mpnet.ukr_message': 'mpnet_sequence_classifier_ukr_message',
'en.classify.albert.imdb': 'albert_base_sequence_classifier_imdb',
'en.classify.bbc.roberta.by_abhishek': 'roberta_classifier_autonlp_bbc_37249301',
'en.classify.bert': 'bert_sequence_classifier_antisemitism',
Expand Down Expand Up @@ -16632,6 +16633,7 @@ class Spellbook:
'mpnet_retriever_squad2': 'MPNetEmbeddings',
'mpnet_snli': 'MPNetEmbeddings',
'mpnet_snli_negatives': 'MPNetEmbeddings',
'mpnet_sequence_classifier_ukr_message': 'MPNetForSequenceClassification',
'multiclassifierdl_hoc': 'MultiClassifierDLModel',
'multiclassifierdl_use_e2e': 'MultiClassifierDLModel',
'multiclassifierdl_use_toxic': 'MultiClassifierDLModel',
Expand Down
1 change: 1 addition & 0 deletions nlu/universe/annotator_class_universe.py
Original file line number Diff line number Diff line change
Expand Up @@ -109,6 +109,7 @@ class AnnoClassRef:
A_N.LONGFORMER_FOR_TOKEN_CLASSIFICATION: 'LongformerForTokenClassification',
A_N.MARIAN_TRANSFORMER: 'MarianTransformer',
A_N.MPNET_SENTENCE_EMBEDDINGS: 'MPNetEmbeddings',
A_N.MPNET_FOR_SEQUENCE_CLASSIFICATION: 'MPNetForSequenceClassification',
A_N.ROBERTA_EMBEDDINGS: 'RoBertaEmbeddings',
A_N.ROBERTA_FOR_TOKEN_CLASSIFICATION: 'RoBertaForTokenClassification',
A_N.ROBERTA_SENTENCE_EMBEDDINGS: 'RoBertaSentenceEmbeddings',
Expand Down
21 changes: 20 additions & 1 deletion nlu/universe/component_universes.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,7 @@
from nlu.components.classifiers.seq_roberta.seq_roberta import SeqRobertaClassifier
from nlu.components.classifiers.seq_xlm_roberta.seq_xlm_roberta import SeqXlmRobertaClassifier
from nlu.components.classifiers.seq_xlnet.seq_xlnet import SeqXlnetClassifier
from nlu.components.classifiers.seq_mpnet.seq_mpnet import SeqMPNetClassifier
from nlu.components.classifiers.span_bert.span_bert import SpanBertClassifier
from nlu.components.classifiers.span_camembert.span_camembert import SpanCamemBert
from nlu.components.classifiers.span_deberta.span_deberta import SpanDeBertaClassifier
Expand Down Expand Up @@ -2640,6 +2641,25 @@ class ComponentUniverse:
has_storage_ref=True,
is_storage_ref_producer=True,
),
A.MPNET_FOR_SEQUENCE_CLASSIFICATION: partial(NluComponent,
name=A.MPNET_FOR_SEQUENCE_CLASSIFICATION,
type=T.TRANSFORMER_SEQUENCE_CLASSIFIER,
get_default_model=SeqMPNetClassifier.get_default_model,
get_pretrained_model=SeqMPNetClassifier.get_pretrained_model,
pdf_extractor_methods={'default': default_classifier_dl_config,
'default_full': default_full_config, },
pdf_col_name_substitutor=substitute_seq_bert_classifier_cols,
output_level=L.INPUT_DEPENDENT_DOCUMENT_CLASSIFIER,
node=NLP_FEATURE_NODES.nodes[A.MPNET_FOR_SEQUENCE_CLASSIFICATION],
description='MPNetForSequenceClassification can load MPNet Models with sequence classification/regression head on top (a linear layer on top of the pooled output) e.g. for multi-class document classification tasks.',
provider=ComponentBackends.open_source,
license=Licenses.open_source,
computation_context=ComputeContexts.spark,
output_context=ComputeContexts.spark,
jsl_anno_class_id=A.MPNET_FOR_SEQUENCE_CLASSIFICATION,
jsl_anno_py_class=ACR.JSL_anno2_py_class[
A.MPNET_FOR_SEQUENCE_CLASSIFICATION],
),
A.ROBERTA_EMBEDDINGS: partial(NluComponent,
name=A.ROBERTA_EMBEDDINGS,
type=T.TOKEN_EMBEDDING,
Expand All @@ -2660,7 +2680,6 @@ class ComponentUniverse:
has_storage_ref=True,
is_storage_ref_producer=True,
),

A.ROBERTA_FOR_TOKEN_CLASSIFICATION: partial(NluComponent,
name=A.ROBERTA_FOR_TOKEN_CLASSIFICATION,
type=T.TRANSFORMER_SEQUENCE_CLASSIFIER,
Expand Down
7 changes: 1 addition & 6 deletions nlu/universe/feature_node_ids.py
Original file line number Diff line number Diff line change
Expand Up @@ -109,13 +109,8 @@ class NLP_NODE_IDS:
SWIN_IMAGE_CLASSIFICATION = JslAnnoId("swin_image_classification")
BART_TRANSFORMER = JslAnnoId("bart_transformer")
INSTRUCTOR_SENTENCE_EMBEDDINGS = JslAnnoId('instructor_sentence_embeddings')

MPNET_SENTENCE_EMBEDDINGS = JslAnnoId('mpnet_sentence_embeddings')





MPNET_FOR_SEQUENCE_CLASSIFICATION = JslAnnoId('mpnet_for_sequence_classification')
DISTIL_BERT_FOR_ZERO_SHOT_CLASSIFICATION = JslAnnoId('distil_bert_zero_shot')
BERT_FOR_ZERO_SHOT_CLASSIFICATION = JslAnnoId('bert_zero_shot')
ROBERTA_FOR_ZERO_SHOT_CLASSIFICATION = JslAnnoId('roberta_zero_shot')
Expand Down
4 changes: 4 additions & 0 deletions nlu/universe/feature_node_universes.py
Original file line number Diff line number Diff line change
Expand Up @@ -78,6 +78,10 @@ class NLP_FEATURE_NODES: # or Mode Node?
A.E5_SENTENCE_EMBEDDINGS: NlpFeatureNode(A.E5_SENTENCE_EMBEDDINGS, [F.DOCUMENT],[F.SENTENCE_EMBEDDINGS]),
A.MPNET_SENTENCE_EMBEDDINGS: NlpFeatureNode(A.MPNET_SENTENCE_EMBEDDINGS, [F.DOCUMENT], [F.SENTENCE_EMBEDDINGS]),

A.MPNET_FOR_SEQUENCE_CLASSIFICATION: NlpFeatureNode(A.MPNET_FOR_SEQUENCE_CLASSIFICATION,
[F.DOCUMENT, F.TOKEN],
[F.SEQUENCE_CLASSIFICATION]),

A.PARTIALLY_IMPLEMENTED: NlpFeatureNode(A.PARTIALLY_IMPLEMENTED, [F.UNKOWN], [F.UNKOWN]),

A.COREF_SPAN_BERT: NlpFeatureNode(A.COREF_SPAN_BERT, [F.DOCUMENT, F.TOKEN], [F.COREF_TOKEN]),
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
import unittest
import nlu

class TestSeqMPNetClassifier(unittest.TestCase):
    """Smoke test for the MPNet sequence-classification pipeline."""

    def test_mpnet_sequence_classifier(self):
        """Load the MPNet classifier pipeline and print its predictions."""
        # The reference must match the spellbook key exactly
        # ('en.classify.mpnet.ukr_message'); the previous value had a stray
        # underscore before 'ukr_message' and did not name a registered model.
        pipe = nlu.load("en.classify.mpnet.ukr_message")

        # New data points to classify
        data = [
            "I love driving my car.",
            "The next bus will arrive in 20 minutes.",
            "Pineapple on pizza is the worst 🤮"
        ]

        # Predict the classification for each data point
        df = pipe.predict(data, output_level="document")

        # Print each column of the dataframe to inspect the prediction results
        for c in df.columns:
            print(df[c])


# Allow running this test module directly as a script.
if __name__ == "__main__":
    unittest.main()



Loading