diff --git a/docs/en/annotators.md b/docs/en/annotators.md
index a5199db492..53d5ef6fc7 100644
--- a/docs/en/annotators.md
+++ b/docs/en/annotators.md
@@ -69,8 +69,6 @@ There are two types of Annotators:
 {% include templates/anno_table_entry.md path="" name="ImageAssembler" summary="Prepares images read by Spark into a format that is processable by Spark NLP."%}
 {% include templates/anno_table_entry.md path="" name="LanguageDetectorDL" summary="Language Identification and Detection by using CNN and RNN architectures in TensorFlow."%}
 {% include templates/anno_table_entry.md path="" name="Lemmatizer" summary="Finds lemmas out of words with the objective of returning a base dictionary word."%}
-{% include templates/licensed_table_entry.md name="LightDeIdentification" summary="Light version of DeIdentification."%}
-{% include templates/licensed_table_entry.md name="MultiChunk2Doc" summary="Merges a given chunks to create a document."%}
 {% include templates/anno_table_entry.md path="" name="MultiClassifierDL" summary="Multi-label Text Classification."%}
 {% include templates/anno_table_entry.md path="" name="MultiDateMatcher" summary="Matches standard date formats into a provided format."%}
 {% include templates/anno_table_entry.md path="" name="MultiDocumentAssembler" summary="Prepares data into a format that is processable by Spark NLP."%}
diff --git a/docs/en/licensed_annotator_entries/FewShotAssertionClassifierModel.md b/docs/en/licensed_annotator_entries/FewShotAssertionClassifierModel.md
new file mode 100644
index 0000000000..edfdf0cbbe
--- /dev/null
+++ b/docs/en/licensed_annotator_entries/FewShotAssertionClassifierModel.md
@@ -0,0 +1,182 @@
+{%- capture title -%}
+FewShotAssertionClassifierModel
+{%- endcapture -%}
+
+{%- capture model -%}
+model
+{%- endcapture -%}
+
+{%- capture model_description -%}
+
+FewShotAssertionClassifierModel performs assertion classification using large (LLM-based)
+few-shot classifiers based on the SetFit approach.
+
+Parameters:
+
+- `batchSize` *(Int)*: Batch size.
+
+- `caseSensitive` *(Bool)*: Whether the classifier is sensitive to text casing.
+
+- `maxSentenceLength` *(Int)*: The maximum length of the input text.
+
+{%- endcapture -%}
+
+{%- capture model_input_anno -%}
+DOCUMENT, CHUNK
+{%- endcapture -%}
+
+{%- capture model_output_anno -%}
+ASSERTION
+{%- endcapture -%}
+
+{%- capture model_python_medical -%}
+
+from johnsnowlabs import nlp, medical
+from pyspark.ml import Pipeline
+
+document_assembler = nlp.DocumentAssembler()\
+    .setInputCol("text")\
+    .setOutputCol("document")
+
+sentence_detector = nlp.SentenceDetector()\
+    .setInputCol("document")\
+    .setOutputCol("sentence")
+
+tokenizer = nlp.Tokenizer()\
+    .setInputCols(["sentence"])\
+    .setOutputCol("token")
+
+embeddings = nlp.WordEmbeddingsModel.pretrained("embeddings_clinical", "en", "clinical/models")\
+    .setInputCols(["sentence", "token"])\
+    .setOutputCol("embeddings")\
+    .setCaseSensitive(False)
+
+ner = medical.NerModel.pretrained("ner_jsl", "en", "clinical/models")\
+    .setInputCols(["sentence", "token", "embeddings"])\
+    .setOutputCol("ner")
+
+ner_converter = medical.NerConverterInternal()\
+    .setInputCols(["sentence", "token", "ner"])\
+    .setWhiteList("Disease_Syndrome_Disorder", "Hypertension")\
+    .setOutputCol("ner_chunk")
+
+few_shot_assertion_classifier = medical.FewShotAssertionClassifierModel.pretrained()\
+    .setInputCols(["sentence", "ner_chunk"])\
+    .setOutputCol("assertion")
+
+data = spark.createDataFrame(
+    [["Includes hypertension and chronic obstructive pulmonary disease."]]
+).toDF("text")
+
+results = Pipeline() \
+    .setStages([
+        document_assembler,
+        sentence_detector,
+        tokenizer,
+        embeddings,
+        ner,
+        ner_converter,
+        few_shot_assertion_classifier]) \
+    .fit(data) \
+    .transform(data)
+
+results.selectExpr("explode(assertion) as assertion")\
+    .selectExpr("assertion.result", "assertion.metadata.chunk", "assertion.metadata.confidence")\
+    .show(truncate=False)
+
+## Result
+
++--------+----------------------------+-----------+
+|  result|                       chunk| confidence|
++--------+----------------------------+-----------+
+| present|                hypertension|        1.0|
+|  absent| arteriovenous malformations|        1.0|
++--------+----------------------------+-----------+
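+
+# A minimal sketch (an addition, not part of the original example): the same
+# fitted pipeline can be wrapped in a Spark NLP LightPipeline for quick
+# single-text inference without building a DataFrame. `pipeline_model` is
+# assumed to hold the fitted PipelineModel, i.e. the value returned by
+# `.fit(data)` above before `.transform(data)` is called.
+light_model = nlp.LightPipeline(pipeline_model)
+annotations = light_model.fullAnnotate("Includes hypertension and chronic obstructive pulmonary disease.")[0]
+for assertion in annotations["assertion"]:
+    # each annotation carries the predicted label plus chunk and confidence metadata
+    print(assertion.result, assertion.metadata.get("chunk"), assertion.metadata.get("confidence"))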
+
+{%- endcapture -%}
+
+{%- capture model_scala_medical -%}
+import spark.implicits._
+
+val documentAssembler = new DocumentAssembler()
+  .setInputCol("text")
+  .setOutputCol("document")
+
+val sentenceDetector = new SentenceDetector()
+  .setInputCols(Array("document"))
+  .setOutputCol("sentence")
+
+val tokenizer = new Tokenizer()
+  .setInputCols(Array("sentence"))
+  .setOutputCol("token")
+
+val embeddings = WordEmbeddingsModel
+  .pretrained("embeddings_clinical", "en", "clinical/models")
+  .setInputCols(Array("sentence", "token"))
+  .setOutputCol("embeddings")
+  .setCaseSensitive(false)
+
+val ner = MedicalNerModel
+  .pretrained("ner_jsl", "en", "clinical/models")
+  .setInputCols(Array("sentence", "token", "embeddings"))
+  .setOutputCol("ner")
+
+val nerConverter = new NerConverter()
+  .setInputCols(Array("sentence", "token", "ner"))
+  .setWhiteList("Disease_Syndrome_Disorder", "Hypertension")
+  .setOutputCol("ner_chunk")
+
+val fewShotAssertionClassifier = FewShotAssertionClassifierModel
+  .pretrained()
+  .setInputCols(Array("sentence", "ner_chunk"))
+  .setBatchSize(1)
+  .setOutputCol("assertion")
+
+val pipeline = new Pipeline()
+  .setStages(Array(
+    documentAssembler,
+    sentenceDetector,
+    tokenizer,
+    embeddings,
+    ner,
+    nerConverter,
+    fewShotAssertionClassifier))
+
+val model = pipeline.fit(Seq("").toDS.toDF("text"))
+val results = model.transform(Seq("Includes hypertension and chronic obstructive pulmonary disease.").toDS.toDF("text"))
+
+results.selectExpr("explode(assertion) as assertion")
+  .selectExpr("assertion.result", "assertion.metadata.chunk", "assertion.metadata.confidence")
+  .show(truncate = false)
+
+// Result
+
++-------+-------------------------------------+----------+
+|result |chunk                                |confidence|
++-------+-------------------------------------+----------+
+|present|hypertension                         |1.0       |
+|present|chronic obstructive pulmonary disease|1.0       |
+|absent |arteriovenous malformations          |1.0       |
+|absent |vascular malformation                |0.9999997 |
++-------+-------------------------------------+----------+
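+
+// A minimal sketch (an addition, not part of the original example): the fitted
+// PipelineModel can also be wrapped in a LightPipeline
+// (com.johnsnowlabs.nlp.LightPipeline) for quick single-text inference without
+// building a DataFrame.
+import com.johnsnowlabs.nlp.LightPipeline
+
+val lightModel = new LightPipeline(model)
+val annotated = lightModel.fullAnnotate("Includes hypertension and chronic obstructive pulmonary disease.")
+// print the assertion annotations (label plus chunk and confidence metadata)
+annotated("assertion").foreach(println)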
+
+{%- endcapture -%}
+
+{%- capture model_api_link -%}
+[FewShotAssertionClassifierModel](https://nlp.johnsnowlabs.com/licensed/api/com/johnsnowlabs/nlp/annotators/classification/FewShotAssertionClassifierModel.html)
+{%- endcapture -%}
+
+{%- capture model_python_api_link -%}
+[FewShotAssertionClassifierModel](https://nlp.johnsnowlabs.com/licensed/api/python/reference/autosummary/sparknlp_jsl/annotator/classification/few_shot_assertion_classifier/index.html)
+{%- endcapture -%}
+
+
+{% include templates/licensed_approach_model_medical_fin_leg_template.md
+title=title
+model=model
+model_description=model_description
+model_input_anno=model_input_anno
+model_output_anno=model_output_anno
+model_python_medical=model_python_medical
+model_scala_medical=model_scala_medical
+model_api_link=model_api_link
+model_python_api_link=model_python_api_link
+%}
diff --git a/docs/en/licensed_annotators.md b/docs/en/licensed_annotators.md
index 1a79d5e5de..29c9a8d24c 100644
--- a/docs/en/licensed_annotators.md
+++ b/docs/en/licensed_annotators.md
@@ -69,12 +69,15 @@ Check out the [Spark NLP Annotators page](https://nlp.johnsnowlabs.com/docs/en/a
 {% include templates/licensed_table_entry.md name="EntityRulerInternal" summary="This annotator match exact strings or regex patterns provided in a file against a Document and assigns them an named entity."%}
 {% include templates/licensed_table_entry.md name="FeaturesAssembler" summary="Collects features from different columns."%}
 {% include templates/licensed_table_entry.md name="FewShotClassifier" summary="This Annotator specifically target few-shot classification tasks, which involve training a model to make accurate predictions with limited labeled data."%}
+{% include templates/licensed_table_entry.md name="FewShotAssertionClassifierModel" summary="Performs assertion classification using large (LLM-based) few-shot classifiers based on the SetFit approach."%}
 {% include templates/licensed_table_entry.md name="Flattener" summary="`Flattener` annotator in Spark NLP converts annotation results into a simplified DataFrame format for easier analysis and interpretation."%}
 {% include templates/licensed_table_entry.md name="GenericClassifier" summary="Creates a generic single-label classifier which uses pre-generated Tensorflow graphs."%}
 {% include templates/licensed_table_entry.md name="GenericLogRegClassifier" summary="Is a derivative of GenericClassifier which implements a multinomial logistic regression."%}
 {% include templates/licensed_table_entry.md name="GenericSVMClassifier" summary="Creates a generic single-label classifier which uses pre-generated Tensorflow graphs."%}
 {% include templates/licensed_table_entry.md name="InternalDocumentSplitter" summary="This annotator splits large documents into small documents."%}
 {% include templates/licensed_table_entry.md name="IOBTagger" summary="Merges token tags and NER labels from chunks in the specified format."%}
+{% include templates/licensed_table_entry.md name="LightDeIdentification" summary="Light version of DeIdentification."%}
+{% include templates/licensed_table_entry.md name="MultiChunk2Doc" summary="Merges given chunks to create a document."%}
 {% include templates/licensed_table_entry.md name="NameChunkObfuscator" summary="This annotator allows to transform a dataset with an Input Annotation of type CHUNK, into its obfuscated version of by obfuscating the given CHUNKS."%}
 {% include templates/licensed_table_entry.md name="NerChunker" summary="Extracts phrases that fits into a known pattern using the NER tags."%}
 {% include templates/licensed_table_entry.md name="NerConverterInternal" summary="Converts a IOB or IOB2 representation of NER to a user-friendly one, by associating the tokens of recognized entities and their label."%}