added .md file (#1128)
yigitgull committed Apr 18, 2024
1 parent 83b7a30 commit 50c027e
Showing 3 changed files with 185 additions and 2 deletions.
2 changes: 0 additions & 2 deletions docs/en/annotators.md
@@ -69,8 +69,6 @@ There are two types of Annotators:
{% include templates/anno_table_entry.md path="" name="ImageAssembler" summary="Prepares images read by Spark into a format that is processable by Spark NLP."%}
{% include templates/anno_table_entry.md path="" name="LanguageDetectorDL" summary="Language Identification and Detection by using CNN and RNN architectures in TensorFlow."%}
{% include templates/anno_table_entry.md path="" name="Lemmatizer" summary="Finds lemmas out of words with the objective of returning a base dictionary word."%}
{% include templates/licensed_table_entry.md name="LightDeIdentification" summary="Light version of DeIdentification."%}
{% include templates/licensed_table_entry.md name="MultiChunk2Doc" summary="Merges a given chunks to create a document."%}
{% include templates/anno_table_entry.md path="" name="MultiClassifierDL" summary="Multi-label Text Classification."%}
{% include templates/anno_table_entry.md path="" name="MultiDateMatcher" summary="Matches standard date formats into a provided format."%}
{% include templates/anno_table_entry.md path="" name="MultiDocumentAssembler" summary="Prepares data into a format that is processable by Spark NLP."%}
182 changes: 182 additions & 0 deletions docs/en/licensed_annotator_entries/FewShotAssertionClassifierModel.md
@@ -0,0 +1,182 @@
{%- capture title -%}
FewShotAssertionClassifierModel
{%- endcapture -%}

{%- capture model -%}
model
{%- endcapture -%}

{%- capture model_description -%}

FewShotAssertionClassifierModel performs assertion classification using large (LLM-based)
few-shot classifiers built on the SetFit approach.

Parameters:

- `batchSize` *(Int)*: Batch size

- `caseSensitive` *(Bool)*: Whether the classifier is sensitive to text casing

- `maxSentenceLength` *(Int)*: The maximum length of the input text
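
The parameters above can be set on a pretrained model before it is plugged into a pipeline. A minimal configuration sketch follows (it assumes a licensed `johnsnowlabs` installation and that these parameters expose the usual `set<Param>` setters; the values shown are illustrative, not defaults):

```python
# Minimal sketch, not a full pipeline: assumes a licensed johnsnowlabs install
# and that the parameters listed above expose standard set<Param> setters.
from johnsnowlabs import medical

few_shot_assertion_classifier = (
    medical.FewShotAssertionClassifierModel.pretrained()
    .setInputCols(["sentence", "ner_chunk"])   # DOCUMENT + CHUNK inputs
    .setOutputCol("assertion")                 # ASSERTION output
    .setBatchSize(8)                           # batchSize: rows scored per batch
    .setCaseSensitive(False)                   # caseSensitive: ignore text casing
    .setMaxSentenceLength(512)                 # maxSentenceLength: truncate longer inputs (assumed setter)
)
```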


{%- endcapture -%}

{%- capture model_input_anno -%}
DOCUMENT, CHUNK
{%- endcapture -%}

{%- capture model_output_anno -%}
ASSERTION
{%- endcapture -%}

{%- capture model_python_medical -%}

from johnsnowlabs import nlp, medical
from pyspark.ml import Pipeline

document_assembler = nlp.DocumentAssembler()\
.setInputCol("text")\
.setOutputCol("document")

sentence_detector = nlp.SentenceDetector()\
.setInputCol("document")\
.setOutputCol("sentence")

tokenizer = nlp.Tokenizer()\
.setInputCols(["sentence"])\
.setOutputCol("token")

embeddings = nlp.WordEmbeddingsModel.pretrained("embeddings_clinical", "en", "clinical/models")\
.setInputCols(["sentence", "token"])\
.setOutputCol("embeddings") \
.setCaseSensitive(False)

ner = medical.NerModel.pretrained("ner_jsl", "en", "clinical/models") \
.setInputCols(["sentence", "token", "embeddings"]) \
.setOutputCol("ner")

ner_converter = medical.NerConverterInternal()\
.setInputCols(["sentence", "token", "ner"])\
.setWhiteList(["Disease_Syndrome_Disorder", "Hypertension"])\
.setOutputCol("ner_chunk")

few_shot_assertion_classifier = medical.FewShotAssertionClassifierModel.pretrained()\
.setInputCols(["sentence", "ner_chunk"])\
.setOutputCol("assertion")

data = spark.createDataFrame(
[["Includes hypertension and chronic obstructive pulmonary disease."]]
).toDF("text")

results = Pipeline() \
.setStages([
document_assembler,
sentence_detector,
tokenizer,
embeddings,
ner,
ner_converter,
few_shot_assertion_classifier]) \
.fit(data) \
.transform(data)

results.selectExpr("explode(assertion) as assertion") \
.selectExpr("assertion.result", "assertion.metadata.chunk", "assertion.metadata.confidence") \
.show(truncate=False)

## Result

+--------+----------------------------+-----------+
| result| chunk| confidence|
+--------+----------------------------+-----------+
| present| hypertension| 1.0|
| absent| arteriovenous malformations| 1.0|
+--------+----------------------------+-----------+

{%- endcapture -%}

{%- capture model_scala_medical -%}
import spark.implicits._

val documentAssembler = new DocumentAssembler()
.setInputCol("text")
.setOutputCol("document")

val sentenceDetector = new SentenceDetector()
.setInputCols(Array("document"))
.setOutputCol("sentence")

val tokenizer = new Tokenizer()
.setInputCols(Array("sentence"))
.setOutputCol("token")

val embeddings = WordEmbeddingsModel
.pretrained("embeddings_clinical", "en", "clinical/models")
.setInputCols(Array("sentence", "token"))
.setOutputCol("embeddings")
.setCaseSensitive(false)

val ner = MedicalNerModel
.pretrained("ner_jsl", "en", "clinical/models")
.setInputCols(Array("sentence", "token", "embeddings"))
.setOutputCol("ner")

val nerConverter = new NerConverterInternal()
.setInputCols(Array("sentence", "token", "ner"))
.setWhiteList("Disease_Syndrome_Disorder", "Hypertension")
.setOutputCol("ner_chunk")

val fewShotAssertionClassifier = FewShotAssertionClassifierModel
.pretrained()
.setInputCols(Array("sentence", "ner_chunk"))
.setBatchSize(1)
.setOutputCol("assertion")

val pipeline = new Pipeline()
.setStages(Array(
documentAssembler,
sentenceDetector,
tokenizer,
embeddings,
ner,
nerConverter,
fewShotAssertionClassifier))

val model = pipeline.fit(Seq("").toDS.toDF("text"))
val results = model.transform(Seq("Includes hypertension and chronic obstructive pulmonary disease.").toDS.toDF("text"))

results.selectExpr("explode(assertion) as assertion")
.selectExpr("assertion.result", "assertion.metadata.chunk", "assertion.metadata.confidence")
.show(truncate = false)

// Result

+-------+-------------------------------------+----------+
|result |chunk |confidence|
+-------+-------------------------------------+----------+
|present|hypertension |1.0 |
|present|chronic obstructive pulmonary disease|1.0 |
|absent |arteriovenous malformations |1.0 |
|absent |vascular malformation |0.9999997 |
+-------+-------------------------------------+----------+

{%- endcapture -%}

{%- capture model_api_link -%}
[FewShotAssertionClassifierModel](https://nlp.johnsnowlabs.com/licensed/api/com/johnsnowlabs/nlp/annotators/classification/FewShotAssertionClassifierModel.html)
{%- endcapture -%}

{%- capture model_python_api_link -%}
[FewShotAssertionClassifierModel](https://nlp.johnsnowlabs.com/licensed/api/python/reference/autosummary/sparknlp_jsl/annotator/classification/few_shot_assertion_classifier/index.html)
{%- endcapture -%}


{% include templates/licensed_approach_model_medical_fin_leg_template.md
title=title
model=model
model_description=model_description
model_input_anno=model_input_anno
model_output_anno=model_output_anno
model_python_medical=model_python_medical
model_scala_medical=model_scala_medical
model_api_link=model_api_link
model_python_api_link=model_python_api_link
%}
3 changes: 3 additions & 0 deletions docs/en/licensed_annotators.md
@@ -69,12 +69,15 @@ Check out the [Spark NLP Annotators page](https://nlp.johnsnowlabs.com/docs/en/a
{% include templates/licensed_table_entry.md name="EntityRulerInternal" summary="This annotator match exact strings or regex patterns provided in a file against a Document and assigns them an named entity."%}
{% include templates/licensed_table_entry.md name="FeaturesAssembler" summary="Collects features from different columns."%}
{% include templates/licensed_table_entry.md name="FewShotClassifier" summary="This Annotator specifically target few-shot classification tasks, which involve training a model to make accurate predictions with limited labeled data."%}
{% include templates/licensed_table_entry.md name="FewShotAssertionClassifierModel" summary="assertion classification using large (LLMS based) few shot classifiers based on the SetFit approach."%}
{% include templates/licensed_table_entry.md name="Flattener" summary="`Flattener` annotator in Spark NLP converts annotation results into a simplified DataFrame format for easier analysis and interpretation."%}
{% include templates/licensed_table_entry.md name="GenericClassifier" summary="Creates a generic single-label classifier which uses pre-generated Tensorflow graphs."%}
{% include templates/licensed_table_entry.md name="GenericLogRegClassifier" summary="Is a derivative of GenericClassifier which implements a multinomial logistic regression."%}
{% include templates/licensed_table_entry.md name="GenericSVMClassifier" summary="Creates a generic single-label classifier which uses pre-generated Tensorflow graphs."%}
{% include templates/licensed_table_entry.md name="InternalDocumentSplitter" summary="This annotator splits large documents into small documents."%}
{% include templates/licensed_table_entry.md name="IOBTagger" summary="Merges token tags and NER labels from chunks in the specified format."%}
{% include templates/licensed_table_entry.md name="LightDeIdentification" summary="Light version of DeIdentification."%}
{% include templates/licensed_table_entry.md name="MultiChunk2Doc" summary="Merges a given chunks to create a document."%}
{% include templates/licensed_table_entry.md name="NameChunkObfuscator" summary="This annotator allows to transform a dataset with an Input Annotation of type CHUNK, into its obfuscated version of by obfuscating the given CHUNKS."%}
{% include templates/licensed_table_entry.md name="NerChunker" summary="Extracts phrases that fits into a known pattern using the NER tags."%}
{% include templates/licensed_table_entry.md name="NerConverterInternal" summary="Converts a IOB or IOB2 representation of NER to a user-friendly one, by associating the tokens of recognized entities and their label."%}
