added .md file (#1128)
yigitgull committed Apr 18, 2024
1 parent 83b7a30 commit 50c027e
Showing 3 changed files with 185 additions and 2 deletions.
2 changes: 0 additions & 2 deletions docs/en/annotators.md
@@ -69,8 +69,6 @@ There are two types of Annotators:
{% include templates/anno_table_entry.md path="" name="ImageAssembler" summary="Prepares images read by Spark into a format that is processable by Spark NLP."%}
{% include templates/anno_table_entry.md path="" name="LanguageDetectorDL" summary="Language Identification and Detection by using CNN and RNN architectures in TensorFlow."%}
{% include templates/anno_table_entry.md path="" name="Lemmatizer" summary="Finds lemmas out of words with the objective of returning a base dictionary word."%}
{% include templates/licensed_table_entry.md name="LightDeIdentification" summary="Light version of DeIdentification."%}
{% include templates/licensed_table_entry.md name="MultiChunk2Doc" summary="Merges a given chunks to create a document."%}
{% include templates/anno_table_entry.md path="" name="MultiClassifierDL" summary="Multi-label Text Classification."%}
{% include templates/anno_table_entry.md path="" name="MultiDateMatcher" summary="Matches standard date formats into a provided format."%}
{% include templates/anno_table_entry.md path="" name="MultiDocumentAssembler" summary="Prepares data into a format that is processable by Spark NLP."%}
182 changes: 182 additions & 0 deletions docs/en/licensed_annotator_entries/FewShotAssertionClassifierModel.md
@@ -0,0 +1,182 @@
{%- capture title -%}
FewShotAssertionClassifierModel
{%- endcapture -%}

{%- capture model -%}
model
{%- endcapture -%}

{%- capture model_description -%}

FewShotAssertionClassifierModel performs assertion classification using large (LLM-based)
few-shot classifiers built on the SetFit approach.

Parameters:

- `batchSize` *(Int)*: Batch size

- `caseSensitive` *(Bool)*: Whether the classifier is sensitive to text casing

- `maxSentenceLength` *(Int)*: The maximum length of the input text
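
The parameters above can be set on a pretrained model before it is plugged into a pipeline. A minimal configuration sketch follows (it assumes a licensed `johnsnowlabs` installation and that these parameters expose the usual `set<Param>` setters; the values shown are illustrative, not defaults):

```python
# Minimal sketch, not a full pipeline: assumes a licensed johnsnowlabs install
# and that the parameters listed above expose standard set<Param> setters.
from johnsnowlabs import medical

few_shot_assertion_classifier = (
    medical.FewShotAssertionClassifierModel.pretrained()
    .setInputCols(["sentence", "ner_chunk"])   # DOCUMENT + CHUNK inputs
    .setOutputCol("assertion")                 # ASSERTION output
    .setBatchSize(8)                           # batchSize: rows scored per batch
    .setCaseSensitive(False)                   # caseSensitive: ignore text casing
    .setMaxSentenceLength(512)                 # maxSentenceLength: truncate longer inputs (assumed setter)
)
```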


{%- endcapture -%}

{%- capture model_input_anno -%}
DOCUMENT, CHUNK
{%- endcapture -%}

{%- capture model_output_anno -%}
ASSERTION
{%- endcapture -%}

{%- capture model_python_medical -%}

from johnsnowlabs import nlp, medical
from pyspark.ml import Pipeline

document_assembler = nlp.DocumentAssembler()\
.setInputCol("text")\
.setOutputCol("document")

sentence_detector = nlp.SentenceDetector()\
.setInputCol("document")\
.setOutputCol("sentence")

tokenizer = nlp.Tokenizer()\
.setInputCols(["sentence"])\
.setOutputCol("token")

embeddings = nlp.WordEmbeddingsModel.pretrained("embeddings_clinical", "en", "clinical/models")\
.setInputCols(["sentence", "token"])\
.setOutputCol("embeddings") \
.setCaseSensitive(False)

ner = medical.NerModel.pretrained("ner_jsl", "en", "clinical/models") \
.setInputCols(["sentence", "token", "embeddings"]) \
.setOutputCol("ner")

ner_converter = medical.NerConverterInternal()\
.setInputCols(["sentence", "token", "ner"])\
.setWhiteList(["Disease_Syndrome_Disorder", "Hypertension"])\
.setOutputCol("ner_chunk")

few_shot_assertion_classifier = medical.FewShotAssertionClassifierModel.pretrained()\
.setInputCols(["sentence", "ner_chunk"])\
.setOutputCol("assertion")

data = spark.createDataFrame(
[["Includes hypertension and chronic obstructive pulmonary disease."]]
).toDF("text")

results = Pipeline() \
.setStages([
document_assembler,
sentence_detector,
tokenizer,
embeddings,
ner,
ner_converter,
few_shot_assertion_classifier]) \
.fit(data) \
.transform(data)

results.selectExpr("explode(assertion) as assertion") \
.selectExpr("assertion.result", "assertion.metadata.chunk", "assertion.metadata.confidence") \
.show(truncate=False)

## Result

+--------+----------------------------+-----------+
| result| chunk| confidence|
+--------+----------------------------+-----------+
| present| hypertension| 1.0|
| absent| arteriovenous malformations| 1.0|
+--------+----------------------------+-----------+

{%- endcapture -%}

{%- capture model_scala_medical -%}
import spark.implicits._

val documentAssembler = new DocumentAssembler()
.setInputCol("text")
.setOutputCol("document")

val sentenceDetector = new SentenceDetector()
.setInputCols(Array("document"))
.setOutputCol("sentence")

val tokenizer = new Tokenizer()
.setInputCols(Array("sentence"))
.setOutputCol("token")

val embeddings = WordEmbeddingsModel
.pretrained("embeddings_clinical", "en", "clinical/models")
.setInputCols(Array("sentence", "token"))
.setOutputCol("embeddings")
.setCaseSensitive(false)

val ner = MedicalNerModel
.pretrained("ner_jsl", "en", "clinical/models")
.setInputCols(Array("sentence", "token", "embeddings"))
.setOutputCol("ner")

val nerConverter = new NerConverterInternal()
.setInputCols(Array("sentence", "token", "ner"))
.setWhiteList("Disease_Syndrome_Disorder", "Hypertension")
.setOutputCol("ner_chunk")

val fewShotAssertionClassifier = FewShotAssertionClassifierModel
.pretrained()
.setInputCols(Array("sentence", "ner_chunk"))
.setBatchSize(1)
.setOutputCol("assertion")

val pipeline = new Pipeline()
.setStages(Array(
documentAssembler,
sentenceDetector,
tokenizer,
embeddings,
ner,
nerConverter,
fewShotAssertionClassifier))

val model = pipeline.fit(Seq("").toDS.toDF("text"))
val results = model.transform(Seq("Includes hypertension and chronic obstructive pulmonary disease.").toDS.toDF("text"))

results.selectExpr("explode(assertion) as assertion")
.selectExpr("assertion.result", "assertion.metadata.chunk", "assertion.metadata.confidence")
.show(truncate = false)

// Result

+-------+-------------------------------------+----------+
|result |chunk |confidence|
+-------+-------------------------------------+----------+
|present|hypertension |1.0 |
|present|chronic obstructive pulmonary disease|1.0 |
|absent |arteriovenous malformations |1.0 |
|absent |vascular malformation |0.9999997 |
+-------+-------------------------------------+----------+

{%- endcapture -%}

{%- capture model_api_link -%}
[FewShotAssertionClassifierModel](https://nlp.johnsnowlabs.com/licensed/api/com/johnsnowlabs/nlp/annotators/classification/FewShotAssertionClassifierModel.html)
{%- endcapture -%}

{%- capture model_python_api_link -%}
[FewShotAssertionClassifierModel](https://nlp.johnsnowlabs.com/licensed/api/python/reference/autosummary/sparknlp_jsl/annotator/classification/few_shot_assertion_classifier/index.html)
{%- endcapture -%}


{% include templates/licensed_approach_model_medical_fin_leg_template.md
title=title
model=model
model_description=model_description
model_input_anno=model_input_anno
model_output_anno=model_output_anno
model_python_medical=model_python_medical
model_scala_medical=model_scala_medical
model_api_link=model_api_link
model_python_api_link=model_python_api_link
%}
3 changes: 3 additions & 0 deletions docs/en/licensed_annotators.md
@@ -69,12 +69,15 @@ Check out the [Spark NLP Annotators page](https://nlp.johnsnowlabs.com/docs/en/a
{% include templates/licensed_table_entry.md name="EntityRulerInternal" summary="This annotator match exact strings or regex patterns provided in a file against a Document and assigns them an named entity."%}
{% include templates/licensed_table_entry.md name="FeaturesAssembler" summary="Collects features from different columns."%}
{% include templates/licensed_table_entry.md name="FewShotClassifier" summary="This Annotator specifically target few-shot classification tasks, which involve training a model to make accurate predictions with limited labeled data."%}
{% include templates/licensed_table_entry.md name="FewShotAssertionClassifierModel" summary="assertion classification using large (LLMS based) few shot classifiers based on the SetFit approach."%}
{% include templates/licensed_table_entry.md name="Flattener" summary="`Flattener` annotator in Spark NLP converts annotation results into a simplified DataFrame format for easier analysis and interpretation."%}
{% include templates/licensed_table_entry.md name="GenericClassifier" summary="Creates a generic single-label classifier which uses pre-generated Tensorflow graphs."%}
{% include templates/licensed_table_entry.md name="GenericLogRegClassifier" summary="Is a derivative of GenericClassifier which implements a multinomial logistic regression."%}
{% include templates/licensed_table_entry.md name="GenericSVMClassifier" summary="Creates a generic single-label classifier which uses pre-generated Tensorflow graphs."%}
{% include templates/licensed_table_entry.md name="InternalDocumentSplitter" summary="This annotator splits large documents into small documents."%}
{% include templates/licensed_table_entry.md name="IOBTagger" summary="Merges token tags and NER labels from chunks in the specified format."%}
{% include templates/licensed_table_entry.md name="LightDeIdentification" summary="Light version of DeIdentification."%}
{% include templates/licensed_table_entry.md name="MultiChunk2Doc" summary="Merges a given chunks to create a document."%}
{% include templates/licensed_table_entry.md name="NameChunkObfuscator" summary="This annotator allows to transform a dataset with an Input Annotation of type CHUNK, into its obfuscated version of by obfuscating the given CHUNKS."%}
{% include templates/licensed_table_entry.md name="NerChunker" summary="Extracts phrases that fits into a known pattern using the NER tags."%}
{% include templates/licensed_table_entry.md name="NerConverterInternal" summary="Converts a IOB or IOB2 representation of NER to a user-friendly one, by associating the tokens of recognized entities and their label."%}
