From c79ec599fe3757e9d75e33d21979c1e1dd25aa7d Mon Sep 17 00:00:00 2001 From: Yigit <110747690+yigitgull@users.noreply.github.com> Date: Fri, 10 May 2024 15:55:22 +0300 Subject: [PATCH] added posology/generic .md files. (#1197) --- docs/en/annotators.md | 2 + .../licensed_annotator_entries/GenericRE.md | 228 ++++++++++++++++++ .../licensed_annotator_entries/PosologyRE.md | 213 ++++++++++++++++ 3 files changed, 443 insertions(+) create mode 100644 docs/en/licensed_annotator_entries/GenericRE.md create mode 100644 docs/en/licensed_annotator_entries/PosologyRE.md diff --git a/docs/en/annotators.md b/docs/en/annotators.md index 53d5ef6fc7..f5fe008f28 100644 --- a/docs/en/annotators.md +++ b/docs/en/annotators.md @@ -64,6 +64,7 @@ There are two types of Annotators: {% include templates/anno_table_entry.md path="" name="EntityRuler" summary="Fits an Annotator to match exact strings or regex patterns provided in a file against a Document and assigns them an named entity."%} {% include templates/anno_table_entry.md path="" name="EmbeddingsFinisher" summary="Extracts embeddings from Annotations into a more easily usable form."%} {% include templates/anno_table_entry.md path="" name="Finisher" summary="Converts annotation results into a format that easier to use. It is useful to extract the results from Spark NLP Pipelines."%} +{% include templates/anno_table_entry.md path="" name="GenericRE" summary="Instantiated RelationExtractionModel for extracting relationships between any entities."%} {% include templates/anno_table_entry.md path="" name="GraphExtraction" summary="Extracts a dependency graph between entities."%} {% include templates/anno_table_entry.md path="" name="GraphFinisher" summary="Helper class to convert the knowledge graph from GraphExtraction into a generic format, such as RDF."%} {% include templates/anno_table_entry.md path="" name="ImageAssembler" summary="Prepares images read by Spark into a format that is processable by Spark NLP."%} @@ -79,6 +80,7 @@ There are two types of Annotators: {% include templates/anno_table_entry.md path="" name="NerOverwriter" summary="Overwrites entities of specified strings."%} {% include templates/anno_table_entry.md path="" name="Normalizer" summary="Removes all dirty characters from text following a regex pattern and transforms words based on a provided dictionary."%} {% include templates/anno_table_entry.md path="" name="NorvigSweeting Spellchecker" summary="Retrieves tokens and makes corrections automatically if not found in an English dictionary."%} +{% include templates/anno_table_entry.md path="" name="PosologyRE" summary="Instantiated RelationExtractionModel for extracting relationships between different recognized drug entitites."%} {% include templates/anno_table_entry.md path="" name="POSTagger (Part of speech tagger)" summary="Averaged Perceptron model to tag words part-of-speech."%} {% include templates/anno_table_entry.md path="" name="RecursiveTokenizer" summary="Tokenizes raw text recursively based on a handful of definable rules."%} {% include templates/anno_table_entry.md path="" name="RegexMatcher" summary="Uses rules to match a set of regular expressions and associate them with a provided identifier."%} diff --git a/docs/en/licensed_annotator_entries/GenericRE.md b/docs/en/licensed_annotator_entries/GenericRE.md new file mode 100644 index 0000000000..72b873f736 --- /dev/null +++ b/docs/en/licensed_annotator_entries/GenericRE.md @@ -0,0 +1,228 @@ +{%- capture title -%} +GenericREModel +{%- endcapture -%} + + +{%- capture model -%} +model +{%- endcapture -%} + +{%- capture model_description -%} +Instantiated RelationExtractionModel for extracting relationships between any entities. +This class is not intended to be directly used, please use the RelationExtractionModel instead. +Pairs of entities should be specified using setRelationPairs. + + +{%- endcapture -%} + +{%- capture model_input_anno -%} +WORD_EMBEDDINGS, POS, CHUNK, DEPENDENCY +{%- endcapture -%} + +{%- capture model_output_anno -%} +CATEGORY +{%- endcapture -%} + +{%- capture model_python_medical -%} +from johnsnowlabs import nlp, medical + +documenter = nlp.DocumentAssembler()\ + .setInputCol("text")\ + .setOutputCol("document") + +sentencer = nlp.SentenceDetector()\ + .setInputCols(["document"])\ + .setOutputCol("sentences") + +tokenizer = nlp.Tokenizer()\ + .setInputCols(["sentences"])\ + .setOutputCol("tokens") + +words_embedder = nlp.WordEmbeddingsModel()\ + .pretrained("embeddings_clinical", "en", "clinical/models")\ + .setInputCols(["sentences", "tokens"])\ + .setOutputCol("embeddings") + +pos_tagger = nlp.PerceptronModel()\ + .pretrained("pos_clinical", "en", "clinical/models") \ + .setInputCols(["sentences", "tokens"])\ + .setOutputCol("pos_tags") + +ner_tagger = medical.NerModel()\ + .pretrained("ner_posology", "en", "clinical/models")\ + .setInputCols("sentences", "tokens", "embeddings")\ + .setOutputCol("ner_tags") + +ner_chunker = medical.NerConverterInternal()\ + .setInputCols(["sentences", "tokens", "ner_tags"])\ + .setOutputCol("ner_chunks") + +dependency_parser = nlp.DependencyParserModel()\ + .pretrained("dependency_conllu", "en")\ + .setInputCols(["sentences", "pos_tags", "tokens"])\ + .setOutputCol("dependencies") + +reModel = medical.RelationExtractionModel()\ + .pretrained("generic_re")\ + .setInputCols(["embeddings", "pos_tags", "ner_chunks", "dependencies"])\ + .setOutputCol("relations")\ + .setRelationPairs(["problem-test", + "problem-treatment"])\ + .setMaxSyntacticDistance(4) + +pipeline = nlp.Pipeline(stages=[ + documenter, + sentencer, + tokenizer, + words_embedder, + pos_tagger, + ner_tagger, + ner_chunker, + dependency_parser, + reModel +]) + +text = """ +A 28-year-old female with a history of gestational diabetes mellitus diagnosed eight years prior to +presentation and subsequent type two diabetes mellitus ( T2DM ), one prior episode of HTG-induced pancreatitis +three years prior to presentation , associated with an acute hepatitis , and obesity with a body mass index +( BMI ) of 33.5 kg/m2 , presented with a one-week history of polyuria , polydipsia , poor appetite , and +vomiting . Two weeks prior to presentation , she was treated with a five-day course of amoxicillin for a +respiratory tract infection . She was on metformin , glipizide , and dapagliflozin for T2DM and atorvastatin +and gemfibrozil for HTG . She had been on dapagliflozin for six months at the time of presentation . Physical +examination on presentation was significant for dry oral mucosa ; significantly , her abdominal examination was +benign with no tenderness , guarding , or rigidity . +""" +df = spark.createDataFrame([[text]]).toDF("text") +result = pipeline.fit(df).transform(df) + +# Show results +result.select(F.explode(F.arrays_zip( + result.relations.result, + result.relations.metadata)).alias("cols"))\ +.select( + F.expr("cols['1']['chunk1']").alias("chunk1"), + F.expr("cols['1']['chunk2']").alias("chunk2"), + F.expr("cols['1']['entity1']").alias("entity1"), + F.expr("cols['1']['entity2']").alias("entity2"), + F.expr("cols['0']").alias("relations"), + F.expr("cols['1']['confidence']").alias("confidence")).show(5, truncate=False) + ++-----------------+-------------+-------+-------+------------+----------+ +|chunk1 |chunk2 |entity1|entity2|relations |confidence| ++-----------------+-------------+-------+-------+------------+----------+ +|obesity |BMI |PROBLEM|TEST |PROBLEM-TEST|1.0 | +|a body mass index|BMI |PROBLEM|TEST |PROBLEM-TEST|1.0 | +|BMI |polyuria |TEST |PROBLEM|TEST-PROBLEM|1.0 | +|BMI |polydipsia |TEST |PROBLEM|TEST-PROBLEM|1.0 | +|BMI |poor appetite|TEST |PROBLEM|TEST-PROBLEM|1.0 | ++-----------------+-------------+-------+-------+------------+----------+ + +{%- endcapture -%} + +{%- capture model_scala_medical -%} +import spark.implicits._ + +val documenter = new DocumentAssembler() + .setInputCol("text") + .setOutputCol("document") + +val sentencer = new SentenceDetector() + .setInputCols("document") + .setOutputCol("sentences") + +val tokenizer = new Tokenizer() + .setInputCols("sentences") + .setOutputCol("tokens") + +val words_embedder = WordEmbeddingsModel.pretrained("embeddings_clinical","en","clinical/models") + .setInputCols(Array("sentences","tokens")) + .setOutputCol("embeddings") + +val pos_tagger = PerceptronModel.pretrained("pos_clinical","en","clinical/models") + .setInputCols(Array("sentences","tokens")) + .setOutputCol("pos_tags") + +val ner_tagger = MedicalNerModel.pretrained("ner_posology","en","clinical/models") + .setInputCols("sentences","tokens","embeddings") + .setOutputCol("ner_tags") + +val ner_chunker = new NerConverterInternal() + .setInputCols(Array("sentences","tokens","ner_tags")) + .setOutputCol("ner_chunks") + +val dependency_parser = DependencyParserModel.pretrained("dependency_conllu","en") + .setInputCols(Array("sentences","pos_tags","tokens")) + .setOutputCol("dependencies") + +val reModel = RelationExtractionModel.pretrained("generic_re") + .setInputCols(Array("embeddings","pos_tags","ner_chunks","dependencies")) + .setOutputCol("relations") + .setRelationPairs(Array("problem-test","problem-treatment")) + .setMaxSyntacticDistance(4) + + +val pipeline = new Pipeline().setStages(Array( + documenter, + sentencer, + tokenizer, + words_embedder, + pos_tagger, + ner_tagger, + ner_chunker, + dependency_parser, + reModel )) + +val text = "A 28-year-old female with a history of gestational diabetes mellitus diagnosed eight years prior to " + +"presentation and subsequent type two diabetes mellitus ( T2DM ), one prior episode of HTG-induced pancreatitis " + +"three years prior to presentation , associated with an acute hepatitis , and obesity with a body mass index " + +"( BMI ) of 33.5 kg/m2 , presented with a one-week history of polyuria , polydipsia , poor appetite , and " + +"vomiting . Two weeks prior to presentation , she was treated with a five-day course of amoxicillin for a " + +"respiratory tract infection . She was on metformin , glipizide , and dapagliflozin for T2DM and atorvastatin " + +"and gemfibrozil for HTG . She had been on dapagliflozin for six months at the time of presentation . Physical " + +"examination on presentation was significant for dry oral mucosa ; significantly , her abdominal examination was " + +"benign with no tenderness , guarding , or rigidity." + +val df = Seq(text) .toDF("text") +val result = pipeline.fit(df) .transform(df) + +// Show results + ++-----------------+-------------+-------+-------+------------+----------+ +|chunk1 |chunk2 |entity1|entity2|relations |confidence| ++-----------------+-------------+-------+-------+------------+----------+ +|obesity |BMI |PROBLEM|TEST |PROBLEM-TEST|1.0 | +|a body mass index|BMI |PROBLEM|TEST |PROBLEM-TEST|1.0 | +|BMI |polyuria |TEST |PROBLEM|TEST-PROBLEM|1.0 | +|BMI |polydipsia |TEST |PROBLEM|TEST-PROBLEM|1.0 | +|BMI |poor appetite|TEST |PROBLEM|TEST-PROBLEM|1.0 | ++-----------------+-------------+-------+-------+------------+----------+ + +{%- endcapture -%} + + +{%- capture model_api_link -%} +[RelationExtractionModel](https://nlp.johnsnowlabs.com/licensed/api/com/johnsnowlabs/nlp/annotators/re/RelationExtractionModel.html) +{%- endcapture -%} + +{%- capture model_python_api_link -%} +[RelationExtractionModel](https://nlp.johnsnowlabs.com/licensed/api/python/reference/autosummary/sparknlp_jsl/annotator/re/relation_extraction/index.html#sparknlp_jsl.annotator.re.relation_extraction.RelationExtractionModel) +{%- endcapture -%} + +{%- capture model_notebook_link -%} +[RelationExtractionModelNotebook](https://github.com/JohnSnowLabs/spark-nlp-workshop/blob/Healthcare_MOOC/Spark_NLP_Udemy_MOOC/Healthcare_NLP/RelationExtractionModel.ipynb) +{%- endcapture -%} + + +{% include templates/licensed_approach_model_medical_fin_leg_template.md +title=title +model=model +model_description=model_description +model_input_anno=model_input_anno +model_output_anno=model_output_anno +model_python_medical=model_python_medical +model_scala_medical=model_scala_medical +model_api_link=model_api_link +model_python_api_link=model_python_api_link +model_notebook_link=model_notebook_link +%} diff --git a/docs/en/licensed_annotator_entries/PosologyRE.md b/docs/en/licensed_annotator_entries/PosologyRE.md new file mode 100644 index 0000000000..77f990df73 --- /dev/null +++ b/docs/en/licensed_annotator_entries/PosologyRE.md @@ -0,0 +1,213 @@ +{%- capture title -%} +PosologyREModel +{%- endcapture -%} + + +{%- capture model -%} +model +{%- endcapture -%} + +{%- capture model_description -%} +Instantiated RelationExtractionModel for extracting relationships between different recognized drug entitites. +This class is not intended to be directly used, please use the RelationExtractionModel instead. +Possible values are "DRUG-DOSAGE", "DRUG-ADE", "DRUG-FORM", "DRUG-FREQUENCY", +"DRUG-ROUTE", "DRUG-REASON", "DRUG-STRENGTH", "DRUG-DURATION". + + + +{%- endcapture -%} + +{%- capture model_input_anno -%} +WORD_EMBEDDINGS, POS, CHUNK, DEPENDENCY +{%- endcapture -%} + +{%- capture model_output_anno -%} +CATEGORY +{%- endcapture -%} + +{%- capture model_python_medical -%} +from johnsnowlabs import nlp, medical + +documenter = nlp.DocumentAssembler()\ + .setInputCol("text")\ + .setOutputCol("document") + +sentencer = nlp.SentenceDetector()\ + .setInputCols(["document"])\ + .setOutputCol("sentences") + +tokenizer = nlp.Tokenizer()\ + .setInputCols(["sentences"])\ + .setOutputCol("tokens") + +words_embedder = nlp.WordEmbeddingsModel()\ + .pretrained("embeddings_clinical", "en", "clinical/models")\ + .setInputCols(["sentences", "tokens"])\ + .setOutputCol("embeddings") + +pos_tagger = nlp.PerceptronModel()\ + .pretrained("pos_clinical", "en", "clinical/models") \ + .setInputCols(["sentences", "tokens"])\ + .setOutputCol("pos_tags") + +ner_tagger = medical.NerModel()\ + .pretrained("ner_posology", "en", "clinical/models")\ + .setInputCols("sentences", "tokens", "embeddings")\ + .setOutputCol("ner_tags") + +ner_chunker = medical.NerConverterInternal()\ + .setInputCols(["sentences", "tokens", "ner_tags"])\ + .setOutputCol("ner_chunks") + +dependency_parser = nlp.DependencyParserModel()\ + .pretrained("dependency_conllu", "en")\ + .setInputCols(["sentences", "pos_tags", "tokens"])\ + .setOutputCol("dependencies") + +reModel = medical.RelationExtractionModel()\ + .pretrained("posology_re")\ + .setInputCols(["embeddings", "pos_tags", "ner_chunks", "dependencies"])\ + .setOutputCol("relations")\ + .setMaxSyntacticDistance(4) + +pipeline = nlp.Pipeline(stages=[ + documenter, + sentencer, + tokenizer, + words_embedder, + pos_tagger, + ner_tagger, + ner_chunker, + dependency_parser, + reModel +]) + +text = """ +The patient was prescribed 1 unit of Advil for 5 days after meals. The patient was also +given 1 unit of Metformin daily. +He was seen by the endocrinology service and she was discharged on 40 units of insulin glargine at night , +12 units of insulin lispro with meals , and metformin 1000 mg two times a day. +""" +df = spark.createDataFrame([[text]]).toDF("text") +result = pipeline.fit(df).transform(df) + +# Show results +result.select(F.explode(F.arrays_zip( + result.relations.result, + result.relations.metadata)).alias("cols"))\ +.select( + F.expr("cols['1']['chunk1']").alias("chunk1"), + F.expr("cols['1']['chunk2']").alias("chunk2"), + F.expr("cols['1']['entity1']").alias("entity1"), + F.expr("cols['1']['entity2']").alias("entity2"), + F.expr("cols['0']").alias("relations"), + F.expr("cols['1']['confidence']").alias("confidence")).show(5, truncate=False) + ++---------+----------------+-------+---------+--------------+----------+ +|chunk1 |chunk2 |entity1|entity2 |relations |confidence| ++---------+----------------+-------+---------+--------------+----------+ +|1 unit |Advil |DOSAGE |DRUG |DOSAGE-DRUG |1.0 | +|Advil |for 5 days |DRUG |DURATION |DRUG-DURATION |1.0 | +|1 unit |Metformin |DOSAGE |DRUG |DOSAGE-DRUG |1.0 | +|Metformin|daily |DRUG |FREQUENCY|DRUG-FREQUENCY|1.0 | +|40 units |insulin glargine|DOSAGE |DRUG |DOSAGE-DRUG |1.0 | ++---------+----------------+-------+---------+--------------+----------+ + +{%- endcapture -%} + +{%- capture model_scala_medical -%} +import spark.implicits._ + +val documenter = new DocumentAssembler() + .setInputCol("text") + .setOutputCol("document") + +val sentencer = new SentenceDetector() + .setInputCols("document") + .setOutputCol("sentences") + +val tokenizer = new Tokenizer() + .setInputCols("sentences") + .setOutputCol("tokens") + +val words_embedder = WordEmbeddingsModel.pretrained("embeddings_clinical","en","clinical/models") + .setInputCols(Array("sentences","tokens")) + .setOutputCol("embeddings") + +val pos_tagger = PerceptronModel.pretrained("pos_clinical","en","clinical/models") + .setInputCols(Array("sentences","tokens")) + .setOutputCol("pos_tags") + +val ner_tagger = MedicalNerModel.pretrained("ner_posology","en","clinical/models") + .setInputCols("sentences","tokens","embeddings") + .setOutputCol("ner_tags") + +val ner_chunker = new NerConverterInternal() + .setInputCols(Array("sentences","tokens","ner_tags")) + .setOutputCol("ner_chunks") + +val dependency_parser = DependencyParserModel.pretrained("dependency_conllu","en") + .setInputCols(Array("sentences","pos_tags","tokens")) + .setOutputCol("dependencies") + +val reModel = RelationExtractionModel.pretrained("posology_re") + .setInputCols(Array("embeddings","pos_tags","ner_chunks","dependencies")) + .setOutputCol("relations") + .setMaxSyntacticDistance(4) + +val pipeline = new Pipeline().setStages(Array( + documenter, + sentencer, + tokenizer, + words_embedder, + pos_tagger, + ner_tagger, + ner_chunker, + dependency_parser, + reModel )) + +val text = " The patient was prescribed 1 unit of Advil for 5 days after meals. The patient was also given 1 unit of Metformin daily. He was seen by the endocrinology service and she was discharged on 40 units of insulin glargine at night , 12 units of insulin lispro with meals ,and metformin 1000 mg two times a day. " + +val df = Seq(text) .toDF("text") +val result = pipeline.fit(df) .transform(df) + +// Show results + ++---------+----------------+-------+---------+--------------+----------+ +|chunk1 |chunk2 |entity1|entity2 |relations |confidence| ++---------+----------------+-------+---------+--------------+----------+ +|1 unit |Advil |DOSAGE |DRUG |DOSAGE-DRUG |1.0 | +|Advil |for 5 days |DRUG |DURATION |DRUG-DURATION |1.0 | +|1 unit |Metformin |DOSAGE |DRUG |DOSAGE-DRUG |1.0 | +|Metformin|daily |DRUG |FREQUENCY|DRUG-FREQUENCY|1.0 | +|40 units |insulin glargine|DOSAGE |DRUG |DOSAGE-DRUG |1.0 | ++---------+----------------+-------+---------+--------------+----------+ + +{%- endcapture -%} + + +{%- capture model_api_link -%} +[RelationExtractionModel](https://nlp.johnsnowlabs.com/licensed/api/com/johnsnowlabs/nlp/annotators/re/RelationExtractionModel.html) +{%- endcapture -%} + +{%- capture model_python_api_link -%} +[RelationExtractionModel](https://nlp.johnsnowlabs.com/licensed/api/python/reference/autosummary/sparknlp_jsl/annotator/re/relation_extraction/index.html#sparknlp_jsl.annotator.re.relation_extraction.RelationExtractionModel) +{%- endcapture -%} + +{%- capture model_notebook_link -%} +[RelationExtractionModelNotebook](https://github.com/JohnSnowLabs/spark-nlp-workshop/blob/Healthcare_MOOC/Spark_NLP_Udemy_MOOC/Healthcare_NLP/RelationExtractionModel.ipynb) +{%- endcapture -%} + + +{% include templates/licensed_approach_model_medical_fin_leg_template.md +title=title +model=model +model_description=model_description +model_input_anno=model_input_anno +model_output_anno=model_output_anno +model_python_medical=model_python_medical +model_scala_medical=model_scala_medical +model_api_link=model_api_link +model_python_api_link=model_python_api_link +model_notebook_link=model_notebook_link +%}