From ef593c80ffd9f60105fb86bcdcd5bbc02bb1b135 Mon Sep 17 00:00:00 2001 From: ArshaanNazir <71844877+ArshaanNazir@users.noreply.github.com> Date: Thu, 10 Nov 2022 23:16:38 +0530 Subject: [PATCH] doc fix in old hc md files (#13025) * doc fix in old hc md files * Update 2021-03-29-recognize_entities_posology_en.md * Update 2021-01-18-ner_radiology_en.md * Update 2022-10-01-assertion_oncology_demographic_binary_wip_en.md * Update 2021-01-29-deidentify_enriched_clinical_en.md * updates * Update 2021-01-29-ner_drugs_large_en.md and 2021-07-23-cantemist_scielowiki_es.md * Update 2021-01-29-deidentify_enriched_clinical_en.md * Update 2021-01-18-ner_radiology_en.md * Update 2021-01-18-re_bodypart_directions_en.md * Update 2021-01-18-re_bodypart_directions_en.md * Update 2021-01-18-re_bodypart_problem_en.md * Update 2021-01-18-re_bodypart_proceduretest_en.md * Update 2021-01-18-re_date_clinical_en.md * Update 2021-01-20-ner_deid_augmented_en.md * Update 2021-01-18-re_bodypart_directions_en.md * Update 2021-01-18-re_bodypart_problem_en.md * Update 2021-01-18-re_bodypart_directions_en.md * Update 2021-01-18-re_bodypart_problem_en.md * Update 2022-07-28-bert_token_classifier_ner_pathogen_en_3_0.md * Update 2022-08-13-ner_negation_uncertainty_es_3_0.md * Update 2022-08-13-ner_pharmacology_es_3_0.md * Update 2022-08-14-disease_mentions_tweet_es_3_0.md * Update 2022-09-29-redl_oncology_location_biobert_wip_en.md * Update 2022-09-30-ner_oncology_diagnosis_wip_en.md * Update 2022-10-01-assertion_oncology_demographic_binary_wip_en.md * Update 2022-10-01-assertion_oncology_family_history_wip_en.md * Update 2022-10-01-assertion_oncology_problem_wip_en.md * Update 2022-10-01-assertion_oncology_response_to_treatment_wip_en.md * Update 2022-10-01-assertion_oncology_smoking_status_wip_en.md * Update 2022-10-01-assertion_oncology_test_binary_wip_en.md * Update 2022-10-01-assertion_oncology_treatment_binary_wip_en.md * Update 2022-10-01-assertion_oncology_wip_en.md * Update 
2022-10-11-assertion_oncology_demographic_binary_wip_en.md * Update 2022-10-11-assertion_oncology_family_history_wip_en.md * Update 2022-10-11-assertion_oncology_problem_wip_en.md * Update 2022-10-11-assertion_oncology_response_to_treatment_wip_en.md * Update 2022-10-11-assertion_oncology_smoking_status_wip_en.md * Update 2022-10-11-assertion_oncology_treatment_binary_wip_en.md * Update 2022-10-11-assertion_oncology_wip_en.md * Update 2022-09-29-redl_oncology_location_biobert_wip_en.md * Update 2022-10-01-assertion_oncology_family_history_wip_en.md * Update 2022-10-01-assertion_oncology_problem_wip_en.md * Update 2022-03-22-bert_token_classifier_drug_development_trials_en_3_0.md * 2022-06-26-drug_brandname_ndc_mapper_en_3_0.md * Update 2022-06-26-icd10cm_snomed_mapper_en_3_0.md * update 2022-06-26-icd10cm_umls_mapper_en_3_0.md * Update 2022-06-26-icdo_snomed_mapper_en_3_0.md * Update 2022-06-22-ner_living_species_bert_pt_3_0.md * Update 2022-06-22-ner_living_species_bert_es_3_0.md * Update 2022-06-22-ner_living_species_bert_pt_3_0.md * Update 2022-06-22-ner_living_species_biobert_en_3_0.md * Update 2022-06-22-ner_living_species_en_3_0.md * Update 2022-06-22-ner_living_species_es_3_0.md * Update 2022-06-22-ner_living_species_pt_3_0.md * Update 2022-06-22-ner_living_species_roberta_es_3_0.md * Update 2022-06-22-ner_living_species_roberta_pt_3_0.md * Updated md * Updates * Update 2021-01-18-re_bodypart_directions_en.md Co-authored-by: Cabir C <64752006+Cabir40@users.noreply.github.com> Co-authored-by: Vildan <64216738+Meryem1425@users.noreply.github.com> --- ...21-03-29-recognize_entities_posology_en.md | 195 ++--- ...assifier_drug_development_trials_en_3_0.md | 4 +- ...-06-26-drug_brandname_ndc_mapper_en_3_0.md | 2 +- ...2022-06-26-icd10cm_snomed_mapper_en_3_0.md | 4 +- .../2022-06-26-icd10cm_umls_mapper_en_3_0.md | 4 +- .../2022-06-26-icdo_snomed_mapper_en_3_0.md | 4 +- ...-28-drug_action_treatment_mapper_en_3_0.md | 4 +- .../2021-01-18-ner_radiology_en.md | 313 
++++---- .../2021-01-18-re_bodypart_directions_en.md | 342 +++++---- .../2021-01-18-re_bodypart_problem_en.md | 296 +++++--- ...2021-01-18-re_bodypart_proceduretest_en.md | 313 ++++---- .../2021-01-18-re_date_clinical_en.md | 298 +++++--- .../2021-01-20-ner_deid_augmented_en.md | 316 ++++---- ...22-06-22-ner_living_species_bert_es_3_0.md | 6 +- ...22-06-22-ner_living_species_bert_pt_3_0.md | 6 +- ...06-22-ner_living_species_biobert_en_3_0.md | 6 +- .../2022-06-22-ner_living_species_en_3_0.md | 6 +- .../2022-06-22-ner_living_species_es_3_0.md | 6 +- .../2022-06-22-ner_living_species_pt_3_0.md | 10 +- ...06-22-ner_living_species_roberta_es_3_0.md | 10 +- ...06-22-ner_living_species_roberta_pt_3_0.md | 10 +- ...22-06-23-ner_living_species_bert_fr_3_0.md | 10 +- ...22-06-23-ner_living_species_bert_it_3_0.md | 10 +- ...22-06-23-ner_living_species_bert_ro_3_0.md | 10 +- .../2022-06-23-ner_living_species_ca_3_0.md | 10 +- .../2022-06-23-ner_living_species_fr_3_0.md | 10 +- .../2022-06-23-ner_living_species_gl_3_0.md | 12 +- .../2022-06-23-ner_living_species_it_3_0.md | 10 +- ...en_classifier_ner_living_species_en_3_0.md | 10 +- ...en_classifier_ner_living_species_es_3_0.md | 10 +- ...en_classifier_ner_living_species_it_3_0.md | 10 +- ...en_classifier_ner_living_species_pt_3_0.md | 10 +- ...bert_token_classifier_ner_anatem_en_3_0.md | 12 +- ..._token_classifier_ner_bc2gm_gene_en_3_0.md | 12 +- ...lassifier_ner_bc4chemd_chemicals_en_3_0.md | 12 +- ..._classifier_ner_bc5cdr_chemicals_en_3_0.md | 12 +- ...en_classifier_ner_bc5cdr_disease_en_3_0.md | 11 +- ...n_classifier_ner_jnlpba_cellular_en_3_0.md | 12 +- ..._classifier_ner_linnaeus_species_en_3_0.md | 12 +- ...oken_classifier_ner_ncbi_disease_en_3_0.md | 12 +- ...ert_token_classifier_ner_species_en_3_0.md | 12 +- ...2-10-25-legclf_bert_swiss_judgements_de.md | 4 +- ...2-10-25-legclf_bert_swiss_judgements_en.md | 3 +- ...2-10-25-legclf_bert_swiss_judgements_fr.md | 4 +- ...2-10-25-legclf_bert_swiss_judgements_it.md | 4 +- 
...-10-25-legner_indian_court_judgement_en.md | 10 +- ...2-10-25-legner_indian_court_preamble_en.md | 9 +- ...2-10-27-legclf_bert_swiss_judgements_de.md | 4 +- ...2-10-27-legclf_bert_swiss_judgements_en.md | 3 +- ...2-10-27-legclf_bert_swiss_judgements_fr.md | 4 +- ...2-10-27-legclf_bert_swiss_judgements_it.md | 4 +- .../2021-01-29-ner_drugs_large_en.md | 301 ++++---- .../2021-07-23-cantemist_scielowiki_es.md | 681 +++++++++--------- ...lassifier_disease_mentions_tweet_es_3_0.md | 9 +- ...rt_token_classifier_ner_pathogen_en_3_0.md | 9 +- ...rt_token_classifier_pharmacology_es_3_0.md | 9 +- ...2-08-13-ner_negation_uncertainty_es_3_0.md | 18 +- .../2022-08-13-ner_pharmacology_es_3_0.md | 18 +- ...022-08-14-disease_mentions_tweet_es_3_0.md | 14 +- ...on_oncology_treatment_binary_wip_en_3_0.md | 4 +- .../2022-07-25-ner_oncology_wip_en_3_0.md | 4 +- .../2022-09-26-re_oncology_size_wip_en.md | 4 +- ...-27-re_oncology_biomarker_result_wip_en.md | 4 +- .../2022-09-27-re_oncology_granular_wip_en.md | 4 +- .../2022-09-27-re_oncology_location_wip_en.md | 4 +- .../2022-09-27-re_oncology_temporal_wip_en.md | 4 +- ...22-09-27-re_oncology_test_result_wip_en.md | 4 +- .../2022-09-27-re_oncology_wip_en.md | 4 +- ...09-28-redl_oncology_size_biobert_wip_en.md | 4 +- ...2022-09-29-redl_oncology_biobert_wip_en.md | 4 +- ...ncology_biomarker_result_biobert_wip_en.md | 4 +- ...9-redl_oncology_granular_biobert_wip_en.md | 4 +- ...9-redl_oncology_location_biobert_wip_en.md | 13 +- ...9-redl_oncology_temporal_biobert_wip_en.md | 4 +- ...edl_oncology_test_result_biobert_wip_en.md | 4 +- ...-30-ner_oncology_anatomy_general_wip_en.md | 4 +- ...-09-30-ner_oncology_demographics_wip_en.md | 4 +- ...022-09-30-ner_oncology_diagnosis_wip_en.md | 10 +- .../2022-09-30-ner_oncology_test_wip_en.md | 4 +- .../2022-09-30-ner_oncology_therapy_wip_en.md | 8 +- .../2022-09-30-ner_oncology_tnm_wip_en.md | 4 +- ...ner_oncology_unspecific_posology_wip_en.md | 4 +- .../2022-09-30-ner_oncology_wip_en.md | 8 +- 
...tion_oncology_demographic_binary_wip_en.md | 18 +- ...ssertion_oncology_family_history_wip_en.md | 21 +- ...10-01-assertion_oncology_problem_wip_en.md | 22 +- ...n_oncology_response_to_treatment_wip_en.md | 19 +- ...ssertion_oncology_smoking_status_wip_en.md | 19 +- ...1-assertion_oncology_test_binary_wip_en.md | 19 +- ...ertion_oncology_treatment_binary_wip_en.md | 19 +- .../2022-10-01-assertion_oncology_wip_en.md | 19 +- ...01-ner_oncology_anatomy_granular_wip_en.md | 4 +- ...022-10-01-ner_oncology_biomarker_wip_en.md | 4 +- ...2022-10-01-ner_oncology_posology_wip_en.md | 4 +- ...r_oncology_response_to_treatment_wip_en.md | 4 +- ...tion_oncology_demographic_binary_wip_en.md | 19 +- ...ssertion_oncology_family_history_wip_en.md | 19 +- ...10-11-assertion_oncology_problem_wip_en.md | 18 +- ...n_oncology_response_to_treatment_wip_en.md | 19 +- ...ssertion_oncology_smoking_status_wip_en.md | 16 +- ...ertion_oncology_treatment_binary_wip_en.md | 17 +- .../2022-10-11-assertion_oncology_wip_en.md | 19 +- ...1-01-29-deidentify_enriched_clinical_en.md | 292 ++++---- 103 files changed, 2238 insertions(+), 1954 deletions(-) diff --git a/docs/_posts/C-K-Loan/2021-03-29-recognize_entities_posology_en.md b/docs/_posts/C-K-Loan/2021-03-29-recognize_entities_posology_en.md index b1cc389f6fcf9..052a7ed7aa5e0 100644 --- a/docs/_posts/C-K-Loan/2021-03-29-recognize_entities_posology_en.md +++ b/docs/_posts/C-K-Loan/2021-03-29-recognize_entities_posology_en.md @@ -1,97 +1,98 @@ ---- -layout: model -title: Recognize Posology Pipeline -author: John Snow Labs -name: recognize_entities_posology -date: 2021-03-29 -tags: [ner, named_entity_recognition, pos, parts_of_speech, posology, ner_posology, pipeline, en, licensed] -task: [Named Entity Recognition, Part of Speech Tagging] -language: en -edition: Healthcare NLP 3.0.0 -spark_version: 3.0 -supported: true -article_header: -type: cover -use_language_switcher: "Python-Scala-Java" ---- - -## Description - -This pipeline detects drugs, 
dosage, form, frequency, duration, route, and drug strength in text. - -## Predicted Entities -`DRUG`, `STRENGTH`, `DURATION`, `FREQUENCY`, `FORM`, `DOSAGE`, `ROUTE`. - -{:.btn-box} -[Live Demo](https://demo.johnsnowlabs.com/healthcare/NER_POSOLOGY/){:.button.button-orange} -[Open in Colab](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp-workshop/blob/master/tutorials/Certification_Trainings/Healthcare/1.Clinical_Named_Entity_Recognition_Model.ipynb){:.button.button-orange.button-orange-trans.co.button-icon} -[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/clinical/models/recognize_entities_posology_en_3.0.0_3.0_1617042229126.zip){:.button.button-orange.button-orange-trans.arr.button-icon} - -## How to use - - - -
-{% include programmingLanguageSelectScalaPythonNLU.html %} -```python -from sparknlp.pretrained import PretrainedPipelinein -pipeline = PretrainedPipeline('recognize_entities_posology', lang = 'en') -annotations = pipeline.fullAnnotate(""The patient was perscriped 50MG penicilin for is headache"")[0] -annotations.keys() - -``` -```scala - -val pipeline = new PretrainedPipeline("recognize_entities_posology", lang = "en") -val result = pipeline.fullAnnotate("The patient was perscriped 50MG penicilin for is headache")(0) - -``` - -{:.nlu-block} -```python -import nlu - -result_df = nlu.load('ner.posology').predict("The patient was perscriped 50MG penicilin for is headache") -result_df - -``` -
- -## Results - -```bash -+-----------------------------------------+ -|result | -+-----------------------------------------+ -|[O, O, O, O, B-Strength, B-Drug, O, O, O]||ner ||[[named_entity, 0, 2, O, [word -> The, confidence -> 1.0], []], [named_entity, 4, 10, O, [word -> patient, confidence -> 0.9993], []], [named_entity, 12, 14, O, [word -> was, confidence -> 1.0], []], [named_entity, 16, 25, O, [word -> perscriped, confidence -> 0.9985], []], [named_entity, 27, 30, B-Strength, [word -> 50MG, confidence -> 0.9966], []], [named_entity, 32, 40, B-Drug, [word -> penicilin, confidence -> 0.9934], []], [named_entity, 42, 44, O, [word -> for, confidence -> 0.9999], []], [named_entity, 46, 47, O, [word -> is, confidence -> 0.9468], []], [named_entity, 49, 56, O, [word -> headache, confidence -> 0.9805], []]]|``` - -{:.model-param} -## Model Information - -{:.table-model} -|---|---| -|Model Name:|recognize_entities_posology| -|Type:|pipeline| -|Compatibility:|Healthcare NLP 3.0.0+| -|License:|Licensed| -|Edition:|Official| -|Language:|en| - -## Included Models - -- DocumentAssembler -- SentenceDetector -- TokenizerModel -- WordEmbeddingsModel -- NerDLModel -- NerConverter +--- +layout: model +title: Recognize Posology Pipeline +author: John Snow Labs +name: recognize_entities_posology +date: 2021-03-29 +tags: [ner, named_entity_recognition, pos, parts_of_speech, posology, ner_posology, pipeline, en, licensed] +task: [Named Entity Recognition, Part of Speech Tagging] +language: en +edition: Healthcare NLP 3.0.0 +spark_version: 3.0 +supported: true +article_header: +type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +This pipeline detects drugs, dosage, form, frequency, duration, route, and drug strength in text. + +## Predicted Entities +`DRUG`, `STRENGTH`, `DURATION`, `FREQUENCY`, `FORM`, `DOSAGE`, `ROUTE`. 
+ +{:.btn-box} +[Live Demo](https://demo.johnsnowlabs.com/healthcare/NER_POSOLOGY/){:.button.button-orange} +[Open in Colab](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp-workshop/blob/master/tutorials/Certification_Trainings/Healthcare/1.Clinical_Named_Entity_Recognition_Model.ipynb){:.button.button-orange.button-orange-trans.co.button-icon} +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/clinical/models/recognize_entities_posology_en_3.0.0_3.0_1617042229126.zip){:.button.button-orange.button-orange-trans.arr.button-icon} + +## How to use + + + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +from sparknlp.pretrained import PretrainedPipeline +pipeline = PretrainedPipeline('recognize_entities_posology', lang = 'en') +annotations = pipeline.fullAnnotate("The patient was perscriped 50MG penicilin for is headache")[0] +annotations.keys() + +``` +```scala + +val pipeline = new PretrainedPipeline("recognize_entities_posology", lang = "en") +val result = pipeline.fullAnnotate("The patient was perscriped 50MG penicilin for is headache")(0) + +``` + +{:.nlu-block} +```python +import nlu + +result_df = nlu.load('ner.posology').predict("The patient was perscriped 50MG penicilin for is headache") +result_df + +``` +
+ +## Results + +```bash ++-----------------------------------------+ +|result | ++-----------------------------------------+ +|[O, O, O, O, B-Strength, B-Drug, O, O, O]| ++-----------------------------------------+ + ++---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +|ner ||[[named_entity, 0, 2, O, [word -> The, confidence -> 1.0], []], [named_entity, 4, 10, O, [word -> patient, confidence -> 0.9993], []], [named_entity, 12, 14, O, [word -> was, confidence -> 1.0], []], [named_entity, 16, 25, O, [word -> perscriped, confidence -> 0.9985], []], [named_entity, 27, 30, B-Strength, [word -> 50MG, confidence -> 0.9966], []], [named_entity, 32, 40, B-Drug, [word -> penicilin, confidence -> 0.9934], []], [named_entity, 42, 44, O, [word -> for, confidence -> 0.9999], []], [named_entity, 46, 47, O, [word -> is, confidence -> 0.9468], []], [named_entity, 49, 56, O, [word -> headache, confidence -> 0.9805], []]]|``` + +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|recognize_entities_posology| +|Type:|pipeline| +|Compatibility:|Healthcare NLP 3.0.0+| +|License:|Licensed| +|Edition:|Official| +|Language:|en| + +## Included Models + +- DocumentAssembler +- SentenceDetector +- TokenizerModel +- WordEmbeddingsModel +- NerDLModel +- NerConverter + diff --git a/docs/_posts/Damla-Gurbaz/2022-03-22-bert_token_classifier_drug_development_trials_en_3_0.md 
b/docs/_posts/Damla-Gurbaz/2022-03-22-bert_token_classifier_drug_development_trials_en_3_0.md index be2eee2dc953e..a25b2eaf14e25 100644 --- a/docs/_posts/Damla-Gurbaz/2022-03-22-bert_token_classifier_drug_development_trials_en_3_0.md +++ b/docs/_posts/Damla-Gurbaz/2022-03-22-bert_token_classifier_drug_development_trials_en_3_0.md @@ -87,12 +87,12 @@ val documentAssembler = DocumentAssembler() val sentenceDetector = SentenceDetectorDLModel.pretrained() -.setInputCols("document") +.setInputCols(Array("document")) .setOutputCol("sentence") val tokenizer = new Tokenizer() -.setInputCols("sentence") +.setInputCols(Array("sentence")) .setOutputCol("token") diff --git a/docs/_posts/Damla-Gurbaz/2022-06-26-drug_brandname_ndc_mapper_en_3_0.md b/docs/_posts/Damla-Gurbaz/2022-06-26-drug_brandname_ndc_mapper_en_3_0.md index 78b1e8fde84b2..c6287ae4155ff 100644 --- a/docs/_posts/Damla-Gurbaz/2022-06-26-drug_brandname_ndc_mapper_en_3_0.md +++ b/docs/_posts/Damla-Gurbaz/2022-06-26-drug_brandname_ndc_mapper_en_3_0.md @@ -64,7 +64,7 @@ val document_assembler = new DocumentAssembler() .setOutputCol("chunk") val chunkerMapper = ChunkMapperModel.pretrained("drug_brandname_ndc_mapper", "en", "clinical/models") -.setInputCols("chunk") +.setInputCols(Array("chunk")) .setOutputCol("ndc") .setRels(Array("Strength_NDC")) .setLowerCase(True) diff --git a/docs/_posts/Damla-Gurbaz/2022-06-26-icd10cm_snomed_mapper_en_3_0.md b/docs/_posts/Damla-Gurbaz/2022-06-26-icd10cm_snomed_mapper_en_3_0.md index 777294ba4b4e7..58dccc8c38440 100644 --- a/docs/_posts/Damla-Gurbaz/2022-06-26-icd10cm_snomed_mapper_en_3_0.md +++ b/docs/_posts/Damla-Gurbaz/2022-06-26-icd10cm_snomed_mapper_en_3_0.md @@ -73,7 +73,7 @@ val documentAssembler = new DocumentAssembler() .setOutputCol("ner_chunk") val sbert_embedder = BertSentenceEmbeddings.pretrained("sbiobert_base_cased_mli", "en", "clinical/models") -.setInputCols("ner_chunk") +.setInputCols(Array("ner_chunk")) .setOutputCol("sbert_embeddings") val icd_resolver = 
SentenceEntityResolverModel.pretrained("sbiobertresolve_icd10cm_augmented_billable_hcc", "en", "clinical/models") @@ -82,7 +82,7 @@ val icd_resolver = SentenceEntityResolverModel.pretrained("sbiobertresolve_icd10 .setDistanceFunction("EUCLIDEAN") val chunkerMapper = ChunkMapperModel.pretrained("icd10cm_snomed_mapper", "en","clinical/models") -.setInputCols("icd10cm_code") +.setInputCols(Array("icd10cm_code")) .setOutputCol("mappings") .setRels(Array("snomed_code")) diff --git a/docs/_posts/Damla-Gurbaz/2022-06-26-icd10cm_umls_mapper_en_3_0.md b/docs/_posts/Damla-Gurbaz/2022-06-26-icd10cm_umls_mapper_en_3_0.md index 99b8783d90ce8..b9cb5244fc75e 100644 --- a/docs/_posts/Damla-Gurbaz/2022-06-26-icd10cm_umls_mapper_en_3_0.md +++ b/docs/_posts/Damla-Gurbaz/2022-06-26-icd10cm_umls_mapper_en_3_0.md @@ -77,7 +77,7 @@ val documentAssembler = new DocumentAssembler() val sbert_embedder = BertSentenceEmbeddings .pretrained("sbiobert_base_cased_mli", "en", "clinical/models") -.setInputCols("ner_chunk") +.setInputCols(Array("ner_chunk")) .setOutputCol("sbert_embeddings") val icd10cm_resolver = SentenceEntityResolverModel @@ -88,7 +88,7 @@ val icd10cm_resolver = SentenceEntityResolverModel val chunkerMapper = ChunkMapperModel .pretrained("icd10cm_umls_mapper", "en", "clinical/models") -.setInputCols("rxnorm_code") +.setInputCols(Array("rxnorm_code")) .setOutputCol("umls_mappings") .setRels(Array("umls_code")) diff --git a/docs/_posts/Damla-Gurbaz/2022-06-26-icdo_snomed_mapper_en_3_0.md b/docs/_posts/Damla-Gurbaz/2022-06-26-icdo_snomed_mapper_en_3_0.md index 4f61eaa6a92fa..c9ea713f3b295 100644 --- a/docs/_posts/Damla-Gurbaz/2022-06-26-icdo_snomed_mapper_en_3_0.md +++ b/docs/_posts/Damla-Gurbaz/2022-06-26-icdo_snomed_mapper_en_3_0.md @@ -78,7 +78,7 @@ val documentAssembler = new DocumentAssembler() val sbert_embedder = BertSentenceEmbeddings .pretrained("sbiobert_base_cased_mli", "en", "clinical/models") -.setInputCols("ner_chunk") +.setInputCols(Array("ner_chunk")) 
.setOutputCol("sbert_embeddings") val icdo_resolver = SentenceEntityResolverModel @@ -89,7 +89,7 @@ val icdo_resolver = SentenceEntityResolverModel val chunkerMapper = ChunkMapperModel .pretrained("icdo_snomed_mapper", "en", "clinical/models") -.setInputCols("icdo_code") +.setInputCols(Array("icdo_code")) .setOutputCol("snomed_mappings") .setRels(Array("snomed_code")) diff --git a/docs/_posts/Damla-Gurbaz/2022-06-28-drug_action_treatment_mapper_en_3_0.md b/docs/_posts/Damla-Gurbaz/2022-06-28-drug_action_treatment_mapper_en_3_0.md index cf0b38b2a0f33..b49b99d859b63 100644 --- a/docs/_posts/Damla-Gurbaz/2022-06-28-drug_action_treatment_mapper_en_3_0.md +++ b/docs/_posts/Damla-Gurbaz/2022-06-28-drug_action_treatment_mapper_en_3_0.md @@ -68,7 +68,7 @@ chunkerMapper_action = ChunkMapperModel.pretrained("drug_action_treatment_mapper .setRels(["action"])\ .setLowerCase(True) -chunkerMapper_treatment = ChunkMapperModel.pretrained("drug_action_treatment_mapper", , "en", "clinical/models")\ +chunkerMapper_treatment = ChunkMapperModel.pretrained("drug_action_treatment_mapper", "en", "clinical/models")\ .setInputCols(["ner_chunk"])\ .setOutputCol("treatment_mappings")\ .setRels(["treatment"])\ @@ -126,7 +126,7 @@ val chunkerMapper_action = ChunkMapperModel.pretrained("drug_action_treatment_ma .setRels(Array("action")) .setLowerCase(True) -val chunkerMapper_treatment = ChunkMapperModel.pretrained("drug_action_treatment_mapper", , "en", "clinical/models") +val chunkerMapper_treatment = ChunkMapperModel.pretrained("drug_action_treatment_mapper", "en", "clinical/models") .setInputCols(Array("ner_chunk")) .setOutputCol("treatment_mappings") .setRels(Array("treatment")) diff --git a/docs/_posts/aydinmyilmaz/2021-01-18-ner_radiology_en.md b/docs/_posts/aydinmyilmaz/2021-01-18-ner_radiology_en.md index 49237b40041ca..622f6f123bf42 100644 --- a/docs/_posts/aydinmyilmaz/2021-01-18-ner_radiology_en.md +++ b/docs/_posts/aydinmyilmaz/2021-01-18-ner_radiology_en.md @@ -1,142 +1,171 @@ ---- 
-layout: model -title: Detect Radiology Related Entities -author: John Snow Labs -name: ner_radiology -date: 2021-01-18 -task: Named Entity Recognition -language: en -edition: Healthcare NLP 2.7.0 -spark_version: 2.4 -tags: [en, ner, licensed, clinical] -supported: true -article_header: -type: cover -use_language_switcher: "Python-Scala-Java" ---- - -## Description - -Pretrained named entity recognition deep learning model for radiology related texts and reports. - -## Predicted Entities - -`ImagingTest`, `Imaging_Technique`, `ImagingFindings`, `OtherFindings`, `BodyPart`, `Direction`, `Test`, `Symptom`, `Disease_Syndrome_Disorder`, `Medical_Device`, `Procedure`, `Measurements`, `Units` - -{:.btn-box} -[Live Demo](https://demo.johnsnowlabs.com/healthcare/NER_RADIOLOGY/){:.button.button-orange} -[Open in Colab](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp-workshop/blob/master/tutorials/Certification_Trainings/Healthcare/1.Clinical_Named_Entity_Recognition_Model.ipynb){:.button.button-orange.button-orange-trans.co.button-icon} -[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/clinical/models/ner_radiology_en_2.7.0_2.4_1610995075088.zip){:.button.button-orange.button-orange-trans.arr.button-icon} - -## How to use - -Use as part of an NLP pipeline with the following stages: DocumentAssembler, SentenceDetector, Tokenizer, WordEmbeddingsModel, NerDLModel. Add the NerConverter to the end of the pipeline to convert entity tokens into full entity chunks. - -
-{% include programmingLanguageSelectScalaPythonNLU.html %} -```python -... -radiology_ner = NerDLModel.pretrained("ner_radiology", "en", "clinical/models") \ -.setInputCols(["sentence", "token", "embeddings"]) \ -.setOutputCol("ner") -nlpPipeline = Pipeline(stages=[document_assembler, sentence_detector, tokenizer, word_embeddings, radiology_ner, ner_converter]) -... -model = nlpPipeline.fit(spark.createDataFrame([['''Bilateral breast ultrasound was subsequently performed, which demonstrated an ovoid mass measuring approximately 0.5 x 0.5 x 0.4 cm in diameter located within the anteromedial aspect of the left shoulder. This mass demonstrates isoechoic echotexture to the adjacent muscle, with no evidence of internal color flow. This may represent benign fibrous tissue or a lipoma.''']]).toDF("text")) - -results = model.transform(data) -``` - -```scala -... -val radiology_ner = NerDLModel().pretrained("ner_radiology", "en", "clinical/models") -.setInputCols(Array("sentence", "token", "embeddings")) -.setOutputCol("ner") - -val nlpPipeline = new Pipeline().setStages(Array(document_assembler, sentence_detector, tokenizer, word_embeddings, radiology_ner, ner_converter)) -val result = pipeline.fit(Seq.empty[String]).transform(data) - -``` - - -{:.nlu-block} -```python -import nlu -nlu.load("en.med_ner.radiology").predict("""Bilateral breast ultrasound was subsequently performed, which demonstrated an ovoid mass measuring approximately 0.5 x 0.5 x 0.4 cm in diameter located within the anteromedial aspect of the left shoulder. This mass demonstrates isoechoic echotexture to the adjacent muscle, with no evidence of internal color flow. This may represent benign fibrous tissue or a lipoma.""") -``` - -
- -## Results - -```bash -| | chunks | entities | -|----|-----------------------|---------------------------| -| 0 | Bilateral | Direction | -| 1 | breast | BodyPart | -| 2 | ultrasound | ImagingTest | -| 3 | ovoid mass | ImagingFindings | -| 4 | 0.5 x 0.5 x 0.4 | Measurements | -| 5 | cm | Units | -| 6 | anteromedial aspect | Direction | -| 7 | left | Direction | -| 8 | shoulder | BodyPart | -| 9 | mass | ImagingFindings | -| 10 | isoechoic echotexture | ImagingFindings | -| 11 | muscle | BodyPart | -| 12 | internal color flow | ImagingFindings | -| 13 | benign fibrous tissue | ImagingFindings | -| 14 | lipoma | Disease_Syndrome_Disorder | -``` - -{:.model-param} -## Model Information - -{:.table-model} -|---|---| -|Model Name:|ner_radiology| -|Type:|ner| -|Compatibility:|Spark NLP 2.7.0+| -|License:|Licensed| -|Edition:|Official| -|Input Labels:|[sentence, token, embeddings]| -|Output Labels:|[ner]| -|Language:|en| -|Dependencies:|embeddings_clinical| - -## Data Source - -Trained on a custom dataset comprising of MIMIC-CXR and MT Radiology texts - -## Benchmarking - -```bash -+--------------------+------+-----+-----+------+---------+------+------+ -| entity| tp| fp| fn| total|precision|recall| f1| -+--------------------+------+-----+-----+------+---------+------+------+ -| OtherFindings| 8.0| 15.0| 63.0| 71.0| 0.3478|0.1127|0.1702| -| Measurements| 481.0| 30.0| 15.0| 496.0| 0.9413|0.9698|0.9553| -| Direction| 650.0|137.0| 94.0| 744.0| 0.8259|0.8737|0.8491| -| ImagingFindings|1345.0|355.0|324.0|1669.0| 0.7912|0.8059|0.7985| -| BodyPart|1942.0|335.0|290.0|2232.0| 0.8529|0.8701|0.8614| -| Medical_Device| 236.0| 75.0| 64.0| 300.0| 0.7588|0.7867|0.7725| -| Test| 222.0| 41.0| 48.0| 270.0| 0.8441|0.8222| 0.833| -| Procedure| 269.0|117.0|116.0| 385.0| 0.6969|0.6987|0.6978| -| ImagingTest| 263.0| 50.0| 43.0| 306.0| 0.8403|0.8595|0.8498| -| Symptom| 498.0|101.0|132.0| 630.0| 0.8314|0.7905|0.8104| -|Disease_Syndrome_...|1180.0|258.0|200.0|1380.0| 0.8206|0.8551|0.8375| -| 
Units| 269.0| 10.0| 2.0| 271.0| 0.9642|0.9926|0.9782| -| Imaging_Technique| 140.0| 38.0| 25.0| 165.0| 0.7865|0.8485|0.8163| -+--------------------+------+-----+-----+------+---------+------+------+ - -+------------------+ -| macro| -+------------------+ -|0.7524248724038437| -+------------------+ - -+------------------+ -| micro| -+------------------+ -|0.8315240382681794| -+------------------+ -``` +--- +layout: model +title: Detect Radiology Related Entities +author: John Snow Labs +name: ner_radiology +date: 2021-01-18 +task: Named Entity Recognition +language: en +edition: Spark NLP for Healthcare 2.7.0 +spark_version: 2.4 +tags: [en, ner, licensed, clinical] +supported: true +article_header: +type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +Pretrained named entity recognition deep learning model for radiology related texts and reports. + +## Predicted Entities + +`ImagingTest`, `Imaging_Technique`, `ImagingFindings`, `OtherFindings`, `BodyPart`, `Direction`, `Test`, `Symptom`, `Disease_Syndrome_Disorder`, `Medical_Device`, `Procedure`, `Measurements`, `Units` + +{:.btn-box} +[Live Demo](https://demo.johnsnowlabs.com/healthcare/NER_RADIOLOGY/){:.button.button-orange} +[Open in Colab](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp-workshop/blob/master/tutorials/Certification_Trainings/Healthcare/1.Clinical_Named_Entity_Recognition_Model.ipynb){:.button.button-orange.button-orange-trans.co.button-icon} +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/clinical/models/ner_radiology_en_2.7.0_2.4_1610995075088.zip){:.button.button-orange.button-orange-trans.arr.button-icon} + +## How to use + +Use as part of an NLP pipeline with the following stages: DocumentAssembler, SentenceDetector, Tokenizer, WordEmbeddingsModel, NerDLModel. Add the NerConverter to the end of the pipeline to convert entity tokens into full entity chunks. + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} + +```python +document_assembler = DocumentAssembler()\ + .setInputCol("text")\ + .setOutputCol("document") + +sentence_detector = SentenceDetectorDLModel.pretrained("sentence_detector_dl_healthcare", "en", "clinical/models") \ + .setInputCols(["document"]) \ + .setOutputCol("sentence") + +tokenizer = Tokenizer()\ + .setInputCols(["sentence"])\ + .setOutputCol("token") + +word_embeddings = WordEmbeddingsModel.pretrained("embeddings_clinical", "en", "clinical/models")\ + .setInputCols(["sentence", "token"])\ + .setOutputCol("embeddings") + +radiology_ner = NerDLModel.pretrained("ner_radiology", "en", "clinical/models") \ + .setInputCols(["sentence", "token", "embeddings"]) \ + .setOutputCol("ner") + +ner_converter = NerConverter() \ + .setInputCols(["sentence", "token", "ner"]) \ + .setOutputCol("entities") + +nlpPipeline = Pipeline(stages=[document_assembler, sentence_detector, tokenizer, word_embeddings, radiology_ner, ner_converter]) + +data = spark.createDataFrame([["Bilateral breast ultrasound was subsequently performed, which demonstrated an ovoid mass measuring approximately 0.5 x 0.5 x 0.4 cm in diameter located within the anteromedial aspect of the left shoulder. This mass demonstrates isoechoic echotexture to the adjacent muscle, with no evidence of internal color flow. 
This may represent benign fibrous tissue or a lipoma."]]).toDF("text") + +results = nlpPipeline.fit(data).transform(data) +``` + +```scala +val document_assembler = new DocumentAssembler() + .setInputCol("text") + .setOutputCol("document") + +val sentence_detector = SentenceDetectorDLModel.pretrained("sentence_detector_dl_healthcare", "en", "clinical/models") + .setInputCols("document") + .setOutputCol("sentence") + +val tokenizer = new Tokenizer() + .setInputCols("sentence") + .setOutputCol("token") + +val word_embeddings = WordEmbeddingsModel.pretrained("embeddings_clinical", "en", "clinical/models") + .setInputCols(Array("sentence", "token")) + .setOutputCol("embeddings") + +val radiology_ner = NerDLModel.pretrained("ner_radiology", "en", "clinical/models") + .setInputCols(Array("sentence", "token", "embeddings")) + .setOutputCol("ner") + +val ner_converter = new NerConverter() + .setInputCols(Array("sentence", "token", "ner")) + .setOutputCol("entities") + +val nlpPipeline = new Pipeline().setStages(Array(document_assembler, sentence_detector, tokenizer, word_embeddings, radiology_ner, ner_converter)) + +val data = Seq("""Bilateral breast ultrasound was subsequently performed, which demonstrated an ovoid mass measuring approximately 0.5 x 0.5 x 0.4 cm in diameter located within the anteromedial aspect of the left shoulder. This mass demonstrates isoechoic echotexture to the adjacent muscle, with no evidence of internal color flow. This may represent benign fibrous tissue or a lipoma.""").toDS.toDF("text") + +val result = nlpPipeline.fit(data).transform(data) +``` + + +{:.nlu-block} +```python +import nlu +nlu.load("en.med_ner.radiology").predict("""Bilateral breast ultrasound was subsequently performed, which demonstrated an ovoid mass measuring approximately 0.5 x 0.5 x 0.4 cm in diameter located within the anteromedial aspect of the left shoulder. This mass demonstrates isoechoic echotexture to the adjacent muscle, with no evidence of internal color flow. 
This may represent benign fibrous tissue or a lipoma.""") +``` + +
+ +## Results + +```bash +| | chunks | entities | +|----|-----------------------|---------------------------| +| 0 | Bilateral | Direction | +| 1 | breast | BodyPart | +| 2 | ultrasound | ImagingTest | +| 3 | ovoid mass | ImagingFindings | +| 4 | 0.5 x 0.5 x 0.4 | Measurements | +| 5 | cm | Units | +| 6 | anteromedial aspect | Direction | +| 7 | left | Direction | +| 8 | shoulder | BodyPart | +| 9 | mass | ImagingFindings | +| 10 | isoechoic echotexture | ImagingFindings | +| 11 | muscle | BodyPart | +| 12 | internal color flow | ImagingFindings | +| 13 | benign fibrous tissue | ImagingFindings | +| 14 | lipoma | Disease_Syndrome_Disorder | +``` + +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|ner_radiology| +|Type:|ner| +|Compatibility:|Spark NLP 2.7.0+| +|License:|Licensed| +|Edition:|Official| +|Input Labels:|[sentence, token, embeddings]| +|Output Labels:|[ner]| +|Language:|en| +|Dependencies:|embeddings_clinical| + +## Data Source + +Trained on a custom dataset comprising MIMIC-CXR and MT Radiology texts + +## Benchmarking + +```bash + label tp fp fn total precision recall f1 + OtherFindings 8.0 15.0 63.0 71.0 0.3478 0.1127 0.1702 + Measurements 481.0 30.0 15.0 496.0 0.9413 0.9698 0.9553 + Direction 650.0 137.0 94.0 744.0 0.8259 0.8737 0.8491 + ImagingFindings 1345.0 355.0 324.0 1669.0 0.7912 0.8059 0.7985 + BodyPart 1942.0 335.0 290.0 2232.0 0.8529 0.8701 0.8614 + Medical_Device 236.0 75.0 64.0 300.0 0.7588 0.7867 0.7725 + Test 222.0 41.0 48.0 270.0 0.8441 0.8222 0.833 + Procedure 269.0 117.0 116.0 385.0 0.6969 0.6987 0.6978 + ImagingTest 263.0 50.0 43.0 306.0 0.8403 0.8595 0.8498 + Symptom 498.0 101.0 132.0 630.0 0.8314 0.7905 0.8104 +Disease_Syndrome_... 
1180.0 258.0 200.0 1380.0 0.8206 0.8551 0.8375 + Units 269.0 10.0 2.0 271.0 0.9642 0.9926 0.9782 + Imaging_Technique 140.0 38.0 25.0 165.0 0.7865 0.8485 0.8163 + macro - - - - - - 0.7524 + micro - - - - - - 0.8315 +``` diff --git a/docs/_posts/aydinmyilmaz/2021-01-18-re_bodypart_directions_en.md b/docs/_posts/aydinmyilmaz/2021-01-18-re_bodypart_directions_en.md index 9c18ee0e61e77..3d08ec5d68af1 100644 --- a/docs/_posts/aydinmyilmaz/2021-01-18-re_bodypart_directions_en.md +++ b/docs/_posts/aydinmyilmaz/2021-01-18-re_bodypart_directions_en.md @@ -1,143 +1,199 @@ ---- -layout: model -title: Relation extraction between body parts and direction entities -author: John Snow Labs -name: re_bodypart_directions -date: 2021-01-18 -task: Relation Extraction -language: en -edition: Healthcare NLP 2.7.1 -spark_version: 2.4 -tags: [en, relation_extraction, clinical, licensed] -supported: true -article_header: - type: cover -use_language_switcher: "Python-Scala-Java" ---- - -## Description - -Relation extraction between body parts entites [Internal_organ_or_component, External_body_part_or_region] and Direction entity in clinical texts. `1` : Shows there is a relation between the body part entity and the direction entity, `0` : Shows there is no relation between the body part entity and the direction entity. - -## Predicted Entities - -`0`, `1` - -{:.btn-box} - -[Open in Colab](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp-workshop/blob/master/tutorials/Certification_Trainings/Healthcare/10.Clinical_Relation_Extraction.ipynb#scrollTo=D8TtVuN-Ee8s){:.button.button-orange.button-orange-trans.co.button-icon} -[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/clinical/models/re_bodypart_directions_en_2.7.1_2.4_1610983817042.zip){:.button.button-orange.button-orange-trans.arr.button-icon} - -## How to use - -In the table below, `re_bodypart_directions` RE model, its labels, optimal NER model, and meaningful relation pairs are illustrated. 
- - - -| RE MODEL | RE MODEL LABELS | NER MODEL | RE PAIRS | -|:----------------------:|:---------------:|:---------:|:-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------:| -| re_bodypart_directions | 0,1 | ner_jsl | [“direction-external_body_part_or_region”,
“external_body_part_or_region-direction”,
“direction-internal_organ_or_component”,
“internal_organ_or_component-direction”] | - - - - -
-{% include programmingLanguageSelectScalaPythonNLU.html %} -```python - -words_embedder = WordEmbeddingsModel()\ - .pretrained("embeddings_clinical", "en", "clinical/models")\ - .setInputCols(["sentences", "tokens"])\ - .setOutputCol("embeddings") - -ner_tagger = sparknlp.annotators.NerDLModel()\ - .pretrained("jsl_ner_wip_greedy_clinical","en","clinical/models")\ - .setInputCols("sentences", "tokens", "embeddings")\ - .setOutputCol("ner_tags") - -pair_list = ['direction-internal_organ_or_component', 'internal_organ_or_component-direction'] - -re_model = RelationExtractionModel().pretrained("re_bodypart_directions","en","clinical/models")\ - .setInputCols(["embeddings", "pos_tags", "ner_chunks", "dependencies"])\ - .setOutputCol("relations")\ - .setMaxSyntacticDistance(4)\ - .setRelationPairs(pair_list) - - -pipeline = Pipeline(stages=[documenter, sentencer, tokenizer, words_embedder, pos_tagger, ner_tagger, ner_chunker, dependency_parser, re_model]) -model = pipeline.fit(spark.createDataFrame([[""]]).toDF("text")) - -results = LightPipeline(model).fullAnnotate(''' MRI demonstrated infarction in the upper brain stem , left cerebellum and right basil ganglia ''') -``` - -```scala -... 
-val words_embedder = WordEmbeddingsModel() - .pretrained("embeddings_clinical", "en", "clinical/models") - .setInputCols(Array("sentences", "tokens")) - .setOutputCol("embeddings") - -val ner_tagger = sparknlp.annotators.NerDLModel() - .pretrained("jsl_ner_wip_greedy_clinical","en","clinical/models") - .setInputCols("sentences", "tokens", "embeddings") - .setOutputCol("ner_tags") - -val pair_list = Array('direction-internal_organ_or_component', 'internal_organ_or_component-direction') - -val re_model = RelationExtractionModel().pretrained("re_bodypart_directions","en","clinical/models") - .setInputCols(Array("embeddings", "pos_tags", "ner_chunks", "dependencies")) - .setOutputCol("relations") - .setMaxSyntacticDistance(4) - .setRelationPairs(pair_list) - -val nlpPipeline = new Pipeline().setStages(Array(documenter, sentencer, tokenizer, words_embedder, pos_tagger, ner_tagger, ner_chunker, dependency_parser, re_model)) -val result = pipeline.fit(Seq.empty[String]).transform(data) - -``` - - -
- -## Results - -```bash -| index | relations | entity1 | entity1_begin | entity1_end | chunk1 | entity2 | entity2_end | entity2_end | chunk2 | confidence | -|-------|-----------|-----------------------------|---------------|-------------|------------|-----------------------------|-------------|-------------|---------------|------------| -| 0 | 1 | Direction | 35 | 39 | upper | Internal_organ_or_component | 41 | 50 | brain stem | 0.9999989 | -| 1 | 0 | Direction | 35 | 39 | upper | Internal_organ_or_component | 59 | 68 | cerebellum | 0.99992585 | -| 2 | 0 | Direction | 35 | 39 | upper | Internal_organ_or_component | 81 | 93 | basil ganglia | 0.9999999 | -| 3 | 0 | Internal_organ_or_component | 41 | 50 | brain stem | Direction | 54 | 57 | left | 0.999811 | -| 4 | 0 | Internal_organ_or_component | 41 | 50 | brain stem | Direction | 75 | 79 | right | 0.9998203 | -| 5 | 1 | Direction | 54 | 57 | left | Internal_organ_or_component | 59 | 68 | cerebellum | 1.0 | -| 6 | 0 | Direction | 54 | 57 | left | Internal_organ_or_component | 81 | 93 | basil ganglia | 0.97616416 | -| 7 | 0 | Internal_organ_or_component | 59 | 68 | cerebellum | Direction | 75 | 79 | right | 0.953046 | -| 8 | 1 | Direction | 75 | 79 | right | Internal_organ_or_component | 81 | 93 | basil ganglia | 1.0 | -``` - -{:.model-param} -## Model Information - -{:.table-model} -|---|---| -|Model Name:|re_bodypart_directions| -|Type:|re| -|Compatibility:|Spark NLP 2.7.1+| -|License:|Licensed| -|Edition:|Official| -|Input Labels:|[embeddings, pos_tags, train_ner_chunks, dependencies]| -|Output Labels:|[relations]| -|Language:|en| -|Dependencies:|embeddings_clinical| - -## Data Source - -Trained on data gathered and manually annotated by John Snow Labs - -## Benchmarking - -```bash -| relation | recall | precision | f1 | -|----------|--------|-----------|------| -| 0 | 0.87 | 0.9 | 0.88 | -| 1 | 0.99 | 0.99 | 0.99 | -``` +--- +layout: model +title: Relation extraction between body parts and direction entities 
+author: John Snow Labs +name: re_bodypart_directions +date: 2021-01-18 +task: Relation Extraction +language: en +edition: Spark NLP for Healthcare 2.7.1 +spark_version: 2.4 +tags: [en, relation_extraction, clinical, licensed] +supported: true +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +Relation extraction between body part entities [Internal_organ_or_component, External_body_part_or_region] and Direction entity in clinical texts. `1` : Shows there is a relation between the body part entity and the direction entity, `0` : Shows there is no relation between the body part entity and the direction entity. + +## Predicted Entities + +`0`, `1` + +{:.btn-box} + +[Open in Colab](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp-workshop/blob/master/tutorials/Certification_Trainings/Healthcare/10.Clinical_Relation_Extraction.ipynb#scrollTo=D8TtVuN-Ee8s){:.button.button-orange.button-orange-trans.co.button-icon} +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/clinical/models/re_bodypart_directions_en_2.7.1_2.4_1610983817042.zip){:.button.button-orange.button-orange-trans.arr.button-icon} + +## How to use + +In the table below, `re_bodypart_directions` RE model, its labels, optimal NER model, and meaningful relation pairs are illustrated. + + + +| RE MODEL | RE MODEL LABELS | NER MODEL | RE PAIRS | +|:----------------------:|:---------------:|:---------:|:-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------:| +| re_bodypart_directions | 0,1 | ner_jsl | [“direction-external_body_part_or_region”,
“external_body_part_or_region-direction”,
“direction-internal_organ_or_component”,
“internal_organ_or_component-direction”] | + + + + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} + +```python +documenter = DocumentAssembler()\ + .setInputCol("text")\ + .setOutputCol("document") + +sentencer = SentenceDetector()\ + .setInputCols(["document"])\ + .setOutputCol("sentences") + +tokenizer = Tokenizer()\ + .setInputCols(["sentences"])\ + .setOutputCol("tokens") + +words_embedder = WordEmbeddingsModel()\ + .pretrained("embeddings_clinical", "en", "clinical/models")\ + .setInputCols(["sentences", "tokens"])\ + .setOutputCol("embeddings") + +pos_tagger = PerceptronModel()\ + .pretrained("pos_clinical", "en", "clinical/models") \ + .setInputCols(["sentences", "tokens"])\ + .setOutputCol("pos_tags") + +ner_tagger = MedicalNerModel()\ + .pretrained("jsl_ner_wip_greedy_clinical","en","clinical/models")\ + .setInputCols("sentences", "tokens", "embeddings")\ + .setOutputCol("ner_tags") + +ner_chunker = NerConverterInternal()\ + .setInputCols(["sentences", "tokens", "ner_tags"])\ + .setOutputCol("ner_chunks") + +dependency_parser = DependencyParserModel()\ + .pretrained("dependency_conllu", "en")\ + .setInputCols(["sentences", "pos_tags", "tokens"])\ + .setOutputCol("dependencies") + +pair_list = ['direction-internal_organ_or_component', 'internal_organ_or_component-direction'] + +re_model = RelationExtractionModel().pretrained("re_bodypart_directions","en","clinical/models")\ + .setInputCols(["embeddings", "pos_tags", "ner_chunks", "dependencies"])\ + .setOutputCol("relations")\ + .setMaxSyntacticDistance(4)\ + .setRelationPairs(pair_list) + + +pipeline = Pipeline(stages=[documenter, sentencer, tokenizer, words_embedder, pos_tagger, ner_tagger, ner_chunker, dependency_parser, re_model]) + +model = pipeline.fit(spark.createDataFrame([[""]]).toDF("text")) + +results = LightPipeline(model).fullAnnotate(''' MRI demonstrated infarction in the upper brain stem , left cerebellum and right basil ganglia ''') +``` + +```scala +val documenter = new DocumentAssembler() + .setInputCol("text") + 
.setOutputCol("document") + +val sentencer = new SentenceDetector() + .setInputCols("document") + .setOutputCol("sentences") + +val tokenizer = new Tokenizer() + .setInputCols("sentences") + .setOutputCol("tokens") + +val words_embedder = WordEmbeddingsModel() + .pretrained("embeddings_clinical", "en", "clinical/models") + .setInputCols(Array("sentences", "tokens")) + .setOutputCol("embeddings") + +val pos_tagger = PerceptronModel() + .pretrained("pos_clinical", "en", "clinical/models") + .setInputCols(Array("sentences", "tokens")) + .setOutputCol("pos_tags") + +val ner_tagger = MedicalNerModel() + .pretrained("jsl_ner_wip_greedy_clinical","en","clinical/models") + .setInputCols(Array("sentences", "tokens", "embeddings")) + .setOutputCol("ner_tags") + +val ner_chunker = new NerConverterInternal() + .setInputCols(Array("sentences", "tokens", "ner_tags")) + .setOutputCol("ner_chunks") + +val dependency_parser = DependencyParserModel() + .pretrained("dependency_conllu", "en") + .setInputCols(Array("sentences", "pos_tags", "tokens")) + .setOutputCol("dependencies") + +val pair_list = Array("direction-internal_organ_or_component", "internal_organ_or_component-direction") + +val re_model = RelationExtractionModel().pretrained("re_bodypart_directions","en","clinical/models") + .setInputCols(Array("embeddings", "pos_tags", "ner_chunks", "dependencies")) + .setOutputCol("relations") + .setMaxSyntacticDistance(4) + .setRelationPairs(pair_list) + +val nlpPipeline = new Pipeline().setStages(Array(documenter, sentencer, tokenizer, words_embedder, pos_tagger, ner_tagger, ner_chunker, dependency_parser, re_model)) + +val text = """ MRI demonstrated infarction in the upper brain stem , left cerebellum and right basil ganglia """ + +val data = Seq(text).toDS.toDF("text") + +val results = nlpPipeline.fit(data).transform(data) + +``` + + +
+ +## Results + +```bash +| index | relations | entity1 | entity1_begin | entity1_end | chunk1 | entity2 | entity2_begin | entity2_end | chunk2 | confidence | +|-------|-----------|-----------------------------|---------------|-------------|------------|-----------------------------|---------------|-------------|---------------|------------| +| 0 | 1 | Direction | 35 | 39 | upper | Internal_organ_or_component | 41 | 50 | brain stem | 0.9999989 | +| 1 | 0 | Direction | 35 | 39 | upper | Internal_organ_or_component | 59 | 68 | cerebellum | 0.99992585 | +| 2 | 0 | Direction | 35 | 39 | upper | Internal_organ_or_component | 81 | 93 | basil ganglia | 0.9999999 | +| 3 | 0 | Internal_organ_or_component | 41 | 50 | brain stem | Direction | 54 | 57 | left | 0.999811 | +| 4 | 0 | Internal_organ_or_component | 41 | 50 | brain stem | Direction | 75 | 79 | right | 0.9998203 | +| 5 | 1 | Direction | 54 | 57 | left | Internal_organ_or_component | 59 | 68 | cerebellum | 1.0 | +| 6 | 0 | Direction | 54 | 57 | left | Internal_organ_or_component | 81 | 93 | basil ganglia | 0.97616416 | +| 7 | 0 | Internal_organ_or_component | 59 | 68 | cerebellum | Direction | 75 | 79 | right | 0.953046 | +| 8 | 1 | Direction | 75 | 79 | right | Internal_organ_or_component | 81 | 93 | basil ganglia | 1.0 | +``` + +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|re_bodypart_directions| +|Type:|re| +|Compatibility:|Spark NLP 2.7.1+| +|License:|Licensed| +|Edition:|Official| +|Input Labels:|[embeddings, pos_tags, train_ner_chunks, dependencies]| +|Output Labels:|[relations]| +|Language:|en| +|Dependencies:|embeddings_clinical| + +## Data Source + +Trained on data gathered and manually annotated by John Snow Labs + +## Benchmarking + +```bash +label recall precision f1 +0 0.87 0.9 0.88 +1 0.99 0.99 0.99 +``` diff --git a/docs/_posts/aydinmyilmaz/2021-01-18-re_bodypart_problem_en.md b/docs/_posts/aydinmyilmaz/2021-01-18-re_bodypart_problem_en.md index 
1e1a6093db513..adf728c1f72cc 100644 --- a/docs/_posts/aydinmyilmaz/2021-01-18-re_bodypart_problem_en.md +++ b/docs/_posts/aydinmyilmaz/2021-01-18-re_bodypart_problem_en.md @@ -1,119 +1,177 @@ ---- -layout: model -title: Relation extraction between body parts and problem entities -author: John Snow Labs -name: re_bodypart_problem -date: 2021-01-18 -task: Relation Extraction -language: en -edition: Healthcare NLP 2.7.1 -spark_version: 2.4 -tags: [en, clinical, relation_extraction, licensed] -supported: true -article_header: - type: cover -use_language_switcher: "Python-Scala-Java" ---- - -## Description - -Relation extraction between body parts and problem entities in clinical texts. `1` : Shows that there is a relation between the body part entity and the entities labeled as problem ( diagnosis, symptom etc.), `0` : Shows that there no relation between the body part entity and the entities labeled as problem ( diagnosis, symptom etc.). - -## Predicted Entities - -`0`, `1` - -{:.btn-box} -[Live Demo](https://demo.johnsnowlabs.com/healthcare/RE_BODYPART_ENT/){:.button.button-orange} -[Open in Colab](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp-workshop/blob/master/tutorials/Certification_Trainings/Healthcare/10.Clinical_Relation_Extraction.ipynb){:.button.button-orange.button-orange-trans.co.button-icon} -[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/clinical/models/re_bodypart_problem_en_2.7.1_2.4_1610959377894.zip){:.button.button-orange.button-orange-trans.arr.button-icon} - -## How to use - -In the table below, `re_bodypart_problem` RE model, its labels, optimal NER model, and meaningful relation pairs are illustrated. - -| RE MODEL | RE MODELS LABES | NER MODEL | RE PAIRS | -|:-------------------:|:---------------:|:---------:|| -| re_bodypart_problem | 0,1 | ner_jsl | [“internal_organ_or_component-cerebrovascular_disease”,
“cerebrovascular_disease-internal_organ_or_component”,
“internal_organ_or_component-communicable_disease”,
“communicable_disease-internal_organ_or_component”,
“internal_organ_or_component-diabetes”,
“diabetes-internal_organ_or_component”,
“internal_organ_or_component-disease_syndrome_disorder”,
“disease_syndrome_disorder-internal_organ_or_component”,
“internal_organ_or_component-ekg_findings”,
“ekg_findings-internal_organ_or_component”,
“internal_organ_or_component-heart_disease”,
“heart_disease-internal_organ_or_component”,
“internal_organ_or_component-hyperlipidemia”,
“hyperlipidemia-internal_organ_or_component”,
“internal_organ_or_component-hypertension”,
“hypertension-internal_organ_or_component”,
“internal_organ_or_component-imagingfindings”,
“imagingfindings-internal_organ_or_component”,
“internal_organ_or_component-injury_or_poisoning”,
“injury_or_poisoning-internal_organ_or_component”,
“internal_organ_or_component-kidney_disease”,
“kidney_disease-internal_organ_or_component”,
“internal_organ_or_component-oncological”,
“oncological-internal_organ_or_component”,
“internal_organ_or_component-psychological_condition”,
“psychological_condition-internal_organ_or_component”,
“internal_organ_or_component-symptom”,
“symptom-internal_organ_or_component”,
“internal_organ_or_component-vs_finding”,
“vs_finding-internal_organ_or_component”,
“external_body_part_or_region-communicable_disease”,
“communicable_disease-external_body_part_or_region”,
“external_body_part_or_region-diabetes”,
“diabetes-external_body_part_or_region”,
“external_body_part_or_region-disease_syndrome_disorder”,
“disease_syndrome_disorder-external_body_part_or_region”,
“external_body_part_or_region-hypertension”,
“hypertension-external_body_part_or_region”,
“external_body_part_or_region-imagingfindings”,
“imagingfindings-external_body_part_or_region”,
“external_body_part_or_region-injury_or_poisoning”,
“injury_or_poisoning-external_body_part_or_region”,
“external_body_part_or_region-obesity”,
“obesity-external_body_part_or_region”,
“external_body_part_or_region-oncological”,
“oncological-external_body_part_or_region”,
“external_body_part_or_region-overweight”,
“overweight-external_body_part_or_region”,
“external_body_part_or_region-symptom”,
“symptom-external_body_part_or_region”,
“external_body_part_or_region-vs_finding”,
“vs_finding-external_body_part_or_region”] | - - - -
-{% include programmingLanguageSelectScalaPythonNLU.html %} -```python - -ner_tagger = sparknlp.annotators.NerDLModel()\ - .pretrained("jsl_ner_wip_greedy_clinical","en","clinical/models")\ - .setInputCols("sentences", "tokens", "embeddings")\ - .setOutputCol("ner_tags") - -reModel = RelationExtractionModel.pretrained("re_bodypart_problem","en","clinical/models")\ - .setInputCols(["word_embeddings","chunk","pos","dependency"])\ - .setOutput("relations") \ - .setRelationPairs(['symptom-external_body_part_or_region']) - -pipeline = Pipeline(stages=[documenter, sentencer, tokenizer, words_embedder, pos_tagger, ner_tagger, ner_chunker, dependency_parser, reModel) - -model = pipeline.fit(spark.createDataFrame([[""]]).toDF("text")) - -results = LightPipeline(model).fullAnnotate('''No neurologic deficits other than some numbness in his left hand.''') -``` - -```scala -... -val ner_tagger = sparknlp.annotators.NerDLModel() - .pretrained("jsl_ner_wip_greedy_clinical","en","clinical/models") - .setInputCols("sentences", "tokens", "embeddings") - .setOutputCol("ner_tags") - -val reModel = RelationExtractionModel().pretrained("re_bodypart_problem","en","clinical/models") - .setInputCols(Array("word_embeddings","chunk","pos","dependency")) - .setOutput("relations") - .setRelationPairs(Array('symptom-external_body_part_or_region')) - -val nlpPipeline = new Pipeline().setStages(Array(documenter, sentencer, tokenizer, words_embedder, pos_tagger, ner_tagger, ner_chunker, dependency_parser, reModel)) -val result = pipeline.fit(Seq.empty[String]).transform(data) - -val results = LightPipeline(model).fullAnnotate('''No neurologic deficits other than some numbness in his left hand.''') -``` - -
- -## Results - -```bash -| index | relations | entity1 | entity1_begin | entity1_end | chunk1 | entity2 | entity2_end | entity2_end | chunk2 | confidence | -|-------|-----------|---------|---------------|-------------|---------------------|------------------------------|-------------|-------------|--------|------------| -| 0 | 0 | Symptom | 3 | 21 | neurologic deficits | external_body_part_or_region | 60 | 63 | hand | 0.999998 | -| 1 | 1 | Symptom | 39 | 46 | numbness | external_body_part_or_region | 60 | 63 | hand | 1 | - -``` - -{:.model-param} -## Model Information - -{:.table-model} -|---|---| -|Model Name:|re_bodypart_problem| -|Type:|re| -|Compatibility:|Spark NLP 2.7.1+| -|License:|Licensed| -|Edition:|Official| -|Input Labels:|[embeddings, pos_tags, train_ner_chunks, dependencies]| -|Output Labels:|[relations]| -|Language:|en| -|Dependencies:|embeddings_clinical| - -## Data Source - -Trained on custom datasets annotated internally - -## Benchmarking - -```bash -| relation | recall | precision | -|----------|--------|-----------| -| 0 | 0.72 | 0.82 | -| 1 | 0.94 | 0.91 | - -``` +--- +layout: model +title: Relation extraction between body parts and problem entities +author: John Snow Labs +name: re_bodypart_problem +date: 2021-01-18 +task: Relation Extraction +language: en +edition: Spark NLP for Healthcare 2.7.1 +spark_version: 2.4 +tags: [en, clinical, relation_extraction, licensed] +supported: true +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +Relation extraction between body parts and problem entities in clinical texts. `1` : Shows that there is a relation between the body part entity and the entities labeled as problem ( diagnosis, symptom etc.), `0` : Shows that there no relation between the body part entity and the entities labeled as problem ( diagnosis, symptom etc.). 
+ +## Predicted Entities + +`0`, `1` + +{:.btn-box} +[Live Demo](https://demo.johnsnowlabs.com/healthcare/RE_BODYPART_ENT/){:.button.button-orange} +[Open in Colab](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp-workshop/blob/master/tutorials/Certification_Trainings/Healthcare/10.Clinical_Relation_Extraction.ipynb){:.button.button-orange.button-orange-trans.co.button-icon} +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/clinical/models/re_bodypart_problem_en_2.7.1_2.4_1610959377894.zip){:.button.button-orange.button-orange-trans.arr.button-icon} + +## How to use + +In the table below, `re_bodypart_problem` RE model, its labels, optimal NER model, and meaningful relation pairs are illustrated. + +| RE MODEL | RE MODEL LABELS | NER MODEL | RE PAIRS | +|:-------------------:|:---------------:|:---------:|:--------:| +| re_bodypart_problem | 0,1 | ner_jsl | [“internal_organ_or_component-cerebrovascular_disease”,
“cerebrovascular_disease-internal_organ_or_component”,
“internal_organ_or_component-communicable_disease”,
“communicable_disease-internal_organ_or_component”,
“internal_organ_or_component-diabetes”,
“diabetes-internal_organ_or_component”,
“internal_organ_or_component-disease_syndrome_disorder”,
“disease_syndrome_disorder-internal_organ_or_component”,
“internal_organ_or_component-ekg_findings”,
“ekg_findings-internal_organ_or_component”,
“internal_organ_or_component-heart_disease”,
“heart_disease-internal_organ_or_component”,
“internal_organ_or_component-hyperlipidemia”,
“hyperlipidemia-internal_organ_or_component”,
“internal_organ_or_component-hypertension”,
“hypertension-internal_organ_or_component”,
“internal_organ_or_component-imagingfindings”,
“imagingfindings-internal_organ_or_component”,
“internal_organ_or_component-injury_or_poisoning”,
“injury_or_poisoning-internal_organ_or_component”,
“internal_organ_or_component-kidney_disease”,
“kidney_disease-internal_organ_or_component”,
“internal_organ_or_component-oncological”,
“oncological-internal_organ_or_component”,
“internal_organ_or_component-psychological_condition”,
“psychological_condition-internal_organ_or_component”,
“internal_organ_or_component-symptom”,
“symptom-internal_organ_or_component”,
“internal_organ_or_component-vs_finding”,
“vs_finding-internal_organ_or_component”,
“external_body_part_or_region-communicable_disease”,
“communicable_disease-external_body_part_or_region”,
“external_body_part_or_region-diabetes”,
“diabetes-external_body_part_or_region”,
“external_body_part_or_region-disease_syndrome_disorder”,
“disease_syndrome_disorder-external_body_part_or_region”,
“external_body_part_or_region-hypertension”,
“hypertension-external_body_part_or_region”,
“external_body_part_or_region-imagingfindings”,
“imagingfindings-external_body_part_or_region”,
“external_body_part_or_region-injury_or_poisoning”,
“injury_or_poisoning-external_body_part_or_region”,
“external_body_part_or_region-obesity”,
“obesity-external_body_part_or_region”,
“external_body_part_or_region-oncological”,
“oncological-external_body_part_or_region”,
“external_body_part_or_region-overweight”,
“overweight-external_body_part_or_region”,
“external_body_part_or_region-symptom”,
“symptom-external_body_part_or_region”,
“external_body_part_or_region-vs_finding”,
“vs_finding-external_body_part_or_region”] | + + + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} + +```python +documenter = DocumentAssembler()\ + .setInputCol("text")\ + .setOutputCol("document") + +sentencer = SentenceDetector()\ + .setInputCols(["document"])\ + .setOutputCol("sentences") + +tokenizer = Tokenizer()\ + .setInputCols(["sentences"])\ + .setOutputCol("tokens") + +word_embeddings = WordEmbeddingsModel.pretrained("embeddings_clinical", "en", "clinical/models")\ + .setInputCols(["sentences", "tokens"])\ + .setOutputCol("embeddings") + +pos_tagger = PerceptronModel()\ + .pretrained("pos_clinical", "en", "clinical/models") \ + .setInputCols(["sentences", "tokens"])\ + .setOutputCol("pos_tags") + +ner_tagger = MedicalNerModel()\ + .pretrained("jsl_ner_wip_greedy_clinical","en","clinical/models")\ + .setInputCols("sentences", "tokens", "embeddings")\ + .setOutputCol("ner_tags") + +ner_chunker = NerConverterInternal()\ + .setInputCols(["sentences", "tokens", "ner_tags"])\ + .setOutputCol("ner_chunks") + +dependency_parser = DependencyParserModel()\ + .pretrained("dependency_conllu", "en")\ + .setInputCols(["sentences", "pos_tags", "tokens"])\ + .setOutputCol("dependencies") + +reModel = RelationExtractionModel.pretrained("re_bodypart_problem","en","clinical/models")\ + .setInputCols(["embeddings","ner_chunks","pos_tags","dependencies"])\ + .setOutputCol("relations") \ + .setRelationPairs(['symptom-external_body_part_or_region']) + +pipeline = Pipeline(stages=[documenter, sentencer, tokenizer, word_embeddings, pos_tagger, ner_tagger, ner_chunker, dependency_parser, reModel]) + +model = pipeline.fit(spark.createDataFrame([[""]]).toDF("text")) + +results = LightPipeline(model).fullAnnotate('''No neurologic deficits other than some numbness in his left hand.''') +``` + +```scala +val documenter = new DocumentAssembler() + .setInputCol("text") + .setOutputCol("document") + +val sentencer = new SentenceDetector() + .setInputCols("document") + .setOutputCol("sentences") + +val tokenizer = new 
Tokenizer() + .setInputCols("sentences") + .setOutputCol("tokens") + +val word_embeddings = WordEmbeddingsModel.pretrained("embeddings_clinical", "en", "clinical/models") + .setInputCols(Array("sentences", "tokens")) + .setOutputCol("embeddings") + +val pos_tagger = PerceptronModel() + .pretrained("pos_clinical", "en", "clinical/models") + .setInputCols(Array("sentences", "tokens")) + .setOutputCol("pos_tags") + +val ner_tagger = MedicalNerModel() + .pretrained("jsl_ner_wip_greedy_clinical","en","clinical/models") + .setInputCols(Array("sentences", "tokens", "embeddings")) + .setOutputCol("ner_tags") + +val ner_chunker = new NerConverterInternal() + .setInputCols(Array("sentences", "tokens", "ner_tags")) + .setOutputCol("ner_chunks") + +val dependency_parser = DependencyParserModel() + .pretrained("dependency_conllu", "en") + .setInputCols(Array("sentences", "pos_tags", "tokens")) + .setOutputCol("dependencies") + +val reModel = RelationExtractionModel().pretrained("re_bodypart_problem","en","clinical/models") + .setInputCols(Array("embeddings","ner_chunks","pos_tags","dependencies")) + .setOutputCol("relations") + .setRelationPairs(Array("symptom-external_body_part_or_region")) + +val nlpPipeline = new Pipeline().setStages(Array(documenter, sentencer, tokenizer, word_embeddings, pos_tagger, ner_tagger, ner_chunker, dependency_parser, reModel)) + +val data = Seq("""No neurologic deficits other than some numbness in his left hand.""").toDS.toDF("text") + +val results = nlpPipeline.fit(data).transform(data) +``` + +
+ +## Results + +```bash +| index | relations | entity1 | entity1_begin | entity1_end | chunk1 | entity2 | entity2_begin | entity2_end | chunk2 | confidence | +|-------|-----------|---------|---------------|-------------|---------------------|------------------------------|---------------|-------------|--------|------------| +| 0 | 0 | Symptom | 3 | 21 | neurologic deficits | external_body_part_or_region | 60 | 63 | hand | 0.999998 | +| 1 | 1 | Symptom | 39 | 46 | numbness | external_body_part_or_region | 60 | 63 | hand | 1 | + +``` + +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|re_bodypart_problem| +|Type:|re| +|Compatibility:|Spark NLP 2.7.1+| +|License:|Licensed| +|Edition:|Official| +|Input Labels:|[embeddings, pos_tags, train_ner_chunks, dependencies]| +|Output Labels:|[relations]| +|Language:|en| +|Dependencies:|embeddings_clinical| + +## Data Source + +Trained on custom datasets annotated internally + +## Benchmarking + +```bash +label recall precision +0 0.72 0.82 +1 0.94 0.91 +``` diff --git a/docs/_posts/aydinmyilmaz/2021-01-18-re_bodypart_proceduretest_en.md b/docs/_posts/aydinmyilmaz/2021-01-18-re_bodypart_proceduretest_en.md index 0a3c6583489ee..84ad8aac46aad 100644 --- a/docs/_posts/aydinmyilmaz/2021-01-18-re_bodypart_proceduretest_en.md +++ b/docs/_posts/aydinmyilmaz/2021-01-18-re_bodypart_proceduretest_en.md @@ -1,126 +1,187 @@ ---- -layout: model -title: Relation extraction between body parts and procedures -author: John Snow Labs -name: re_bodypart_proceduretest -date: 2021-01-18 -task: Relation Extraction -language: en -edition: Healthcare NLP 2.7.1 -spark_version: 2.4 -tags: [en, relation_extraction, clinical, licensed] -supported: true -article_header: - type: cover -use_language_switcher: "Python-Scala-Java" ---- - -## Description - -Relation extraction between body parts entites ['Internal_organ_or_component','External_body_part_or_region'] and procedure and test entities - -## Predicted Entities - -`0`, `1` 
- -{:.btn-box} -[Live Demo](https://demo.johnsnowlabs.com/healthcare/RE_BODYPART_ENT/){:.button.button-orange} -[Open in Colab](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp-workshop/blob/master/tutorials/Certification_Trainings/Healthcare/10.Clinical_Relation_Extraction.ipynb#scrollTo=D8TtVuN-Ee8s){:.button.button-orange.button-orange-trans.co.button-icon} -[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/clinical/models/re_bodypart_proceduretest_en_2.7.1_2.4_1610989267602.zip){:.button.button-orange.button-orange-trans.arr.button-icon} - -## How to use - -In the table below, `re_bodypart_proceduretest` RE model, its labels, optimal NER model, and meaningful relation pairs are illustrated. - - - -| RE MODEL | RE MODEL LABES | NER MODEL | RE PAIRS | -|:-------------------------:|:--------------:|:---------:|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| -| re_bodypart_proceduretest | 0,1 | ner_jsl | [“external_body_part_or_region-test”,
“test-external_body_part_or_region”,
“internal_organ_or_component-test”,
“test-internal_organ_or_component”,
“external_body_part_or_region-procedure”,
“procedure-external_body_part_or_region”,
“procedure-internal_organ_or_component”,
“internal_organ_or_component-procedure”] | - - - - -Use as part of an nlp pipeline with the following stages: DocumentAssembler, SentenceDetector, Tokenizer, PerceptronModel, DependencyParserModel, WordEmbeddingsModel, NerDLModel, NerConverter, RelationExtractionModel. - -
-{% include programmingLanguageSelectScalaPythonNLU.html %} -```python - -ner_tagger = sparknlp.annotators.NerDLModel()\ - .pretrained("jsl_ner_wip_greedy_clinical","en","clinical/models")\ - .setInputCols("sentences", "tokens", "embeddings")\ - .setOutputCol("ner_tags") - -re_model = RelationExtractionModel()\ - .pretrained("re_bodypart_proceduretest", "en", 'clinical/models')\ - .setInputCols(["embeddings", "pos_tags", "ner_chunks", "dependencies"])\ - .setOutputCol("relations")\ - .setMaxSyntacticDistance(4)\ #default: 0 - .setPredictionThreshold(0.9)\ #default: 0.5 - .setRelationPairs(["external_body_part_or_region-test"]) # Possible relation pairs. Default: All Relations. - -nlp_pipeline = Pipeline(stages=[ documenter, sentencer,tokenizer, words_embedder, pos_tagger, clinical_ner_tagger,ner_chunker, dependency_parser,re_model]) - -light_pipeline = LightPipeline(nlp_pipeline.fit(spark.createDataFrame([['']]).toDF("text"))) - -annotations = light_pipeline.fullAnnotate(''''TECHNIQUE IN DETAIL: After informed consent was obtained from the patient and his mother, the chest was scanned with portable ultrasound.'''') -``` - -```scala -... -val ner_tagger = sparknlp.annotators.NerDLModel().pretrained("jsl_ner_wip_greedy_clinical","en","clinical/models") - .setInputCols("sentences", "tokens", "embeddings") - .setOutputCol("ner_tags") - -val re_model = RelationExtractionModel().pretrained("re_bodypart_proceduretest", "en", 'clinical/models') - .setInputCols(Array("embeddings", "pos_tags", "ner_chunks", "dependencies")) - .setOutputCol("relations") - .setMaxSyntacticDistance(4) #default: 0 - .setPredictionThreshold(0.9) #default: 0.5 - .setRelationPairs(Array("external_body_part_or_region-test")) # Possible relation pairs. Default: All Relations. 
- -val nlpPipeline = new Pipeline().setStages(Array(documenter, sentencer,tokenizer, words_embedder, pos_tagger, clinical_ner_tagger,ner_chunker, dependency_parser,re_model)) -val result = pipeline.fit(Seq.empty[String]).transform(data) - -val annotations = light_pipeline.fullAnnotate(''''TECHNIQUE IN DETAIL: After informed consent was obtained from the patient and his mother, the chest was scanned with portable ultrasound.'''') -``` - -
- -## Results - -```bash -| index | relations | entity1 | entity1_begin | entity1_end | chunk1 | entity2 | entity2_end | entity2_end | chunk2 | confidence | -|-------|-----------|------------------------------|---------------|-------------|--------|---------|-------------|-------------|---------------------|------------| -| 0 | 1 | External_body_part_or_region | 94 | 98 | chest | Test | 117 | 135 | portable ultrasound | 1.0 | -``` - -{:.model-param} -## Model Information - -{:.table-model} -|---|---| -|Model Name:|re_bodypart_proceduretest| -|Type:|re| -|Compatibility:|Spark NLP 2.7.1+| -|License:|Licensed| -|Edition:|Official| -|Input Labels:|[embeddings, pos_tags, train_ner_chunks, dependencies]| -|Output Labels:|[relations]| -|Language:|en| -|Dependencies:|embeddings_clinical| - -## Data Source - -Trained on data gathered and manually annotated by John Snow Labs - -## Benchmarking - -```bash -| relation | recall | precision | f1 | -|----------|--------|-----------|------| -| 0 | 0.55 | 0.35 | 0.43 | -| 1 | 0.73 | 0.86 | 0.79 | - -``` +--- +layout: model +title: Relation extraction between body parts and procedures +author: John Snow Labs +name: re_bodypart_proceduretest +date: 2021-01-18 +task: Relation Extraction +language: en +edition: Spark NLP for Healthcare 2.7.1 +spark_version: 2.4 +tags: [en, relation_extraction, clinical, licensed] +supported: true +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +Relation extraction between body parts entities ['Internal_organ_or_component','External_body_part_or_region'] and procedure and test entities + +## Predicted Entities + +`0`, `1` + +{:.btn-box} +[Live Demo](https://demo.johnsnowlabs.com/healthcare/RE_BODYPART_ENT/){:.button.button-orange} +[Open in 
Colab](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp-workshop/blob/master/tutorials/Certification_Trainings/Healthcare/10.Clinical_Relation_Extraction.ipynb#scrollTo=D8TtVuN-Ee8s){:.button.button-orange.button-orange-trans.co.button-icon} +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/clinical/models/re_bodypart_proceduretest_en_2.7.1_2.4_1610989267602.zip){:.button.button-orange.button-orange-trans.arr.button-icon} + +## How to use + +In the table below, `re_bodypart_proceduretest` RE model, its labels, optimal NER model, and meaningful relation pairs are illustrated. + + + +| RE MODEL | RE MODEL LABES | NER MODEL | RE PAIRS | +|:-------------------------:|:--------------:|:---------:|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| re_bodypart_proceduretest | 0,1 | ner_jsl | [“external_body_part_or_region-test”,
“test-external_body_part_or_region”,
“internal_organ_or_component-test”,
“test-internal_organ_or_component”,
“external_body_part_or_region-procedure”,
“procedure-external_body_part_or_region”,
“procedure-internal_organ_or_component”,
“internal_organ_or_component-procedure”] | + + + + +Use as part of an nlp pipeline with the following stages: DocumentAssembler, SentenceDetector, Tokenizer, PerceptronModel, DependencyParserModel, WordEmbeddingsModel, NerDLModel, NerConverter, RelationExtractionModel. + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} + +```python +documenter = DocumentAssembler()\ + .setInputCol("text")\ + .setOutputCol("document") + +sentencer = SentenceDetector()\ + .setInputCols(["document"])\ + .setOutputCol("sentences") + +tokenizer = Tokenizer()\ + .setInputCols(["sentences"])\ + .setOutputCol("tokens") + +word_embeddings = WordEmbeddingsModel.pretrained("embeddings_clinical", "en", "clinical/models")\ + .setInputCols(["sentences", "tokens"])\ + .setOutputCol("embeddings") + +pos_tagger = PerceptronModel()\ + .pretrained("pos_clinical", "en", "clinical/models") \ + .setInputCols(["sentences", "tokens"])\ + .setOutputCol("pos_tags") + +ner_tagger = MedicalNerModel()\ + .pretrained("jsl_ner_wip_greedy_clinical","en","clinical/models")\ + .setInputCols("sentences", "tokens", "embeddings")\ + .setOutputCol("ner_tags") + +ner_chunker = NerConverterInternal()\ + .setInputCols(["sentences", "tokens", "ner_tags"])\ + .setOutputCol("ner_chunks") + +dependency_parser = DependencyParserModel()\ + .pretrained("dependency_conllu", "en")\ + .setInputCols(["sentences", "pos_tags", "tokens"])\ + .setOutputCol("dependencies") + +re_model = RelationExtractionModel()\ + .pretrained("re_bodypart_proceduretest", "en", "clinical/models")\ + .setInputCols(["embeddings", "pos_tags", "ner_chunks", "dependencies"])\ + .setOutputCol("relations")\ + .setMaxSyntacticDistance(4)\ + .setPredictionThreshold(0.9)\ + .setRelationPairs(["external_body_part_or_region-test"]) # Possible relation pairs. Default: All Relations. 
+ +nlp_pipeline = Pipeline(stages=[documenter, sentencer,tokenizer, word_embeddings, pos_tagger, ner_tagger, ner_chunker, dependency_parser, re_model]) + +light_pipeline = LightPipeline(nlp_pipeline.fit(spark.createDataFrame([['']]).toDF("text"))) + +annotations = light_pipeline.fullAnnotate('''TECHNIQUE IN DETAIL: After informed consent was obtained from the patient and his mother, the chest was scanned with portable ultrasound.''') +``` + +```scala + +val documenter = new DocumentAssembler() + .setInputCol("text") + .setOutputCol("document") + +val sentencer = new SentenceDetector() + .setInputCols("document") + .setOutputCol("sentences") + +val tokenizer = new Tokenizer() + .setInputCols("sentences") + .setOutputCol("tokens") + +val word_embeddings = WordEmbeddingsModel.pretrained("embeddings_clinical", "en", "clinical/models") + .setInputCols(Array("sentences", "tokens")) + .setOutputCol("embeddings") + +val pos_tagger = PerceptronModel + .pretrained("pos_clinical", "en", "clinical/models") + .setInputCols(Array("sentences", "tokens")) + .setOutputCol("pos_tags") + +val ner_tagger = MedicalNerModel.pretrained("jsl_ner_wip_greedy_clinical","en","clinical/models") + .setInputCols(Array("sentences", "tokens", "embeddings")) + .setOutputCol("ner_tags") + +val ner_chunker = new NerConverterInternal() + .setInputCols(Array("sentences", "tokens", "ner_tags")) + .setOutputCol("ner_chunks") + +val dependency_parser = DependencyParserModel + .pretrained("dependency_conllu", "en") + .setInputCols(Array("sentences", "pos_tags", "tokens")) + .setOutputCol("dependencies") + +val re_model = RelationExtractionModel.pretrained("re_bodypart_proceduretest", "en", "clinical/models") + .setInputCols(Array("embeddings", "pos_tags", "ner_chunks", "dependencies")) + .setOutputCol("relations") + .setMaxSyntacticDistance(4) // default: 0 + .setPredictionThreshold(0.9) // default: 0.5 + .setRelationPairs(Array("external_body_part_or_region-test")) // Possible relation pairs. 
Default: All Relations. + +val nlpPipeline = new Pipeline().setStages(Array(documenter, sentencer,tokenizer, word_embeddings, pos_tagger, ner_tagger, ner_chunker, dependency_parser, re_model)) + +val light_pipeline = new LightPipeline(nlpPipeline.fit(Seq.empty[String].toDF("text"))) + +val annotations = light_pipeline.fullAnnotate("""TECHNIQUE IN DETAIL: After informed consent was obtained from the patient and his mother, the chest was scanned with portable ultrasound.""") +``` + +
+ +## Results + +```bash +| index | relations | entity1 | entity1_begin | entity1_end | chunk1 | entity2 | entity2_begin | entity2_end | chunk2 | confidence | +|-------|-----------|------------------------------|---------------|-------------|--------|---------|-------------|-------------|---------------------|------------| +| 0 | 1 | External_body_part_or_region | 94 | 98 | chest | Test | 117 | 135 | portable ultrasound | 1.0 | +``` + +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|re_bodypart_proceduretest| +|Type:|re| +|Compatibility:|Spark NLP 2.7.1+| +|License:|Licensed| +|Edition:|Official| +|Input Labels:|[embeddings, pos_tags, train_ner_chunks, dependencies]| +|Output Labels:|[relations]| +|Language:|en| +|Dependencies:|embeddings_clinical| + +## Data Source + +Trained on data gathered and manually annotated by John Snow Labs + +## Benchmarking + +```bash +label recall precision f1 +0 0.55 0.35 0.43 +1 0.73 0.86 0.79 + +``` + diff --git a/docs/_posts/aydinmyilmaz/2021-01-18-re_date_clinical_en.md b/docs/_posts/aydinmyilmaz/2021-01-18-re_date_clinical_en.md index 742f8eaba3578..066f8adf50f71 100644 --- a/docs/_posts/aydinmyilmaz/2021-01-18-re_date_clinical_en.md +++ b/docs/_posts/aydinmyilmaz/2021-01-18-re_date_clinical_en.md @@ -1,120 +1,180 @@ ---- -layout: model -title: Relation extraction between dates and clinical entities -author: John Snow Labs -name: re_date_clinical -date: 2021-01-18 -task: Relation Extraction -language: en -edition: Healthcare NLP 2.7.1 -spark_version: 2.4 -tags: [en, relation_extraction, clinical, licensed] -supported: true -article_header: - type: cover -use_language_switcher: "Python-Scala-Java" ---- - -## Description - -Relation extraction between date and related other entities. `1` : Shows there is a relation between the date entity and other clinical entities, `0` : Shows there is no relation between the date entity and other clinical entities. 
- -## Predicted Entities - -`0`, `1` - -{:.btn-box} - -[Open in Colab](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp-workshop/blob/master/tutorials/Certification_Trainings/Healthcare/10.Clinical_Relation_Extraction.ipynb){:.button.button-orange.button-orange-trans.co.button-icon} -[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/clinical/models/re_date_clinical_en_2.7.1_2.4_1611000334654.zip){:.button.button-orange.button-orange-trans.arr.button-icon} - -## How to use - -In the table below, `re_date_clinical` RE model, its labels, optimal NER model, and meaningful relation pairs are illustrated. - -| RE MODEL | RE MODEL LABES | NER MODEL | RE PAIRS | -|:----------------:|:--------------:|:---------:|-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| -| re_date_clinical | 0,1 | ner_jsl | [“date-admission_discharge”,
“admission_discharge-date”,
“date-alcohol”,
“alcohol-date”,
“date-allergen”,
“allergen-date”,
“date-bmi”,
“bmi-date”,
“date-birth_entity”,
“birth_entity-date”,
“date-blood_pressure”,
“blood_pressure-date”,
“date-cerebrovascular_disease”,
“cerebrovascular_disease-date”,
“date-clinical_dept”,
“clinical_dept-date”,
“date-communicable_disease”,
“communicable_disease-date”,
“date-death_entity”,
“death_entity-date”,
“date-diabetes”,
“diabetes-date”,
“date-diet”,
“diet-date”,
“date-disease_syndrome_disorder”,
“disease_syndrome_disorder-date”,
“date-drug_brandname”,
“drug_brandname-date”,
“date-drug_ingredient”,
“drug_ingredient-date”,
“date-ekg_findings”,
“ekg_findings-date”,
“date-external_body_part_or_region”,
“external_body_part_or_region-date”,
“date-fetus_newborn”,
“fetus_newborn-date”,
“date-hdl”,
“hdl-date”,
“date-heart_disease”,
“heart_disease-date”,
“date-height”,
“height-date”,
“date-hyperlipidemia”,
“hyperlipidemia-date”,
“date-hypertension”,
“hypertension-date”,
“date-imagingfindings”,
“imagingfindings-date”,
“date-imaging_technique”,
“imaging_technique-date”,
“date-injury_or_poisoning”,
“injury_or_poisoning-date”,
“date-internal_organ_or_component”,
“internal_organ_or_component-date”,
“date-kidney_disease”,
“kidney_disease-date”,
“date-ldl”,
“ldl-date”,
“date-modifier”,
“modifier-date”,
“date-o2_saturation”,
“o2_saturation-date”,
“date-obesity”,
“obesity-date”,
“date-oncological”,
“oncological-date”,
“date-overweight”,
“overweight-date”,
“date-oxygen_therapy”,
“oxygen_therapy-date”,
“date-pregnancy”,
“pregnancy-date”,
“date-procedure”,
“procedure-date”,
“date-psychological_condition”,
“psychological_condition-date”,
“date-pulse”,
“pulse-date”,
“date-respiration”,
“respiration-date”,
“date-smoking”,
“smoking-date”,
“date-substance”,
“substance-date”,
“date-substance_quantity”,
“substance_quantity-date”,
“date-symptom”,
“symptom-date”,
“date-temperature”,
“temperature-date”,
“date-test”,
“test-date”,
“date-test_result”,
“test_result-date”,
“date-total_cholesterol”,
“total_cholesterol-date”,
“date-treatment”,
“treatment-date”,
“date-triglycerides”,
“triglycerides-date”,
“date-vs_finding”,
“vs_finding-date”,
“date-vaccine”,
“vaccine-date”,
“date-vital_signs_header”,
“vital_signs_header-date”,
“date-weight”,
“weight-date”,
“time-admission_discharge”,
“admission_discharge-time”,
“time-alcohol”,
“alcohol-time”,
“time-allergen”,
“allergen-time”,
“time-bmi”,
“bmi-time”,
“time-birth_entity”,
“birth_entity-time”,
“time-blood_pressure”,
“blood_pressure-time”,
“time-cerebrovascular_disease”,
“cerebrovascular_disease-time”,
“time-clinical_dept”,
“clinical_dept-time”,
“time-communicable_disease”,
“communicable_disease-time”,
“time-death_entity”,
“death_entity-time”,
“time-diabetes”,
“diabetes-time”,
“time-diet”,
“diet-time”,
“time-disease_syndrome_disorder”,
“disease_syndrome_disorder-time”,
“time-drug_brandname”,
“drug_brandname-time”,
“time-drug_ingredient”,
“drug_ingredient-time”,
“time-ekg_findings”,
“ekg_findings-time”,
“time-external_body_part_or_region”,
“external_body_part_or_region-time”,
“time-fetus_newborn”,
“fetus_newborn-time”,
“time-hdl”,
“hdl-time”,
“time-heart_disease”,
“heart_disease-time”,
“time-height”,
“height-time”,
“time-hyperlipidemia”,
“hyperlipidemia-time”,
“time-hypertension”,
“hypertension-time”,
“time-imagingfindings”,
“imagingfindings-time”,
“time-imaging_technique”,
“imaging_technique-time”,
“time-injury_or_poisoning”,
“injury_or_poisoning-time”,
“time-internal_organ_or_component”,
“internal_organ_or_component-time”,
“time-kidney_disease”,
“kidney_disease-time”,
“time-ldl”,
“ldl-time”,
“time-modifier”,
“modifier-time”,
“time-o2_saturation”,
“o2_saturation-time”,
“time-obesity”,
“obesity-time”,
“time-oncological”,
“oncological-time”,
“time-overweight”,
“overweight-time”,
“time-oxygen_therapy”,
“oxygen_therapy-time”,
“time-pregnancy”,
“pregnancy-time”,
“time-procedure”,
“procedure-time”,
“time-psychological_condition”,
“psychological_condition-time”,
“time-pulse”,
“pulse-time”,
“time-respiration”,
“respiration-time”,
“time-smoking”,
“smoking-time”,
“time-substance”,
“substance-time”,
“time-substance_quantity”,
“substance_quantity-time”,
“time-symptom”,
“symptom-time”,
“time-temperature”,
“temperature-time”,
“time-test”,
“test-time”,
“time-test_result”,
“test_result-time”,
“time-total_cholesterol”,
“total_cholesterol-time”,
“time-treatment”,
“treatment-time”,
“time-triglycerides”,
“triglycerides-time”,
“time-vs_finding”,
“vs_finding-time”,
“time-vaccine”,
“vaccine-time”,
“time-vital_signs_header”,
“vital_signs_header-time”,
“time-weight”,
“weight-time”,
“relativedate-admission_discharge”,
“admission_discharge-relativedate”,
“relativedate-alcohol”,
“alcohol-relativedate”,
“relativedate-allergen”,
“allergen-relativedate”,
“relativedate-bmi”,
“bmi-relativedate”,
“relativedate-birth_entity”,
“birth_entity-relativedate”,
“relativedate-blood_pressure”,
“blood_pressure-relativedate”,
“relativedate-cerebrovascular_disease”,
“cerebrovascular_disease-relativedate”,
“relativedate-clinical_dept”,
“clinical_dept-relativedate”,
“relativedate-communicable_disease”,
“communicable_disease-relativedate”,
“relativedate-death_entity”,
“death_entity-relativedate”,
“relativedate-diabetes”,
“diabetes-relativedate”,
“relativedate-diet”,
“diet-relativedate”,
“relativedate-disease_syndrome_disorder”,
“disease_syndrome_disorder-relativedate”,
“relativedate-drug_brandname”,
“drug_brandname-relativedate”,
“relativedate-drug_ingredient”,
“drug_ingredient-relativedate”,
“relativedate-ekg_findings”,
“ekg_findings-relativedate”,
“relativedate-external_body_part_or_region”,
“external_body_part_or_region-relativedate”,
“relativedate-fetus_newborn”,
“fetus_newborn-relativedate”,
“relativedate-hdl”,
“hdl-relativedate”,
“relativedate-heart_disease”,
“heart_disease-relativedate”,
“relativedate-height”,
“height-relativedate”,
“relativedate-hyperlipidemia”,
“hyperlipidemia-relativedate”,
“relativedate-hypertension”,
“hypertension-relativedate”,
“relativedate-imagingfindings”,
“imagingfindings-relativedate”,
“relativedate-imaging_technique”,
“imaging_technique-relativedate”,
“relativedate-injury_or_poisoning”,
“injury_or_poisoning-relativedate”,
“relativedate-internal_organ_or_component”,
“internal_organ_or_component-relativedate”,
“relativedate-kidney_disease”,
“kidney_disease-relativedate”,
“relativedate-ldl”,
“ldl-relativedate”,
“relativedate-modifier”,
“modifier-relativedate”,
“relativedate-o2_saturation”,
“o2_saturation-relativedate”,
“relativedate-obesity”,
“obesity-relativedate”,
“relativedate-oncological”,
“oncological-relativedate”,
“relativedate-overweight”,
“overweight-relativedate”,
“relativedate-oxygen_therapy”,
“oxygen_therapy-relativedate”,
“relativedate-pregnancy”,
“pregnancy-relativedate”,
“relativedate-procedure”,
“procedure-relativedate”,
“relativedate-psychological_condition”,
“psychological_condition-relativedate”,
“relativedate-pulse”,
“pulse-relativedate”,
“relativedate-respiration”,
“respiration-relativedate”,
“relativedate-smoking”,
“smoking-relativedate”,
“relativedate-substance”,
“substance-relativedate”,
“relativedate-substance_quantity”,
“substance_quantity-relativedate”,
“relativedate-symptom”,
“symptom-relativedate”,
“relativedate-temperature”,
“temperature-relativedate”,
“relativedate-test”,
“test-relativedate”,
“relativedate-test_result”,
“test_result-relativedate”,
“relativedate-total_cholesterol”,
“total_cholesterol-relativedate”,
“relativedate-treatment”,
“treatment-relativedate”,
“relativedate-triglycerides”,
“triglycerides-relativedate”,
“relativedate-vs_finding”,
“vs_finding-relativedate”,
“relativedate-vaccine”,
“vaccine-relativedate”,
“relativedate-vital_signs_header”,
“vital_signs_header-relativedate”,
“relativedate-weight”,
“weight-relativedate”,
“relativetime-admission_discharge”,
“admission_discharge-relativetime”,
“relativetime-alcohol”,
“alcohol-relativetime”,
“relativetime-allergen”,
“allergen-relativetime”,
“relativetime-bmi”,
“bmi-relativetime”,
“relativetime-birth_entity”,
“birth_entity-relativetime”,
“relativetime-blood_pressure”,
“blood_pressure-relativetime”,
“relativetime-cerebrovascular_disease”,
“cerebrovascular_disease-relativetime”,
“relativetime-clinical_dept”,
“clinical_dept-relativetime”,
“relativetime-communicable_disease”,
“communicable_disease-relativetime”,
“relativetime-death_entity”,
“death_entity-relativetime”,
“relativetime-diabetes”,
“diabetes-relativetime”,
“relativetime-diet”,
“diet-relativetime”,
“relativetime-disease_syndrome_disorder”,
“disease_syndrome_disorder-relativetime”,
“relativetime-drug_brandname”,
“drug_brandname-relativetime”,
“relativetime-drug_ingredient”,
“drug_ingredient-relativetime”,
“relativetime-ekg_findings”,
“ekg_findings-relativetime”,
“relativetime-external_body_part_or_region”,
“external_body_part_or_region-relativetime”,
“relativetime-fetus_newborn”,
“fetus_newborn-relativetime”,
“relativetime-hdl”,
“hdl-relativetime”,
“relativetime-heart_disease”,
“heart_disease-relativetime”,
“relativetime-height”,
“height-relativetime”,
“relativetime-hyperlipidemia”,
“hyperlipidemia-relativetime”,
“relativetime-hypertension”,
“hypertension-relativetime”,
“relativetime-imagingfindings”,
“imagingfindings-relativetime”,
“relativetime-imaging_technique”,
“imaging_technique-relativetime”,
“relativetime-injury_or_poisoning”,
“injury_or_poisoning-relativetime”,
“relativetime-internal_organ_or_component”,
“internal_organ_or_component-relativetime”,
“relativetime-kidney_disease”,
“kidney_disease-relativetime”,
“relativetime-ldl”,
“ldl-relativetime”,
“relativetime-modifier”,
“modifier-relativetime”,
“relativetime-o2_saturation”,
“o2_saturation-relativetime”,
“relativetime-obesity”,
“obesity-relativetime”,
“relativetime-oncological”,
“oncological-relativetime”,
“relativetime-overweight”,
“overweight-relativetime”,
“relativetime-oxygen_therapy”,
“oxygen_therapy-relativetime”,
“relativetime-pregnancy”,
“pregnancy-relativetime”,
“relativetime-procedure”,
“procedure-relativetime”,
“relativetime-psychological_condition”,
“psychological_condition-relativetime”,
“relativetime-pulse”,
“pulse-relativetime”,
“relativetime-respiration”,
“respiration-relativetime”,
“relativetime-smoking”,
“smoking-relativetime”,
“relativetime-substance”,
“substance-relativetime”,
“relativetime-substance_quantity”,
“substance_quantity-relativetime”,
“relativetime-symptom”,
“symptom-relativetime”,
“relativetime-temperature”,
“temperature-relativetime”,
“relativetime-test”,
“test-relativetime”,
“relativetime-test_result”,
“test_result-relativetime”,
“relativetime-total_cholesterol”,
“total_cholesterol-relativetime”,
“relativetime-treatment”,
“treatment-relativetime”,
“relativetime-triglycerides”,
“triglycerides-relativetime”,
“relativetime-vs_finding”,
“vs_finding-relativetime”,
“relativetime-vaccine”,
“vaccine-relativetime”,
“relativetime-vital_signs_header”,
“vital_signs_header-relativetime”,
“relativetime-weight”,
“weight-relativetime”] | - - -Use as part of an nlp pipeline with the following stages: DocumentAssembler, SentenceDetector, Tokenizer, PerceptronModel, DependencyParserModel, WordEmbeddingsModel, NerDLModel, NerConverter, RelationExtractionModel. - -
-{% include programmingLanguageSelectScalaPythonNLU.html %} -```python -ner_tagger = sparknlp.annotators.NerDLModel().pretrained("jsl_ner_wip_greedy_clinical","en","clinical/models")\ - .setInputCols("sentences", "tokens", "embeddings")\ - .setOutputCol("ner_tags") - -re_model = RelationExtractionModel() -.pretrained("re_date", "en", 'clinical/models') -.setInputCols(["embeddings", "pos_tags", "ner_chunks", "dependencies"]) -.setOutputCol("relations") -.setMaxSyntacticDistance(3)\ #default: 0 .setPredictionThreshold(0.9)\ #default: 0.5 .setRelationPairs(["test-date", "symptom-date"]) # Possible relation pairs. Default: All Relations. - -nlp_pipeline = Pipeline(stages=[documenter, sentencer,tokenizer, words_embedder, pos_tagger, ner_tagger, ner_chunker, dependency_parser,re_model]) - -light_pipeline = LightPipeline(nlp_pipeline.fit(spark.createDataFrame([['']]).toDF("text"))) - -annotations = light_pipeline.fullAnnotate('''This 73 y/o patient had CT on 1/12/95, with progressive memory and cognitive decline since 8/11/94.''') -``` - -```scala -... -val ner_tagger = sparknlp.annotators.NerDLModel().pretrained("jsl_ner_wip_greedy_clinical","en","clinical/models") - .setInputCols("sentences", "tokens", "embeddings") - .setOutputCol("ner_tags") - -val re_model = RelationExtractionModel() - .pretrained("re_date", "en", 'clinical/models') - .setInputCols(Array("embeddings", "pos_tags", "ner_chunks", "dependencies")) - .setOutputCol("relations") - .setMaxSyntacticDistance(3) #default: 0 - .setPredictionThreshold(0.9) #default: 0.5 - .setRelationPairs(Array("test-date", "symptom-date")) # Possible relation pairs. Default: All Relations. 
- -val nlpPipeline = new Pipeline().setStages(Array(documenter, sentencer,tokenizer, words_embedder, pos_tagger, ner_tagger, ner_chunker, dependency_parser,re_model)) - -val result = pipeline.fit(Seq.empty[String]).transform(data) - -val annotations = light_pipeline.fullAnnotate('''This 73 y/o patient had CT on 1/12/95, with progressive memory and cognitive decline since 8/11/94.''') -``` - -
- -## Results - -```bash -| | relations | entity1 | entity1_begin | entity1_end | chunk1 | entity2 | entity2_end | entity2_end | chunk2 | confidence | -|---|-----------|---------|---------------|-------------|------------------------------------------|---------|-------------|-------------|---------|------------| -| 0 | 1 | Test | 24 | 25 | CT | Date | 31 | 37 | 1/12/95 | 1.0 | -| 1 | 1 | Symptom | 45 | 84 | progressive memory and cognitive decline | Date | 92 | 98 | 8/11/94 | 1.0 | -``` - -{:.model-param} -## Model Information - -{:.table-model} -|---|---| -|Model Name:|re_date_clinical| -|Type:|re| -|Compatibility:|Spark NLP 2.7.1+| -|License:|Licensed| -|Edition:|Official| -|Input Labels:|[embeddings, pos_tags, train_ner_chunks, dependencies]| -|Output Labels:|[relations]| -|Language:|en| -|Dependencies:|embeddings_clinical| - -## Data Source - -Trained on data gathered and manually annotated by John Snow Labs - -## Benchmarking - -```bash -| relation | recall | precision | f1 | -|----------|--------|-----------|------| -| 0 | 0.74 | 0.71 | 0.72 | -| 1 | 0.94 | 0.95 | 0.94 | +--- +layout: model +title: Relation extraction between dates and clinical entities +author: John Snow Labs +name: re_date_clinical +date: 2021-01-18 +task: Relation Extraction +language: en +edition: Spark NLP for Healthcare 2.7.1 +spark_version: 2.4 +tags: [en, relation_extraction, clinical, licensed] +supported: true +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +Relation extraction between date and related other entities. `1` : Shows there is a relation between the date entity and other clinical entities, `0` : Shows there is no relation between the date entity and other clinical entities. 
+ +## Predicted Entities + +`0`, `1` + +{:.btn-box} + +[Open in Colab](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp-workshop/blob/master/tutorials/Certification_Trainings/Healthcare/10.Clinical_Relation_Extraction.ipynb){:.button.button-orange.button-orange-trans.co.button-icon} +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/clinical/models/re_date_clinical_en_2.7.1_2.4_1611000334654.zip){:.button.button-orange.button-orange-trans.arr.button-icon} + +## How to use + +In the table below, `re_date_clinical` RE model, its labels, optimal NER model, and meaningful relation pairs are illustrated. + +| RE MODEL | RE MODEL LABELS | NER MODEL | RE PAIRS | +|:----------------:|:---------------:|:---------:|:--------:| +| re_date_clinical | 0,1 | ner_jsl | [“date-admission_discharge”,
“admission_discharge-date”,
“date-alcohol”,
“alcohol-date”,
“date-allergen”,
“allergen-date”,
“date-bmi”,
“bmi-date”,
“date-birth_entity”,
“birth_entity-date”,
“date-blood_pressure”,
“blood_pressure-date”,
“date-cerebrovascular_disease”,
“cerebrovascular_disease-date”,
“date-clinical_dept”,
“clinical_dept-date”,
“date-communicable_disease”,
“communicable_disease-date”,
“date-death_entity”,
“death_entity-date”,
“date-diabetes”,
“diabetes-date”,
“date-diet”,
“diet-date”,
“date-disease_syndrome_disorder”,
“disease_syndrome_disorder-date”,
“date-drug_brandname”,
“drug_brandname-date”,
“date-drug_ingredient”,
“drug_ingredient-date”,
“date-ekg_findings”,
“ekg_findings-date”,
“date-external_body_part_or_region”,
“external_body_part_or_region-date”,
“date-fetus_newborn”,
“fetus_newborn-date”,
“date-hdl”,
“hdl-date”,
“date-heart_disease”,
“heart_disease-date”,
“date-height”,
“height-date”,
“date-hyperlipidemia”,
“hyperlipidemia-date”,
“date-hypertension”,
“hypertension-date”,
“date-imagingfindings”,
“imagingfindings-date”,
“date-imaging_technique”,
“imaging_technique-date”,
“date-injury_or_poisoning”,
“injury_or_poisoning-date”,
“date-internal_organ_or_component”,
“internal_organ_or_component-date”,
“date-kidney_disease”,
“kidney_disease-date”,
“date-ldl”,
“ldl-date”,
“date-modifier”,
“modifier-date”,
“date-o2_saturation”,
“o2_saturation-date”,
“date-obesity”,
“obesity-date”,
“date-oncological”,
“oncological-date”,
“date-overweight”,
“overweight-date”,
“date-oxygen_therapy”,
“oxygen_therapy-date”,
“date-pregnancy”,
“pregnancy-date”,
“date-procedure”,
“procedure-date”,
“date-psychological_condition”,
“psychological_condition-date”,
“date-pulse”,
“pulse-date”,
“date-respiration”,
“respiration-date”,
“date-smoking”,
“smoking-date”,
“date-substance”,
“substance-date”,
“date-substance_quantity”,
“substance_quantity-date”,
“date-symptom”,
“symptom-date”,
“date-temperature”,
“temperature-date”,
“date-test”,
“test-date”,
“date-test_result”,
“test_result-date”,
“date-total_cholesterol”,
“total_cholesterol-date”,
“date-treatment”,
“treatment-date”,
“date-triglycerides”,
“triglycerides-date”,
“date-vs_finding”,
“vs_finding-date”,
“date-vaccine”,
“vaccine-date”,
“date-vital_signs_header”,
“vital_signs_header-date”,
“date-weight”,
“weight-date”,
“time-admission_discharge”,
“admission_discharge-time”,
“time-alcohol”,
“alcohol-time”,
“time-allergen”,
“allergen-time”,
“time-bmi”,
“bmi-time”,
“time-birth_entity”,
“birth_entity-time”,
“time-blood_pressure”,
“blood_pressure-time”,
“time-cerebrovascular_disease”,
“cerebrovascular_disease-time”,
“time-clinical_dept”,
“clinical_dept-time”,
“time-communicable_disease”,
“communicable_disease-time”,
“time-death_entity”,
“death_entity-time”,
“time-diabetes”,
“diabetes-time”,
“time-diet”,
“diet-time”,
“time-disease_syndrome_disorder”,
“disease_syndrome_disorder-time”,
“time-drug_brandname”,
“drug_brandname-time”,
“time-drug_ingredient”,
“drug_ingredient-time”,
“time-ekg_findings”,
“ekg_findings-time”,
“time-external_body_part_or_region”,
“external_body_part_or_region-time”,
“time-fetus_newborn”,
“fetus_newborn-time”,
“time-hdl”,
“hdl-time”,
“time-heart_disease”,
“heart_disease-time”,
“time-height”,
“height-time”,
“time-hyperlipidemia”,
“hyperlipidemia-time”,
“time-hypertension”,
“hypertension-time”,
“time-imagingfindings”,
“imagingfindings-time”,
“time-imaging_technique”,
“imaging_technique-time”,
“time-injury_or_poisoning”,
“injury_or_poisoning-time”,
“time-internal_organ_or_component”,
“internal_organ_or_component-time”,
“time-kidney_disease”,
“kidney_disease-time”,
“time-ldl”,
“ldl-time”,
“time-modifier”,
“modifier-time”,
“time-o2_saturation”,
“o2_saturation-time”,
“time-obesity”,
“obesity-time”,
“time-oncological”,
“oncological-time”,
“time-overweight”,
“overweight-time”,
“time-oxygen_therapy”,
“oxygen_therapy-time”,
“time-pregnancy”,
“pregnancy-time”,
“time-procedure”,
“procedure-time”,
“time-psychological_condition”,
“psychological_condition-time”,
“time-pulse”,
“pulse-time”,
“time-respiration”,
“respiration-time”,
“time-smoking”,
“smoking-time”,
“time-substance”,
“substance-time”,
“time-substance_quantity”,
“substance_quantity-time”,
“time-symptom”,
“symptom-time”,
“time-temperature”,
“temperature-time”,
“time-test”,
“test-time”,
“time-test_result”,
“test_result-time”,
“time-total_cholesterol”,
“total_cholesterol-time”,
“time-treatment”,
“treatment-time”,
“time-triglycerides”,
“triglycerides-time”,
“time-vs_finding”,
“vs_finding-time”,
“time-vaccine”,
“vaccine-time”,
“time-vital_signs_header”,
“vital_signs_header-time”,
“time-weight”,
“weight-time”,
“relativedate-admission_discharge”,
“admission_discharge-relativedate”,
“relativedate-alcohol”,
“alcohol-relativedate”,
“relativedate-allergen”,
“allergen-relativedate”,
“relativedate-bmi”,
“bmi-relativedate”,
“relativedate-birth_entity”,
“birth_entity-relativedate”,
“relativedate-blood_pressure”,
“blood_pressure-relativedate”,
“relativedate-cerebrovascular_disease”,
“cerebrovascular_disease-relativedate”,
“relativedate-clinical_dept”,
“clinical_dept-relativedate”,
“relativedate-communicable_disease”,
“communicable_disease-relativedate”,
“relativedate-death_entity”,
“death_entity-relativedate”,
“relativedate-diabetes”,
“diabetes-relativedate”,
“relativedate-diet”,
“diet-relativedate”,
“relativedate-disease_syndrome_disorder”,
“disease_syndrome_disorder-relativedate”,
“relativedate-drug_brandname”,
“drug_brandname-relativedate”,
“relativedate-drug_ingredient”,
“drug_ingredient-relativedate”,
“relativedate-ekg_findings”,
“ekg_findings-relativedate”,
“relativedate-external_body_part_or_region”,
“external_body_part_or_region-relativedate”,
“relativedate-fetus_newborn”,
“fetus_newborn-relativedate”,
“relativedate-hdl”,
“hdl-relativedate”,
“relativedate-heart_disease”,
“heart_disease-relativedate”,
“relativedate-height”,
“height-relativedate”,
“relativedate-hyperlipidemia”,
“hyperlipidemia-relativedate”,
“relativedate-hypertension”,
“hypertension-relativedate”,
“relativedate-imagingfindings”,
“imagingfindings-relativedate”,
“relativedate-imaging_technique”,
“imaging_technique-relativedate”,
“relativedate-injury_or_poisoning”,
“injury_or_poisoning-relativedate”,
“relativedate-internal_organ_or_component”,
“internal_organ_or_component-relativedate”,
“relativedate-kidney_disease”,
“kidney_disease-relativedate”,
“relativedate-ldl”,
“ldl-relativedate”,
“relativedate-modifier”,
“modifier-relativedate”,
“relativedate-o2_saturation”,
“o2_saturation-relativedate”,
“relativedate-obesity”,
“obesity-relativedate”,
“relativedate-oncological”,
“oncological-relativedate”,
“relativedate-overweight”,
“overweight-relativedate”,
“relativedate-oxygen_therapy”,
“oxygen_therapy-relativedate”,
“relativedate-pregnancy”,
“pregnancy-relativedate”,
“relativedate-procedure”,
“procedure-relativedate”,
“relativedate-psychological_condition”,
“psychological_condition-relativedate”,
“relativedate-pulse”,
“pulse-relativedate”,
“relativedate-respiration”,
“respiration-relativedate”,
“relativedate-smoking”,
“smoking-relativedate”,
“relativedate-substance”,
“substance-relativedate”,
“relativedate-substance_quantity”,
“substance_quantity-relativedate”,
“relativedate-symptom”,
“symptom-relativedate”,
“relativedate-temperature”,
“temperature-relativedate”,
“relativedate-test”,
“test-relativedate”,
“relativedate-test_result”,
“test_result-relativedate”,
“relativedate-total_cholesterol”,
“total_cholesterol-relativedate”,
“relativedate-treatment”,
“treatment-relativedate”,
“relativedate-triglycerides”,
“triglycerides-relativedate”,
“relativedate-vs_finding”,
“vs_finding-relativedate”,
“relativedate-vaccine”,
“vaccine-relativedate”,
“relativedate-vital_signs_header”,
“vital_signs_header-relativedate”,
“relativedate-weight”,
“weight-relativedate”,
“relativetime-admission_discharge”,
“admission_discharge-relativetime”,
“relativetime-alcohol”,
“alcohol-relativetime”,
“relativetime-allergen”,
“allergen-relativetime”,
“relativetime-bmi”,
“bmi-relativetime”,
“relativetime-birth_entity”,
“birth_entity-relativetime”,
“relativetime-blood_pressure”,
“blood_pressure-relativetime”,
“relativetime-cerebrovascular_disease”,
“cerebrovascular_disease-relativetime”,
“relativetime-clinical_dept”,
“clinical_dept-relativetime”,
“relativetime-communicable_disease”,
“communicable_disease-relativetime”,
“relativetime-death_entity”,
“death_entity-relativetime”,
“relativetime-diabetes”,
“diabetes-relativetime”,
“relativetime-diet”,
“diet-relativetime”,
“relativetime-disease_syndrome_disorder”,
“disease_syndrome_disorder-relativetime”,
“relativetime-drug_brandname”,
“drug_brandname-relativetime”,
“relativetime-drug_ingredient”,
“drug_ingredient-relativetime”,
“relativetime-ekg_findings”,
“ekg_findings-relativetime”,
“relativetime-external_body_part_or_region”,
“external_body_part_or_region-relativetime”,
“relativetime-fetus_newborn”,
“fetus_newborn-relativetime”,
“relativetime-hdl”,
“hdl-relativetime”,
“relativetime-heart_disease”,
“heart_disease-relativetime”,
“relativetime-height”,
“height-relativetime”,
“relativetime-hyperlipidemia”,
“hyperlipidemia-relativetime”,
“relativetime-hypertension”,
“hypertension-relativetime”,
“relativetime-imagingfindings”,
“imagingfindings-relativetime”,
“relativetime-imaging_technique”,
“imaging_technique-relativetime”,
“relativetime-injury_or_poisoning”,
“injury_or_poisoning-relativetime”,
“relativetime-internal_organ_or_component”,
“internal_organ_or_component-relativetime”,
“relativetime-kidney_disease”,
“kidney_disease-relativetime”,
“relativetime-ldl”,
“ldl-relativetime”,
“relativetime-modifier”,
“modifier-relativetime”,
“relativetime-o2_saturation”,
“o2_saturation-relativetime”,
“relativetime-obesity”,
“obesity-relativetime”,
“relativetime-oncological”,
“oncological-relativetime”,
“relativetime-overweight”,
“overweight-relativetime”,
“relativetime-oxygen_therapy”,
“oxygen_therapy-relativetime”,
“relativetime-pregnancy”,
“pregnancy-relativetime”,
“relativetime-procedure”,
“procedure-relativetime”,
“relativetime-psychological_condition”,
“psychological_condition-relativetime”,
“relativetime-pulse”,
“pulse-relativetime”,
“relativetime-respiration”,
“respiration-relativetime”,
“relativetime-smoking”,
“smoking-relativetime”,
“relativetime-substance”,
“substance-relativetime”,
“relativetime-substance_quantity”,
“substance_quantity-relativetime”,
“relativetime-symptom”,
“symptom-relativetime”,
“relativetime-temperature”,
“temperature-relativetime”,
“relativetime-test”,
“test-relativetime”,
“relativetime-test_result”,
“test_result-relativetime”,
“relativetime-total_cholesterol”,
“total_cholesterol-relativetime”,
“relativetime-treatment”,
“treatment-relativetime”,
“relativetime-triglycerides”,
“triglycerides-relativetime”,
“relativetime-vs_finding”,
“vs_finding-relativetime”,
“relativetime-vaccine”,
“vaccine-relativetime”,
“relativetime-vital_signs_header”,
“vital_signs_header-relativetime”,
“relativetime-weight”,
“weight-relativetime”] | + + +Use as part of an nlp pipeline with the following stages: DocumentAssembler, SentenceDetector, Tokenizer, PerceptronModel, DependencyParserModel, WordEmbeddingsModel, NerDLModel, NerConverter, RelationExtractionModel. + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} + +```python +documenter = DocumentAssembler()\ + .setInputCol("text")\ + .setOutputCol("document") + +sentencer = SentenceDetector()\ + .setInputCols(["document"])\ + .setOutputCol("sentences") + +tokenizer = Tokenizer()\ + .setInputCols(["sentences"])\ + .setOutputCol("tokens") + +word_embeddings = WordEmbeddingsModel.pretrained("embeddings_clinical", "en", "clinical/models")\ + .setInputCols(["sentences", "tokens"])\ + .setOutputCol("embeddings") + +pos_tagger = PerceptronModel()\ + .pretrained("pos_clinical", "en", "clinical/models") \ + .setInputCols(["sentences", "tokens"])\ + .setOutputCol("pos_tags") + +ner_tagger = MedicalNerModel().pretrained("jsl_ner_wip_greedy_clinical","en","clinical/models")\ + .setInputCols("sentences", "tokens", "embeddings")\ + .setOutputCol("ner_tags") + +ner_chunker = NerConverterInternal()\ + .setInputCols(["sentences", "tokens", "ner_tags"])\ + .setOutputCol("ner_chunks") + +dependency_parser = DependencyParserModel()\ + .pretrained("dependency_conllu", "en")\ + .setInputCols(["sentences", "pos_tags", "tokens"])\ + .setOutputCol("dependencies") + +re_model = RelationExtractionModel().pretrained("re_date_clinical", "en", "clinical/models")\ + .setInputCols(["embeddings", "pos_tags", "ner_chunks", "dependencies"])\ + .setOutputCol("relations")\ + .setMaxSyntacticDistance(3)\ + .setPredictionThreshold(0.9)\ + .setRelationPairs(["test-date", "symptom-date"]) # Possible relation pairs. Default: All Relations. 
+ +nlp_pipeline = Pipeline(stages=[documenter, sentencer,tokenizer, word_embeddings, pos_tagger, ner_tagger, ner_chunker, dependency_parser, re_model]) + +light_pipeline = LightPipeline(nlp_pipeline.fit(spark.createDataFrame([['']]).toDF("text"))) + +annotations = light_pipeline.fullAnnotate('''This 73 y/o patient had CT on 1/12/95, with progressive memory and cognitive decline since 8/11/94.''') +``` + +```scala +val documenter = new DocumentAssembler() + .setInputCol("text") + .setOutputCol("document") + +val sentencer = new SentenceDetector() + .setInputCols(Array("document")) + .setOutputCol("sentences") + +val tokenizer = new Tokenizer() + .setInputCols("sentences") + .setOutputCol("tokens") + +val word_embeddings = WordEmbeddingsModel.pretrained("embeddings_clinical", "en", "clinical/models") + .setInputCols(Array("sentences", "tokens")) + .setOutputCol("embeddings") + +val pos_tagger = PerceptronModel + .pretrained("pos_clinical", "en", "clinical/models") + .setInputCols(Array("sentences", "tokens")) + .setOutputCol("pos_tags") + +val ner_tagger = MedicalNerModel.pretrained("jsl_ner_wip_greedy_clinical","en","clinical/models") + .setInputCols(Array("sentences", "tokens", "embeddings")) + .setOutputCol("ner_tags") + +val ner_chunker = new NerConverterInternal() + .setInputCols(Array("sentences", "tokens", "ner_tags")) + .setOutputCol("ner_chunks") + +val dependency_parser = DependencyParserModel + .pretrained("dependency_conllu", "en") + .setInputCols(Array("sentences", "pos_tags", "tokens")) + .setOutputCol("dependencies") + +val re_model = RelationExtractionModel + .pretrained("re_date_clinical", "en", "clinical/models") + .setInputCols(Array("embeddings", "pos_tags", "ner_chunks", "dependencies")) + .setOutputCol("relations") + .setMaxSyntacticDistance(3) // default: 0 + .setPredictionThreshold(0.9) // default: 0.5 + .setRelationPairs(Array("test-date", "symptom-date")) // Possible relation pairs. Default: All Relations. 
+ +val nlpPipeline = new Pipeline().setStages(Array(documenter, sentencer,tokenizer, word_embeddings, pos_tagger, ner_tagger, ner_chunker, dependency_parser, re_model)) + +val data = Seq("""This 73 y/o patient had CT on 1/12/95, with progressive memory and cognitive decline since 8/11/94.""").toDS.toDF("text") + +val result = nlpPipeline.fit(data).transform(data) +``` + +
+ +## Results + +```bash +| | relations | entity1 | entity1_begin | entity1_end | chunk1 | entity2 | entity2_begin | entity2_end | chunk2 | confidence | +|---|-----------|---------|---------------|-------------|------------------------------------------|---------|---------------|-------------|---------|------------| +| 0 | 1 | Test | 24 | 25 | CT | Date | 31 | 37 | 1/12/95 | 1.0 | +| 1 | 1 | Symptom | 45 | 84 | progressive memory and cognitive decline | Date | 92 | 98 | 8/11/94 | 1.0 | +``` + +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|re_date_clinical| +|Type:|re| +|Compatibility:|Spark NLP 2.7.1+| +|License:|Licensed| +|Edition:|Official| +|Input Labels:|[embeddings, pos_tags, train_ner_chunks, dependencies]| +|Output Labels:|[relations]| +|Language:|en| +|Dependencies:|embeddings_clinical| + +## Data Source + +Trained on data gathered and manually annotated by John Snow Labs + +## Benchmarking + +```bash +label recall precision f1 +0 0.74 0.71 0.72 +1 0.94 0.95 0.94 ``` \ No newline at end of file diff --git a/docs/_posts/aydinmyilmaz/2021-01-20-ner_deid_augmented_en.md b/docs/_posts/aydinmyilmaz/2021-01-20-ner_deid_augmented_en.md index 487df76a659aa..d26b0dffcfffb 100644 --- a/docs/_posts/aydinmyilmaz/2021-01-20-ner_deid_augmented_en.md +++ b/docs/_posts/aydinmyilmaz/2021-01-20-ner_deid_augmented_en.md @@ -1,138 +1,178 @@ ---- -layout: model -title: Detect PHI for Deidentification (Augmented) -author: John Snow Labs -name: ner_deid_augmented -date: 2021-01-20 -task: Named Entity Recognition -language: en -edition: Healthcare NLP 2.7.1 -spark_version: 2.4 -tags: [en, deidentify, ner, clinical, licensed] -supported: true -article_header: -type: cover -use_language_switcher: "Python-Scala-Java" ---- - -## Description - -Deidentification NER (Augmented) is a Named Entity Recognition model that annotates text to find protected health information that may need to be deidentified. 
- -We sticked to official annotation guideline (AG) for 2014 i2b2 Deid challenge while annotating new datasets for this model. All the details regarding the nuances and explanations for AG can be found here [https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4978170/](https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4978170/) - -## Predicted Entities - -`AGE`, `CONTACT`, `DATE`, `ID`, `LOCATION`, `NAME`, `PROFESSION` - -{:.btn-box} -[Live Demo](https://demo.johnsnowlabs.com/healthcare/NER_DEMOGRAPHICS/){:.button.button-orange} -[Open in Colab](https://github.com/JohnSnowLabs/spark-nlp-workshop/blob/3de6f25c23cd487d829ac3ce444ef19cfbe02631/tutorials/Certification_Trainings/Healthcare/4.Clinical_DeIdentificiation.ipynb){:.button.button-orange.button-orange-trans.co.button-icon} -[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/clinical/models/ner_deid_augmented_en_2.7.1_2.4_1611145829422.zip){:.button.button-orange.button-orange-trans.arr.button-icon} - -## How to use - -This model is trained with the ‘embeddings_clinical’ word embeddings, so be sure to use the same embeddings within the pipeline in addition to document assembler, sentence detector, tokenizer and ner converter . - -
-{% include programmingLanguageSelectScalaPythonNLU.html %} -```python -ner_model = NerDLModel.pretrained("ner_deid_augmented","en","clinical/models")\ - .setInputCols(["sentence","token","word_embeddings"])\ - .setOutputCol("ner") - -nlpPipeline = Pipeline(stages=[document_assembler, sentence_detector, tokenizer, word_embeddings, ner_model, ner_converter]) - -model = nlpPipeline.fit(spark.createDataFrame([[""]]).toDF("text")) - -results = model.transform(spark.createDataFrame([['HISTORY OF PRESENT ILLNESS: Mr. Smith is a 60-year-old white male veteran with multiple comorbidities, who has a history of bladder cancer diagnosed approximately two years ago by the VA Hospital, Dr. John Green (2347165768). He underwent a resection there. He was to be admitted to the Day Hospital for cystectomy. He was seen in Urology Clinic and Radiology Clinic on 02/04/2003. HOSPITAL COURSE: Mr. Smith presented to the Day Hospital in anticipation for Urology surgery. On evaluation, EKG, echocardiogram was abnormal, a Cardiology consult was obtained. A cardiac adenosine stress MRI was then proceeded, same was positive for inducible ischemia, mild-to-moderate inferolateral subendocardial infarction with peri-infarct ischemia. In addition, inducible ischemia seen in the inferior lateral septum. Mr. Smith underwent a left heart catheterization, which revealed two vessel coronary artery disease. The RCA, proximal was 95% stenosed and the distal 80% stenosed. The mid LAD was 85% stenosed and the distal LAD was 85% stenosed. There was four Multi-Link Vision bare metal stents placed to decrease all four lesions to 0%. Following intervention, Mr. Smith was admitted to 7 Ardmore Tower under Cardiology Service under the direction of Dr. Hart. Mr. Smith had a noncomplicated post-intervention hospital course. He was stable for discharge home on 02/07/2003 with instructions to take Plavix daily for one month and Urology is aware of the same. ']], ["text"])) - -``` - -```scala -... 
-val ner_model = NerDLModel.pretrained("ner_deid_augmented","en","clinical/models") - .setInputCols(Array("sentence","token","word_embeddings")) - .setOutputCol("ner") - -val nlpPipeline = new Pipeline().setStages(Array(document_assembler, sentence_detector, tokenizer, word_embeddings, ner_model, ner_converter)) - -val result = pipeline.fit(Seq.empty[String]).transform(data) - -val results = LightPipeline(model).fullAnnotate('''HISTORY OF PRESENT ILLNESS: Mr. Smith is a 60-year-old white male veteran with multiple comorbidities, who has a history of bladder cancer diagnosed approximately two years ago by the VA Hospital, Dr. John Green (2347165768). He underwent a resection there. He was to be admitted to the Day Hospital for cystectomy. He was seen in Urology Clinic and Radiology Clinic on 02/04/2003. HOSPITAL COURSE: Mr. Smith presented to the Day Hospital in anticipation for Urology surgery. On evaluation, EKG, echocardiogram was abnormal, a Cardiology consult was obtained. A cardiac adenosine stress MRI was then proceeded, same was positive for inducible ischemia, mild-to-moderate inferolateral subendocardial infarction with peri-infarct ischemia. In addition, inducible ischemia seen in the inferior lateral septum. Mr. Smith underwent a left heart catheterization, which revealed two vessel coronary artery disease. The RCA, proximal was 95% stenosed and the distal 80% stenosed. The mid LAD was 85% stenosed and the distal LAD was 85% stenosed. There was four Multi-Link Vision bare metal stents placed to decrease all four lesions to 0%. Following intervention, Mr. Smith was admitted to 7 Ardmore Tower under Cardiology Service under the direction of Dr. Hart. Mr. Smith had a noncomplicated post-intervention hospital course. 
He was stable for discharge home on 02/07/2003 with instructions to take Plavix daily for one month and Urology is aware of the same.''') -``` - - - -{:.nlu-block} -```python -import nlu -nlu.load("en.med_ner.deid.augmented").predict("""HISTORY OF PRESENT ILLNESS: Mr. Smith is a 60-year-old white male veteran with multiple comorbidities, who has a history of bladder cancer diagnosed approximately two years ago by the VA Hospital, Dr. John Green (2347165768). He underwent a resection there. He was to be admitted to the Day Hospital for cystectomy. He was seen in Urology Clinic and Radiology Clinic on 02/04/2003. HOSPITAL COURSE: Mr. Smith presented to the Day Hospital in anticipation for Urology surgery. On evaluation, EKG, echocardiogram was abnormal, a Cardiology consult was obtained. A cardiac adenosine stress MRI was then proceeded, same was positive for inducible ischemia, mild-to-moderate inferolateral subendocardial infarction with peri-infarct ischemia. In addition, inducible ischemia seen in the inferior lateral septum. Mr. Smith underwent a left heart catheterization, which revealed two vessel coronary artery disease. The RCA, proximal was 95% stenosed and the distal 80% stenosed. The mid LAD was 85% stenosed and the distal LAD was 85% stenosed. There was four Multi-Link Vision bare metal stents placed to decrease all four lesions to 0%. Following intervention, Mr. Smith was admitted to 7 Ardmore Tower under Cardiology Service under the direction of Dr. Hart. Mr. Smith had a noncomplicated post-intervention hospital course. He was stable for discharge home on 02/07/2003 with instructions to take Plavix daily for one month and Urology is aware of the same. """) -``` - -
- -## Results - -```bash -+---------------+---------+ -|chunk |ner_label| -+---------------+---------+ -|Smith |NAME | -|VA Hospital |LOCATION | -|John Green |NAME | -|2347165768 |ID | -|Day Hospital |LOCATION | -|02/04/2003 |DATE | -|Smith |NAME | -|Day Hospital |LOCATION | -|Smith |NAME | -|Smith |NAME | -|7 Ardmore Tower|LOCATION | -|Hart |NAME | -|Smith |NAME | -|02/07/2003 |DATE | -+---------------+---------+ -``` - -{:.model-param} -## Model Information - -{:.table-model} -|---|---| -|Model Name:|ner_deid_augmented| -|Type:|ner| -|Compatibility:|Spark NLP 2.7.1+| -|License:|Licensed| -|Edition:|Official| -|Input Labels:|[sentence, token, embeddings]| -|Output Labels:|[ner]| -|Language:|en| -|Dependencies:|embeddings_clinical| - -## Data Source - -Trained on plain n2c2 2014: De-identification and Heart Disease Risk Factors Challenge datasets with embeddings_clinical https://portal.dbmi.hms.harvard.edu/projects/n2c2-2014/ - -## Benchmarking - -```bash -| | label | tp | fp | fn | prec | rec | f1 | -|---:|--------------:|------:|------:|------:|---------:|---------:|---------:| -| 0 | I-NAME | 1096 | 47 | 80 | 0.95888 | 0.931973 | 0.945235 | -| 1 | I-CONTACT | 93 | 0 | 4 | 1 | 0.958763 | 0.978947 | -| 2 | I-AGE | 3 | 1 | 6 | 0.75 | 0.333333 | 0.461538 | -| 3 | B-DATE | 2078 | 42 | 52 | 0.980189 | 0.975587 | 0.977882 | -| 4 | I-DATE | 474 | 39 | 25 | 0.923977 | 0.9499 | 0.936759 | -| 5 | I-LOCATION | 755 | 68 | 76 | 0.917375 | 0.908544 | 0.912938 | -| 6 | I-PROFESSION | 78 | 8 | 9 | 0.906977 | 0.896552 | 0.901734 | -| 7 | B-NAME | 1182 | 101 | 36 | 0.921278 | 0.970443 | 0.945222 | -| 8 | B-AGE | 259 | 10 | 11 | 0.962825 | 0.959259 | 0.961039 | -| 9 | B-ID | 146 | 8 | 11 | 0.948052 | 0.929936 | 0.938907 | -| 10 | B-PROFESSION | 76 | 9 | 21 | 0.894118 | 0.783505 | 0.835165 | -| 11 | B-LOCATION | 556 | 87 | 71 | 0.864697 | 0.886762 | 0.875591 | -| 12 | I-ID | 64 | 8 | 3 | 0.888889 | 0.955224 | 0.920863 | -| 13 | B-CONTACT | 40 | 7 | 5 | 0.851064 | 0.888889 | 0.869565 
| -| 14 | Macro-average | 6900 | 435 | 410 | 0.912023 | 0.880619 | 0.896046 | -| 15 | Micro-average | 6900 | 435 | 410 | 0.940695 | 0.943912 | 0.942301 | -``` \ No newline at end of file +--- +layout: model +title: Detect PHI for Deidentification (Augmented) +author: John Snow Labs +name: ner_deid_augmented +date: 2021-01-20 +task: Named Entity Recognition +language: en +edition: Spark NLP for Healthcare 2.7.1 +spark_version: 2.4 +tags: [en, deidentify, ner, clinical, licensed] +supported: true +article_header: +type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +Deidentification NER (Augmented) is a Named Entity Recognition model that annotates text to find protected health information that may need to be deidentified. + +We adhered to the official annotation guideline (AG) for the 2014 i2b2 Deid challenge while annotating new datasets for this model. All the details regarding the nuances and explanations for AG can be found here [https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4978170/](https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4978170/) + +## Predicted Entities + +`AGE`, `CONTACT`, `DATE`, `ID`, `LOCATION`, `NAME`, `PROFESSION` + +{:.btn-box} +[Live Demo](https://demo.johnsnowlabs.com/healthcare/NER_DEMOGRAPHICS/){:.button.button-orange} +[Open in Colab](https://github.com/JohnSnowLabs/spark-nlp-workshop/blob/3de6f25c23cd487d829ac3ce444ef19cfbe02631/tutorials/Certification_Trainings/Healthcare/4.Clinical_DeIdentificiation.ipynb){:.button.button-orange.button-orange-trans.co.button-icon} +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/clinical/models/ner_deid_augmented_en_2.7.1_2.4_1611145829422.zip){:.button.button-orange.button-orange-trans.arr.button-icon} + +## How to use + +This model is trained with the ‘embeddings_clinical’ word embeddings, so be sure to use the same embeddings within the pipeline in addition to the document assembler, sentence detector, tokenizer, and NER converter. + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} + +```python +document_assembler = DocumentAssembler()\ + .setInputCol("text")\ + .setOutputCol("document") + +sentence_detector = SentenceDetector()\ + .setInputCols(["document"])\ + .setOutputCol("sentence") + +tokenizer = Tokenizer()\ + .setInputCols(["sentence"])\ + .setOutputCol("token") + +word_embeddings = WordEmbeddingsModel.pretrained("embeddings_clinical", "en", "clinical/models")\ + .setInputCols(["sentence", "token"])\ + .setOutputCol("embeddings")  # column name consumed by ner_model below + +ner_model = NerDLModel.pretrained("ner_deid_augmented","en","clinical/models")\ + .setInputCols(["sentence","token","embeddings"])\ + .setOutputCol("ner") + +ner_converter = NerConverter() \ + .setInputCols(['sentence', 'token', 'ner']) \ + .setOutputCol('ner_chunk') + +nlpPipeline = Pipeline(stages=[document_assembler, sentence_detector, tokenizer, word_embeddings, ner_model, ner_converter]) + +model = nlpPipeline.fit(spark.createDataFrame([[""]]).toDF("text")) + +results = model.transform(spark.createDataFrame([['HISTORY OF PRESENT ILLNESS: Mr. Smith is a 60-year-old white male veteran with multiple comorbidities, who has a history of bladder cancer diagnosed approximately two years ago by the VA Hospital, Dr. John Green (2347165768). He underwent a resection there. He was to be admitted to the Day Hospital for cystectomy. He was seen in Urology Clinic and Radiology Clinic on 02/04/2003. HOSPITAL COURSE: Mr. Smith presented to the Day Hospital in anticipation for Urology surgery. On evaluation, EKG, echocardiogram was abnormal, a Cardiology consult was obtained. A cardiac adenosine stress MRI was then proceeded, same was positive for inducible ischemia, mild-to-moderate inferolateral subendocardial infarction with peri-infarct ischemia. In addition, inducible ischemia seen in the inferior lateral septum. Mr. Smith underwent a left heart catheterization, which revealed two vessel coronary artery disease. 
The RCA, proximal was 95% stenosed and the distal 80% stenosed. The mid LAD was 85% stenosed and the distal LAD was 85% stenosed. There was four Multi-Link Vision bare metal stents placed to decrease all four lesions to 0%. Following intervention, Mr. Smith was admitted to 7 Ardmore Tower under Cardiology Service under the direction of Dr. Hart. Mr. Smith had a noncomplicated post-intervention hospital course. He was stable for discharge home on 02/07/2003 with instructions to take Plavix daily for one month and Urology is aware of the same. ']], ["text"])) +``` + +```scala +val document_assembler = new DocumentAssembler() + .setInputCol("text") + .setOutputCol("document") + +val sentence_detector = new SentenceDetector() + .setInputCols(Array("document")) + .setOutputCol("sentence") + +val tokenizer = new Tokenizer() + .setInputCols(Array("sentence")) + .setOutputCol("token") + +val word_embeddings = WordEmbeddingsModel.pretrained("embeddings_clinical", "en", "clinical/models") + .setInputCols(Array("sentence", "token")) + .setOutputCol("embeddings") // column name consumed by ner_model below + +val ner_model = NerDLModel.pretrained("ner_deid_augmented","en","clinical/models") + .setInputCols(Array("sentence","token","embeddings")) + .setOutputCol("ner") + +val ner_converter = new NerConverter() + .setInputCols(Array("sentence", "token", "ner")) + .setOutputCol("ner_chunk") + +val nlpPipeline = new Pipeline().setStages(Array(document_assembler, sentence_detector, tokenizer, word_embeddings, ner_model, ner_converter)) + +val data = Seq("""HISTORY OF PRESENT ILLNESS: Mr. Smith is a 60-year-old white male veteran with multiple comorbidities, who has a history of bladder cancer diagnosed approximately two years ago by the VA Hospital, Dr. John Green (2347165768). He underwent a resection there. He was to be admitted to the Day Hospital for cystectomy. He was seen in Urology Clinic and Radiology Clinic on 02/04/2003. 
HOSPITAL COURSE: Mr. Smith presented to the Day Hospital in anticipation for Urology surgery. On evaluation, EKG, echocardiogram was abnormal, a Cardiology consult was obtained. A cardiac adenosine stress MRI was then proceeded, same was positive for inducible ischemia, mild-to-moderate inferolateral subendocardial infarction with peri-infarct ischemia. In addition, inducible ischemia seen in the inferior lateral septum. Mr. Smith underwent a left heart catheterization, which revealed two vessel coronary artery disease. The RCA, proximal was 95% stenosed and the distal 80% stenosed. The mid LAD was 85% stenosed and the distal LAD was 85% stenosed. There was four Multi-Link Vision bare metal stents placed to decrease all four lesions to 0%. Following intervention, Mr. Smith was admitted to 7 Ardmore Tower under Cardiology Service under the direction of Dr. Hart. Mr. Smith had a noncomplicated post-intervention hospital course. He was stable for discharge home on 02/07/2003 with instructions to take Plavix daily for one month and Urology is aware of the same.""").toDS.toDF("text") // needs import spark.implicits._ + +val results = nlpPipeline.fit(data).transform(data) +``` + + + +{:.nlu-block} + +```python +import nlu +nlu.load("en.med_ner.deid.augmented").predict("""HISTORY OF PRESENT ILLNESS: Mr. Smith is a 60-year-old white male veteran with multiple comorbidities, who has a history of bladder cancer diagnosed approximately two years ago by the VA Hospital, Dr. John Green (2347165768). He underwent a resection there. He was to be admitted to the Day Hospital for cystectomy. He was seen in Urology Clinic and Radiology Clinic on 02/04/2003. HOSPITAL COURSE: Mr. Smith presented to the Day Hospital in anticipation for Urology surgery. On evaluation, EKG, echocardiogram was abnormal, a Cardiology consult was obtained. 
A cardiac adenosine stress MRI was then proceeded, same was positive for inducible ischemia, mild-to-moderate inferolateral subendocardial infarction with peri-infarct ischemia. In addition, inducible ischemia seen in the inferior lateral septum. Mr. 
Smith underwent a left heart catheterization, which revealed two vessel coronary artery disease. The RCA, proximal was 95% stenosed and the distal 80% stenosed. The mid LAD was 85% stenosed and the distal LAD was 85% stenosed. There was four Multi-Link Vision bare metal stents placed to decrease all four lesions to 0%. Following intervention, Mr. Smith was admitted to 7 Ardmore Tower under Cardiology Service under the direction of Dr. Hart. Mr. Smith had a noncomplicated post-intervention hospital course. He was stable for discharge home on 02/07/2003 with instructions to take Plavix daily for one month and Urology is aware of the same. """) +``` + +
+ +## Results + +```bash ++---------------+---------+ +|chunk |ner_label| ++---------------+---------+ +|Smith |NAME | +|VA Hospital |LOCATION | +|John Green |NAME | +|2347165768 |ID | +|Day Hospital |LOCATION | +|02/04/2003 |DATE | +|Smith |NAME | +|Day Hospital |LOCATION | +|Smith |NAME | +|Smith |NAME | +|7 Ardmore Tower|LOCATION | +|Hart |NAME | +|Smith |NAME | +|02/07/2003 |DATE | ++---------------+---------+ +``` + +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|ner_deid_augmented| +|Type:|ner| +|Compatibility:|Spark NLP 2.7.1+| +|License:|Licensed| +|Edition:|Official| +|Input Labels:|[sentence, token, embeddings]| +|Output Labels:|[ner]| +|Language:|en| +|Dependencies:|embeddings_clinical| + +## Data Source + +Trained on plain n2c2 2014: De-identification and Heart Disease Risk Factors Challenge datasets with embeddings_clinical https://portal.dbmi.hms.harvard.edu/projects/n2c2-2014/ + +## Benchmarking + +```bash +| | label | tp | fp | fn | prec | rec | f1 | +|---:|--------------:|------:|------:|------:|---------:|---------:|---------:| +| 0 | I-NAME | 1096 | 47 | 80 | 0.95888 | 0.931973 | 0.945235 | +| 1 | I-CONTACT | 93 | 0 | 4 | 1 | 0.958763 | 0.978947 | +| 2 | I-AGE | 3 | 1 | 6 | 0.75 | 0.333333 | 0.461538 | +| 3 | B-DATE | 2078 | 42 | 52 | 0.980189 | 0.975587 | 0.977882 | +| 4 | I-DATE | 474 | 39 | 25 | 0.923977 | 0.9499 | 0.936759 | +| 5 | I-LOCATION | 755 | 68 | 76 | 0.917375 | 0.908544 | 0.912938 | +| 6 | I-PROFESSION | 78 | 8 | 9 | 0.906977 | 0.896552 | 0.901734 | +| 7 | B-NAME | 1182 | 101 | 36 | 0.921278 | 0.970443 | 0.945222 | +| 8 | B-AGE | 259 | 10 | 11 | 0.962825 | 0.959259 | 0.961039 | +| 9 | B-ID | 146 | 8 | 11 | 0.948052 | 0.929936 | 0.938907 | +| 10 | B-PROFESSION | 76 | 9 | 21 | 0.894118 | 0.783505 | 0.835165 | +| 11 | B-LOCATION | 556 | 87 | 71 | 0.864697 | 0.886762 | 0.875591 | +| 12 | I-ID | 64 | 8 | 3 | 0.888889 | 0.955224 | 0.920863 | +| 13 | B-CONTACT | 40 | 7 | 5 | 0.851064 | 0.888889 | 0.869565 
| +| 14 | Macro-average | 6900 | 435 | 410 | 0.912023 | 0.880619 | 0.896046 | +| 15 | Micro-average | 6900 | 435 | 410 | 0.940695 | 0.943912 | 0.942301 | +``` diff --git a/docs/_posts/bunyamin-polat/2022-06-22-ner_living_species_bert_es_3_0.md b/docs/_posts/bunyamin-polat/2022-06-22-ner_living_species_bert_es_3_0.md index ee7f1ea0c81bf..402300b47dca6 100644 --- a/docs/_posts/bunyamin-polat/2022-06-22-ner_living_species_bert_es_3_0.md +++ b/docs/_posts/bunyamin-polat/2022-06-22-ner_living_species_bert_es_3_0.md @@ -71,11 +71,9 @@ ner_model, ner_converter ]) -model = pipeline.fit(spark.createDataFrame([[""]]).toDF("text")) - data = spark.createDataFrame([["""Lactante varón de dos años. Antecedentes familiares sin interés. Antecedentes personales: Embarazo, parto y periodo neonatal normal. En seguimiento por alergia a legumbres, diagnosticado con diez meses por reacción urticarial generalizada con lentejas y garbanzos, con dieta de exclusión a legumbres desde entonces. En ésta visita la madre describe episodios de eritema en zona maxilar derecha con afectación ocular ipsilateral que se resuelve en horas tras la administración de corticoides. Le ha ocurrido en 5-6 ocasiones, en relación con la ingesta de alimentos previamente tolerados. Exploración complementaria: Cacahuete, ac(ige)19.2 Ku.arb/l. 
Resultados: Ante la sospecha clínica de Síndrome de Frey, se tranquiliza a los padres, explicándoles la naturaleza del cuadro y se cita para revisión anual."""]]).toDF("text") -result = model.transform(data) +result = pipeline.fit(data).transform(data) ``` ```scala val document_assembler = new DocumentAssembler() @@ -102,7 +100,7 @@ val ner_converter = new NerConverter() .setInputCols(Array("sentence", "token", "ner")) .setOutputCol("ner_chunk") -val pipeline = new PipelineModel().setStages(Array(document_assembler, +val pipeline = new Pipeline().setStages(Array(document_assembler, sentence_detector, tokenizer, embeddings, diff --git a/docs/_posts/bunyamin-polat/2022-06-22-ner_living_species_bert_pt_3_0.md b/docs/_posts/bunyamin-polat/2022-06-22-ner_living_species_bert_pt_3_0.md index 7253a2616dcba..147c62158366e 100644 --- a/docs/_posts/bunyamin-polat/2022-06-22-ner_living_species_bert_pt_3_0.md +++ b/docs/_posts/bunyamin-polat/2022-06-22-ner_living_species_bert_pt_3_0.md @@ -76,11 +76,9 @@ ner_model, ner_converter ]) -model = pipeline.fit(spark.createDataFrame([[""]]).toDF("text")) - data = spark.createDataFrame([["""Uma rapariga de 16 anos com um historial pessoal de asma apresentou ao departamento de dermatologia com lesões cutâneas assintomáticas que tinham estado presentes durante 2 meses. A paciente tinha sido tratada com creme corticosteróide devido a uma suspeita inicial de eczema atópico, apesar do qual apresentava um crescimento progressivo marcado das lesões. Tinha um gato doméstico que ela nunca tinha levado ao veterinário. O exame físico revelou placas em forma de anel com uma borda periférica activa na parte superior das costas e nos aspectos laterais do pescoço e da face. Cultura local obtida por raspagem de tapete isolado Trichophyton rubrum. 
Com base em dados clínicos e cultura, foi estabelecido o diagnóstico de tinea incognito."""]]).toDF("text") -result = model.transform(data) +result = pipeline.fit(data).transform(data) ``` ```scala val document_assembler = new DocumentAssembler() @@ -107,7 +105,7 @@ val ner_converter = new NerConverter() .setInputCols(Array("sentence", "token", "ner")) .setOutputCol("ner_chunk") -val pipeline = new PipelineModel().setStages(Array(document_assembler, +val pipeline = new Pipeline().setStages(Array(document_assembler, sentence_detector, tokenizer, embeddings, diff --git a/docs/_posts/bunyamin-polat/2022-06-22-ner_living_species_biobert_en_3_0.md b/docs/_posts/bunyamin-polat/2022-06-22-ner_living_species_biobert_en_3_0.md index 1ae3f0375708b..779ae8d866948 100644 --- a/docs/_posts/bunyamin-polat/2022-06-22-ner_living_species_biobert_en_3_0.md +++ b/docs/_posts/bunyamin-polat/2022-06-22-ner_living_species_biobert_en_3_0.md @@ -76,11 +76,9 @@ ner_model, ner_converter ]) -model = pipeline.fit(spark.createDataFrame([[""]]).toDF("text")) - data = spark.createDataFrame([["""42-year-old woman with end-stage chronic kidney disease, secondary to lupus nephropathy, and on peritoneal dialysis. History of four episodes of bacterial peritonitis and change of Tenckhoff catheter six months prior to admission due to catheter dysfunction. Three peritoneal fluid samples during her hospitalisation tested positive for Fusarium spp. The patient responded favourably and continued outpatient treatment with voriconazole (4mg/kg every 12 hours orally). All three isolates were identified as species of the Fusarium solani complex. 
In vitro susceptibility to itraconazole, voriconazole and posaconazole, according to Clinical and Laboratory Standards Institute - CLSI (M38-A) methodology, showed a minimum inhibitory concentration (MIC) in all three isolates and for all three antifungals of >16 μg/mL."""]]).toDF("text") -result = model.transform(data) +result = pipeline.fit(data).transform(data) ``` ```scala val document_assembler = new DocumentAssembler() @@ -107,7 +105,7 @@ val ner_converter = new NerConverter() .setInputCols(Array("sentence", "token", "ner")) .setOutputCol("ner_chunk") -val pipeline = new PipelineModel().setStages(Array(document_assembler, +val pipeline = new Pipeline().setStages(Array(document_assembler, sentence_detector, tokenizer, embeddings, diff --git a/docs/_posts/bunyamin-polat/2022-06-22-ner_living_species_en_3_0.md b/docs/_posts/bunyamin-polat/2022-06-22-ner_living_species_en_3_0.md index c79f79d3021d5..412f275ade114 100644 --- a/docs/_posts/bunyamin-polat/2022-06-22-ner_living_species_en_3_0.md +++ b/docs/_posts/bunyamin-polat/2022-06-22-ner_living_species_en_3_0.md @@ -76,11 +76,9 @@ ner_model, ner_converter ]) -model = pipeline.fit(spark.createDataFrame([[""]]).toDF("text")) - data = spark.createDataFrame([["""42-year-old woman with end-stage chronic kidney disease, secondary to lupus nephropathy, and on peritoneal dialysis. History of four episodes of bacterial peritonitis and change of Tenckhoff catheter six months prior to admission due to catheter dysfunction. Three peritoneal fluid samples during her hospitalisation tested positive for Fusarium spp. The patient responded favourably and continued outpatient treatment with voriconazole (4mg/kg every 12 hours orally). All three isolates were identified as species of the Fusarium solani complex. 
In vitro susceptibility to itraconazole, voriconazole and posaconazole, according to Clinical and Laboratory Standards Institute - CLSI (M38-A) methodology, showed a minimum inhibitory concentration (MIC) in all three isolates and for all three antifungals of >16 μg/mL."""]]).toDF("text") -result = model.transform(data) +result = pipeline.fit(data).transform(data) ``` ```scala val document_assembler = new DocumentAssembler() @@ -107,7 +105,7 @@ val ner_converter = new NerConverter() .setInputCols(Array("sentence", "token", "ner")) .setOutputCol("ner_chunk") -val pipeline = new PipelineModel().setStages(Array(document_assembler, +val pipeline = new Pipeline().setStages(Array(document_assembler, sentence_detector, tokenizer, embeddings, diff --git a/docs/_posts/bunyamin-polat/2022-06-22-ner_living_species_es_3_0.md b/docs/_posts/bunyamin-polat/2022-06-22-ner_living_species_es_3_0.md index 7bf363f2423e7..3afbbd65af8ef 100644 --- a/docs/_posts/bunyamin-polat/2022-06-22-ner_living_species_es_3_0.md +++ b/docs/_posts/bunyamin-polat/2022-06-22-ner_living_species_es_3_0.md @@ -71,11 +71,9 @@ ner_model, ner_converter ]) -model = pipeline.fit(spark.createDataFrame([[""]]).toDF("text")) - data = spark.createDataFrame([["""Lactante varón de dos años. Antecedentes familiares sin interés. Antecedentes personales: Embarazo, parto y periodo neonatal normal. En seguimiento por alergia a legumbres, diagnosticado con diez meses por reacción urticarial generalizada con lentejas y garbanzos, con dieta de exclusión a legumbres desde entonces. En ésta visita la madre describe episodios de eritema en zona maxilar derecha con afectación ocular ipsilateral que se resuelve en horas tras la administración de corticoides. Le ha ocurrido en 5-6 ocasiones, en relación con la ingesta de alimentos previamente tolerados. Exploración complementaria: Cacahuete, ac(ige)19.2 Ku.arb/l. 
Resultados: Ante la sospecha clínica de Síndrome de Frey, se tranquiliza a los padres, explicándoles la naturaleza del cuadro y se cita para revisión anual."""]]).toDF("text") -result = model.transform(data) +result = pipeline.fit(data).transform(data) ``` ```scala val document_assembler = new DocumentAssembler() @@ -102,7 +100,7 @@ val ner_converter = new NerConverter() .setInputCols(Array("sentence", "token", "ner")) .setOutputCol("ner_chunk") -val pipeline = new PipelineModel().setStages(Array(document_assembler, +val pipeline = new Pipeline().setStages(Array(document_assembler, sentence_detector, tokenizer, embeddings, diff --git a/docs/_posts/bunyamin-polat/2022-06-22-ner_living_species_pt_3_0.md b/docs/_posts/bunyamin-polat/2022-06-22-ner_living_species_pt_3_0.md index 13c5b0a22874f..6d403d3dd4f49 100644 --- a/docs/_posts/bunyamin-polat/2022-06-22-ner_living_species_pt_3_0.md +++ b/docs/_posts/bunyamin-polat/2022-06-22-ner_living_species_pt_3_0.md @@ -76,11 +76,9 @@ ner_model, ner_converter ]) -model = pipeline.fit(spark.createDataFrame([[""]]).toDF("text")) - data = spark.createDataFrame([["""Uma rapariga de 16 anos com um historial pessoal de asma apresentou ao departamento de dermatologia com lesões cutâneas assintomáticas que tinham estado presentes durante 2 meses. A paciente tinha sido tratada com creme corticosteróide devido a uma suspeita inicial de eczema atópico, apesar do qual apresentava um crescimento progressivo marcado das lesões. Tinha um gato doméstico que ela nunca tinha levado ao veterinário. O exame físico revelou placas em forma de anel com uma borda periférica activa na parte superior das costas e nos aspectos laterais do pescoço e da face. Cultura local obtida por raspagem de tapete isolado Trichophyton rubrum. 
Com base em dados clínicos e cultura, foi estabelecido o diagnóstico de tinea incognito."""]]).toDF("text") -result = model.transform(data) +result = pipeline.fit(data).transform(data) ``` ```scala val document_assembler = new DocumentAssembler() @@ -88,11 +86,11 @@ val document_assembler = new DocumentAssembler() .setOutputCol("document") val sentence_detector = SentenceDetectorDLModel.pretrained("sentence_detector_dl", "xx") -.setInputCols("document") +.setInputCols(Array("document")) .setOutputCol("sentence") val tokenizer = new Tokenizer() -.setInputCols("sentence") +.setInputCols(Array("sentence")) .setOutputCol("token") val embeddings = WordEmbeddingsModel.pretrained("w2v_cc_300d","pt") @@ -107,7 +105,7 @@ val ner_converter = new NerConverter() .setInputCols(Array("sentence", "token", "ner")) .setOutputCol("ner_chunk") -val pipeline = new PipelineModel().setStages(Array(document_assembler, +val pipeline = new Pipeline().setStages(Array(document_assembler, sentence_detector, tokenizer, embeddings, diff --git a/docs/_posts/bunyamin-polat/2022-06-22-ner_living_species_roberta_es_3_0.md b/docs/_posts/bunyamin-polat/2022-06-22-ner_living_species_roberta_es_3_0.md index 3d88ee0ce8edc..b47abcd3c59c3 100644 --- a/docs/_posts/bunyamin-polat/2022-06-22-ner_living_species_roberta_es_3_0.md +++ b/docs/_posts/bunyamin-polat/2022-06-22-ner_living_species_roberta_es_3_0.md @@ -71,11 +71,9 @@ ner_model, ner_converter ]) -model = pipeline.fit(spark.createDataFrame([[""]]).toDF("text")) - data = spark.createDataFrame([["""Lactante varón de dos años. Antecedentes familiares sin interés. Antecedentes personales: Embarazo, parto y periodo neonatal normal. En seguimiento por alergia a legumbres, diagnosticado con diez meses por reacción urticarial generalizada con lentejas y garbanzos, con dieta de exclusión a legumbres desde entonces. 
En ésta visita la madre describe episodios de eritema en zona maxilar derecha con afectación ocular ipsilateral que se resuelve en horas tras la administración de corticoides. Le ha ocurrido en 5-6 ocasiones, en relación con la ingesta de alimentos previamente tolerados. Exploración complementaria: Cacahuete, ac(ige)19.2 Ku.arb/l. Resultados: Ante la sospecha clínica de Síndrome de Frey, se tranquiliza a los padres, explicándoles la naturaleza del cuadro y se cita para revisión anual."""]]).toDF("text") -result = model.transform(data) +result = pipeline.fit(data).transform(data) ``` ```scala val document_assembler = new DocumentAssembler() @@ -83,11 +81,11 @@ val document_assembler = new DocumentAssembler() .setOutputCol("document") val sentence_detector = SentenceDetectorDLModel.pretrained("sentence_detector_dl", "xx") -.setInputCols("document") +.setInputCols(Array("document")) .setOutputCol("sentence") val tokenizer = new Tokenizer() -.setInputCols("sentence") +.setInputCols(Array("sentence")) .setOutputCol("token") val embeddings = RoBertaEmbeddings.pretrained("roberta_base_biomedical", "es") @@ -102,7 +100,7 @@ val ner_converter = new NerConverter() .setInputCols(Array("sentence", "token", "ner")) .setOutputCol("ner_chunk") -val pipeline = new PipelineModel().setStages(Array(document_assembler, +val pipeline = new Pipeline().setStages(Array(document_assembler, sentence_detector, tokenizer, embeddings, diff --git a/docs/_posts/bunyamin-polat/2022-06-22-ner_living_species_roberta_pt_3_0.md b/docs/_posts/bunyamin-polat/2022-06-22-ner_living_species_roberta_pt_3_0.md index 0339bb365a198..b59ecbe3765df 100644 --- a/docs/_posts/bunyamin-polat/2022-06-22-ner_living_species_roberta_pt_3_0.md +++ b/docs/_posts/bunyamin-polat/2022-06-22-ner_living_species_roberta_pt_3_0.md @@ -76,11 +76,9 @@ ner_model, ner_converter ]) -model = pipeline.fit(spark.createDataFrame([[""]]).toDF("text")) - data = spark.createDataFrame([["""Mulher de 23 anos, de Capinota, Cochabamba, 
Bolívia. Ela está no nosso país há quatro anos. Frequentou o departamento de emergência obstétrica onde foi encontrada grávida de 37 semanas, com um colo dilatado de 5 cm e membranas rompidas. O obstetra de emergência realizou um teste de estreptococos negativo e solicitou um hemograma, glucose, bioquímica básica, HBV, HCV e serologia da sífilis."""]]).toDF("text") -result = model.transform(data) +result = pipeline.fit(data).transform(data) ``` ```scala val document_assembler = new DocumentAssembler() @@ -88,11 +86,11 @@ val document_assembler = new DocumentAssembler() .setOutputCol("document") val sentence_detector = SentenceDetectorDLModel.pretrained("sentence_detector_dl", "xx") -.setInputCols("document") +.setInputCols(Array("document")) .setOutputCol("sentence") val tokenizer = new Tokenizer() -.setInputCols("sentence") +.setInputCols(Array("sentence")) .setOutputCol("token") val embeddings = RoBertaEmbeddings.pretrained("roberta_embeddings_BR_BERTo","pt") @@ -107,7 +105,7 @@ val ner_converter = new NerConverter() .setInputCols(Array("sentence", "token", "ner")) .setOutputCol("ner_chunk") -val pipeline = new PipelineModel().setStages(Array(document_assembler, +val pipeline = new Pipeline().setStages(Array(document_assembler, sentence_detector, tokenizer, embeddings, diff --git a/docs/_posts/bunyamin-polat/2022-06-23-ner_living_species_bert_fr_3_0.md b/docs/_posts/bunyamin-polat/2022-06-23-ner_living_species_bert_fr_3_0.md index 1b05dfe7bb615..e35ee44a0ad69 100644 --- a/docs/_posts/bunyamin-polat/2022-06-23-ner_living_species_bert_fr_3_0.md +++ b/docs/_posts/bunyamin-polat/2022-06-23-ner_living_species_bert_fr_3_0.md @@ -76,11 +76,9 @@ ner_model, ner_converter ]) -model = pipeline.fit(spark.createDataFrame([[""]]).toDF("text")) - data = spark.createDataFrame([["""Femme de 47 ans allergique à l'iode, fumeuse sociale, opérée pour des varices, deux césariennes et un abcès fessier. Vit avec son mari et ses trois enfants, travaille comme enseignante. 
Initialement, le patient a eu une bonne évolution, mais au 2ème jour postopératoire, il a commencé à montrer une instabilité hémodynamique. Les sérologies pour Coxiella burnetii, Bartonella henselae, Borrelia burgdorferi, Entamoeba histolytica, Toxoplasma gondii, herpès simplex virus 1 et 2, cytomégalovirus, virus d'Epstein Barr, virus de la varicelle et du zona et parvovirus B19 étaient négatives. Cependant, un test au rose Bengale positif pour Brucella, le test de Coombs et les agglutinations étaient également positifs avec un titre de 1/40."""]]).toDF("text") -result = model.transform(data) +result = pipeline.fit(data).transform(data) ``` ```scala val document_assembler = new DocumentAssembler() @@ -88,11 +86,11 @@ val document_assembler = new DocumentAssembler() .setOutputCol("document") val sentence_detector = SentenceDetectorDLModel.pretrained("sentence_detector_dl", "xx") -.setInputCols("document") +.setInputCols(Array("document")) .setOutputCol("sentence") val tokenizer = new Tokenizer() -.setInputCols("sentence") +.setInputCols(Array("sentence")) .setOutputCol("token") val embeddings = BertEmbeddings.pretrained("bert_embeddings_bert_base_fr_cased", "fr") @@ -107,7 +105,7 @@ val ner_converter = new NerConverter() .setInputCols(Array("sentence", "token", "ner")) .setOutputCol("ner_chunk") -val pipeline = new PipelineModel().setStages(Array(document_assembler, +val pipeline = new Pipeline().setStages(Array(document_assembler, sentence_detector, tokenizer, embeddings, diff --git a/docs/_posts/bunyamin-polat/2022-06-23-ner_living_species_bert_it_3_0.md b/docs/_posts/bunyamin-polat/2022-06-23-ner_living_species_bert_it_3_0.md index 8c85d3abafd81..a23f3ba30bfe4 100644 --- a/docs/_posts/bunyamin-polat/2022-06-23-ner_living_species_bert_it_3_0.md +++ b/docs/_posts/bunyamin-polat/2022-06-23-ner_living_species_bert_it_3_0.md @@ -76,11 +76,9 @@ ner_model, ner_converter ]) -model = pipeline.fit(spark.createDataFrame([[""]]).toDF("text")) - data = 
spark.createDataFrame([["""Una donna di 74 anni è stata ricoverata con dolore addominale diffuso, ipossia e astenia di 2 settimane di evoluzione. La sua storia personale includeva ipertensione in trattamento con amiloride/idroclorotiazide e dislipidemia controllata con lovastatina. La sua storia familiare era: madre morta di cancro gastrico, fratello con cirrosi epatica di eziologia sconosciuta e sorella con carcinoma epatocellulare. Lo studio eziologico delle diverse cause di malattia epatica cronica comprendeva: virus epatotropi (HBV, HCV) e HIV, studio dell'autoimmunità, ceruloplasmina, ferritina e porfirine nelle urine, tutti risultati negativi. Il paziente è stato messo in trattamento anticoagulante con acenocumarolo e diuretici a tempo indeterminato."""]]).toDF("text") -result = model.transform(data) +result = pipeline.fit(data).transform(data) ``` ```scala val document_assembler = new DocumentAssembler() @@ -88,11 +86,11 @@ val document_assembler = new DocumentAssembler() .setOutputCol("document") val sentence_detector = SentenceDetectorDLModel.pretrained("sentence_detector_dl", "xx") -.setInputCols("document") +.setInputCols(Array("document")) .setOutputCol("sentence") val tokenizer = new Tokenizer() -.setInputCols("sentence") +.setInputCols(Array("sentence")) .setOutputCol("token") val embeddings = BertEmbeddings.pretrained("bert_embeddings_bert_base_italian_xxl_cased", "it") @@ -107,7 +105,7 @@ val ner_converter = new NerConverter() .setInputCols(Array("sentence", "token", "ner")) .setOutputCol("ner_chunk") -val pipeline = new PipelineModel().setStages(Array(document_assembler, +val pipeline = new Pipeline().setStages(Array(document_assembler, sentence_detector, tokenizer, embeddings, diff --git a/docs/_posts/bunyamin-polat/2022-06-23-ner_living_species_bert_ro_3_0.md b/docs/_posts/bunyamin-polat/2022-06-23-ner_living_species_bert_ro_3_0.md index a7b86146a2c29..ad92d68b1316c 100644 --- 
a/docs/_posts/bunyamin-polat/2022-06-23-ner_living_species_bert_ro_3_0.md +++ b/docs/_posts/bunyamin-polat/2022-06-23-ner_living_species_bert_ro_3_0.md @@ -76,11 +76,9 @@ ner_model, ner_converter ]) -model = pipeline.fit(spark.createDataFrame([[""]]).toDF("text")) - data = spark.createDataFrame([["""O femeie în vârstă de 26 de ani, însărcinată în 11 săptămâni, a consultat serviciul de urgențe dermatologice pentru că prezenta, de 4 zile, leziuni punctiforme dureroase de debut brusc pe vârful degetelor. Pacientul raportează că leziunile au început pe degete și ulterior s-au extins la degetele de la picioare. Markerii de imunitate, ANA și crioagglutininele, au fost negativi, iar serologia VHB a indicat doar vaccinarea. Pe baza acestor rezultate, diagnosticul de vasculită a fost exclus și, având în vedere diagnosticul suspectat de erupție cutanată cu mănuși și șosete, s-a efectuat serologia pentru virusul Ebstein Barr. Exantemă la mănuși și șosete datorat parvovirozei B19. Având în vedere suspiciunea unei afecțiuni infecțioase cu aceste caracteristici, a fost solicitată serologia pentru EBV, enterovirus și parvovirus B19, cu IgM pozitiv pentru acesta din urmă în două ocazii. 
De asemenea, nu au existat semne de anemie fetală sau complicații ale acesteia."""]]).toDF("text") -result = model.transform(data) +result = pipeline.fit(data).transform(data) ``` ```scala val document_assembler = new DocumentAssembler() @@ -88,11 +86,11 @@ val document_assembler = new DocumentAssembler() .setOutputCol("document") val sentence_detector = SentenceDetectorDLModel.pretrained("sentence_detector_dl", "xx") -.setInputCols("document") +.setInputCols(Array("document")) .setOutputCol("sentence") val tokenizer = new Tokenizer() -.setInputCols("sentence") +.setInputCols(Array("sentence")) .setOutputCol("token") val embeddings = BertEmbeddings.pretrained("bert_base_cased", "ro") @@ -107,7 +105,7 @@ val ner_converter = new NerConverter() .setInputCols(Array("sentence", "token", "ner")) .setOutputCol("ner_chunk") -val pipeline = new PipelineModel().setStages(Array(document_assembler, +val pipeline = new Pipeline().setStages(Array(document_assembler, sentence_detector, tokenizer, embeddings, diff --git a/docs/_posts/bunyamin-polat/2022-06-23-ner_living_species_ca_3_0.md b/docs/_posts/bunyamin-polat/2022-06-23-ner_living_species_ca_3_0.md index 3a329a66c9c86..829e348847b19 100644 --- a/docs/_posts/bunyamin-polat/2022-06-23-ner_living_species_ca_3_0.md +++ b/docs/_posts/bunyamin-polat/2022-06-23-ner_living_species_ca_3_0.md @@ -76,11 +76,9 @@ ner_model, ner_converter ]) -model = pipeline.fit(spark.createDataFrame([[""]]).toDF("text")) - data = spark.createDataFrame([["""Dona de 47 anys al·lèrgica al iode, fumadora social, intervinguda de varices, dues cesàries i un abscés gluti. Sense altres antecedents mèdics d'interès ni tractament habitual. Viu amb el seu marit i tres fills, treballa com a professora. En el moment de la nostra valoració en la planta de Cirurgia General, la pacient presenta TA 69/40 mm Hg, freqüència cardíaca 120 lpm, taquipnea en repòs, pal·lidesa mucocutánea, mala perfusió distal i afligeix nàusees. 
L'abdomen és tou, no presenta peritonismo i el dèbit del drenatge abdominal roman sense canvis. Les serologies de Coxiella burnetii, Bartonella henselae, Borrelia burgdorferi, Entamoeba histolytica, Toxoplasma gondii, citomegalovirus, virus de Epstein Barr, virus varicel·la zoster i parvovirus B19 van ser negatives. No obstant això, es va detectar test de rosa de Bengala positiu per a Brucella, el test de Coombs i les aglutinacions també van ser positives amb un títol 1/40."""]]).toDF("text") -result = model.transform(data) +result = pipeline.fit(data).transform(data) ``` ```scala val document_assembler = new DocumentAssembler() @@ -88,11 +86,11 @@ val document_assembler = new DocumentAssembler() .setOutputCol("document") val sentence_detector = SentenceDetectorDLModel.pretrained("sentence_detector_dl", "xx") -.setInputCols("document") +.setInputCols(Array("document")) .setOutputCol("sentence") val tokenizer = new Tokenizer() -.setInputCols("sentence") +.setInputCols(Array("sentence")) .setOutputCol("token") val embeddings = WordEmbeddingsModel.pretrained("w2v_cc_300d", "ca") @@ -107,7 +105,7 @@ val ner_converter = new NerConverter() .setInputCols(Array("sentence", "token", "ner")) .setOutputCol("ner_chunk") -val pipeline = new PipelineModel().setStages(Array(document_assembler, +val pipeline = new Pipeline().setStages(Array(document_assembler, sentence_detector, tokenizer, embeddings, diff --git a/docs/_posts/bunyamin-polat/2022-06-23-ner_living_species_fr_3_0.md b/docs/_posts/bunyamin-polat/2022-06-23-ner_living_species_fr_3_0.md index 6698593253ffa..e573a0445dffa 100644 --- a/docs/_posts/bunyamin-polat/2022-06-23-ner_living_species_fr_3_0.md +++ b/docs/_posts/bunyamin-polat/2022-06-23-ner_living_species_fr_3_0.md @@ -76,11 +76,9 @@ ner_model, ner_converter ]) -model = pipeline.fit(spark.createDataFrame([[""]]).toDF("text")) - data = spark.createDataFrame([["""Femme de 47 ans allergique à l'iode, fumeuse sociale, opérée pour des varices, deux césariennes et un 
abcès fessier. Vit avec son mari et ses trois enfants, travaille comme enseignante. Initialement, le patient a eu une bonne évolution, mais au 2ème jour postopératoire, il a commencé à montrer une instabilité hémodynamique. Les sérologies pour Coxiella burnetii, Bartonella henselae, Borrelia burgdorferi, Entamoeba histolytica, Toxoplasma gondii, herpès simplex virus 1 et 2, cytomégalovirus, virus d'Epstein Barr, virus de la varicelle et du zona et parvovirus B19 étaient négatives. Cependant, un test au rose Bengale positif pour Brucella, le test de Coombs et les agglutinations étaient également positifs avec un titre de 1/40."""]]).toDF("text") -result = model.transform(data) +result = pipeline.fit(data).transform(data) ``` ```scala val document_assembler = new DocumentAssembler() @@ -88,11 +86,11 @@ val document_assembler = new DocumentAssembler() .setOutputCol("document") val sentence_detector = SentenceDetectorDLModel.pretrained("sentence_detector_dl", "xx") -.setInputCols("document") +.setInputCols(Array("document")) .setOutputCol("sentence") val tokenizer = new Tokenizer() -.setInputCols("sentence") +.setInputCols(Array("sentence")) .setOutputCol("token") val embeddings = WordEmbeddingsModel.pretrained("w2v_cc_300d", "fr") @@ -107,7 +105,7 @@ val ner_converter = new NerConverter() .setInputCols(Array("sentence", "token", "ner")) .setOutputCol("ner_chunk") -val pipeline = new PipelineModel().setStages(Array(document_assembler, +val pipeline = new Pipeline().setStages(Array(document_assembler, sentence_detector, tokenizer, embeddings, diff --git a/docs/_posts/bunyamin-polat/2022-06-23-ner_living_species_gl_3_0.md b/docs/_posts/bunyamin-polat/2022-06-23-ner_living_species_gl_3_0.md index 8529fa7fac0e3..f9cc9483bb1d5 100644 --- a/docs/_posts/bunyamin-polat/2022-06-23-ner_living_species_gl_3_0.md +++ b/docs/_posts/bunyamin-polat/2022-06-23-ner_living_species_gl_3_0.md @@ -76,11 +76,9 @@ ner_model, ner_converter ]) -model = 
pipeline.fit(spark.createDataFrame([[""]]).toDF("text")) - data = spark.createDataFrame([["""Muller de 45 anos, sen antecedentes médicos de interese, que foi remitida á consulta de dermatoloxía de urxencias por lesións faciales de tres semanas de evolución. A paciente non presentaba lesións noutras localizaciones nin outra clínica de interese. No seu centro de saúde prescribíronlle corticoides tópicos ante a sospeita de picaduras de artrópodos e unha semana despois, antivirales orais baixo o diagnóstico de posible infección herpética. As lesións interferían de forma notable na súa vida persoal e profesional xa que traballaba de face ao púbico. Unha semana máis tarde o diagnóstico foi confirmado ao resultar o cultivo positivo a Staphylococcus aureus."""]]).toDF("text") -result = model.transform(data) +result = pipeline.fit(data).transform(data) ``` ```scala @@ -89,11 +87,11 @@ val document_assembler = new DocumentAssembler() .setOutputCol("document") val sentence_detector = SentenceDetectorDLModel.pretrained("sentence_detector_dl", "xx") -.setInputCols("document") +.setInputCols(Array("document")) .setOutputCol("sentence") val tokenizer = new Tokenizer() -.setInputCols("sentence") +.setInputCols(Array("sentence")) .setOutputCol("token") val embeddings = WordEmbeddingsModel.pretrained("w2v_cc_300d","gl") @@ -108,7 +106,7 @@ val ner_converter = new NerConverter() .setInputCols(Array("sentence", "token", "ner")) .setOutputCol("ner_chunk") -val pipeline = new PipelineModel().setStages(Array(document_assembler, +val pipeline = new Pipeline().setStages(Array(document_assembler, sentence_detector, tokenizer, embeddings, @@ -117,7 +115,7 @@ ner_converter)) val data = Seq("""Muller de 45 anos, sen antecedentes médicos de interese, que foi remitida á consulta de dermatoloxía de urxencias por lesións faciales de tres semanas de evolución. A paciente non presentaba lesións noutras localizaciones nin outra clínica de interese. 
No seu centro de saúde prescribíronlle corticoides tópicos ante a sospeita de picaduras de artrópodos e unha semana despois, antivirales orais baixo o diagnóstico de posible infección herpética. As lesións interferían de forma notable na súa vida persoal e profesional xa que traballaba de face ao púbico. Unha semana máis tarde o diagnóstico foi confirmado ao resultar o cultivo positivo a Staphylococcus aureus.""").toDS.toDF("text") -val result = pipeline .fit(data).transform(data) +val result = pipeline.fit(data).transform(data) ``` diff --git a/docs/_posts/bunyamin-polat/2022-06-23-ner_living_species_it_3_0.md b/docs/_posts/bunyamin-polat/2022-06-23-ner_living_species_it_3_0.md index b1d2c45afc178..0dc6e0ee48ab3 100644 --- a/docs/_posts/bunyamin-polat/2022-06-23-ner_living_species_it_3_0.md +++ b/docs/_posts/bunyamin-polat/2022-06-23-ner_living_species_it_3_0.md @@ -75,11 +75,9 @@ ner_model, ner_converter ]) -model = pipeline.fit(spark.createDataFrame([[""]]).toDF("text")) - data = spark.createDataFrame([["""Una donna di 74 anni è stata ricoverata con dolore addominale diffuso, ipossia e astenia di 2 settimane di evoluzione. La sua storia personale includeva ipertensione in trattamento con amiloride/idroclorotiazide e dislipidemia controllata con lovastatina. La sua storia familiare era: madre morta di cancro gastrico, fratello con cirrosi epatica di eziologia sconosciuta e sorella con carcinoma epatocellulare. Lo studio eziologico delle diverse cause di malattia epatica cronica comprendeva: virus epatotropi (HBV, HCV) e HIV, studio dell'autoimmunità, ceruloplasmina, ferritina e porfirine nelle urine, tutti risultati negativi. 
Il paziente è stato messo in trattamento anticoagulante con acenocumarolo e diuretici a tempo indeterminato."""]]).toDF("text") -result = model.transform(data) +result = pipeline.fit(data).transform(data) ``` ```scala val document_assembler = new DocumentAssembler() @@ -87,11 +85,11 @@ val document_assembler = new DocumentAssembler() .setOutputCol("document") val sentence_detector = SentenceDetectorDLModel.pretrained("sentence_detector_dl", "xx") -.setInputCols("document") +.setInputCols(Array("document")) .setOutputCol("sentence") val tokenizer = new Tokenizer() -.setInputCols("sentence") +.setInputCols(Array("sentence")) .setOutputCol("token") val embeddings = WordEmbeddingsModel.pretrained("w2v_cc_300d", "it") @@ -106,7 +104,7 @@ val ner_converter = new NerConverter() .setInputCols(Array("sentence", "token", "ner")) .setOutputCol("ner_chunk") -val pipeline = new PipelineModel().setStages(Array(document_assembler, +val pipeline = new Pipeline().setStages(Array(document_assembler, sentence_detector, tokenizer, embeddings, diff --git a/docs/_posts/bunyamin-polat/2022-06-26-bert_token_classifier_ner_living_species_en_3_0.md b/docs/_posts/bunyamin-polat/2022-06-26-bert_token_classifier_ner_living_species_en_3_0.md index 0fd9d9a28cdd7..a7a6829c76073 100644 --- a/docs/_posts/bunyamin-polat/2022-06-26-bert_token_classifier_ner_living_species_en_3_0.md +++ b/docs/_posts/bunyamin-polat/2022-06-26-bert_token_classifier_ner_living_species_en_3_0.md @@ -73,11 +73,9 @@ ner_model, ner_converter ]) -model = pipeline.fit(spark.createDataFrame([[""]]).toDF("text")) - data = spark.createDataFrame([["""42-year-old woman with end-stage chronic kidney disease, secondary to lupus nephropathy, and on peritoneal dialysis. History of four episodes of bacterial peritonitis and change of Tenckhoff catheter six months prior to admission due to catheter dysfunction. Three peritoneal fluid samples during her hospitalisation tested positive for Fusarium spp. 
The patient responded favourably and continued outpatient treatment with voriconazole (4mg/kg every 12 hours orally). All three isolates were identified as species of the Fusarium solani complex. In vitro susceptibility to itraconazole, voriconazole and posaconazole, according to Clinical and Laboratory Standards Institute - CLSI (M38-A) methodology, showed a minimum inhibitory concentration (MIC) in all three isolates and for all three antifungals of >16 μg/mL."""]]).toDF("text") -result = model.transform(data) +result = pipeline.fit(data).transform(data) ``` ```scala val document_assembler = new DocumentAssembler() @@ -85,11 +83,11 @@ val document_assembler = new DocumentAssembler() .setOutputCol("document") val sentence_detector = SentenceDetectorDLModel.pretrained("sentence_detector_dl_healthcare", "en", "clinical/models") -.setInputCols("document") +.setInputCols(Array("document")) .setOutputCol("sentence") val tokenizer = new Tokenizer() -.setInputCols("sentence") +.setInputCols(Array("sentence")) .setOutputCol("token") val ner_model = MedicalBertForTokenClassifier.pretrained("bert_token_classifier_ner_living_species", "en", "clinical/models") @@ -102,7 +100,7 @@ val ner_converter = new NerConverter() .setInputCols(Array("sentence", "token", "ner")) .setOutputCol("ner_chunk") -val pipeline = new PipelineModel().setStages(Array( +val pipeline = new Pipeline().setStages(Array( document_assembler, sentence_detector, tokenizer, diff --git a/docs/_posts/bunyamin-polat/2022-06-27-bert_token_classifier_ner_living_species_es_3_0.md b/docs/_posts/bunyamin-polat/2022-06-27-bert_token_classifier_ner_living_species_es_3_0.md index 9b6da443c8121..fb92bb71d4c4f 100644 --- a/docs/_posts/bunyamin-polat/2022-06-27-bert_token_classifier_ner_living_species_es_3_0.md +++ b/docs/_posts/bunyamin-polat/2022-06-27-bert_token_classifier_ner_living_species_es_3_0.md @@ -68,11 +68,9 @@ pipeline = Pipeline(stages=[ ner_converter ]) -model = 
pipeline.fit(spark.createDataFrame([[""]]).toDF("text")) - data = spark.createDataFrame([["""Lactante varón de dos años. Antecedentes familiares sin interés. Antecedentes personales: Embarazo, parto y periodo neonatal normal. En seguimiento por alergia a legumbres, diagnosticado con diez meses por reacción urticarial generalizada con lentejas y garbanzos, con dieta de exclusión a legumbres desde entonces. En ésta visita la madre describe episodios de eritema en zona maxilar derecha con afectación ocular ipsilateral que se resuelve en horas tras la administración de corticoides. Le ha ocurrido en 5-6 ocasiones, en relación con la ingesta de alimentos previamente tolerados. Exploración complementaria: Cacahuete, ac(ige)19.2 Ku.arb/l. Resultados: Ante la sospecha clínica de Síndrome de Frey, se tranquiliza a los padres, explicándoles la naturaleza del cuadro y se cita para revisión anual."""]]).toDF("text") -result = model.transform(data) +result = pipeline.fit(data).transform(data) ``` ```scala val document_assembler = new DocumentAssembler() @@ -80,11 +78,11 @@ val document_assembler = new DocumentAssembler() .setOutputCol("document") val sentence_detector = SentenceDetectorDLModel.pretrained("sentence_detector_dl", "xx") - .setInputCols("document") + .setInputCols(Array("document")) .setOutputCol("sentence") val tokenizer = new Tokenizer() - .setInputCols("sentence") + .setInputCols(Array("sentence")) .setOutputCol("token") val ner_model = MedicalBertForTokenClassifier.pretrained("bert_token_classifier_ner_living_species", "es", "clinical/models") @@ -97,7 +95,7 @@ val ner_converter = new NerConverter() .setInputCols(Array("sentence", "token", "ner")) .setOutputCol("ner_chunk") -val pipeline = new PipelineModel().setStages(Array(document_assembler, +val pipeline = new Pipeline().setStages(Array(document_assembler, sentence_detector, tokenizer, ner_model, diff --git a/docs/_posts/bunyamin-polat/2022-06-27-bert_token_classifier_ner_living_species_it_3_0.md 
b/docs/_posts/bunyamin-polat/2022-06-27-bert_token_classifier_ner_living_species_it_3_0.md index 316da0b502ed2..6b0f8443ce233 100644 --- a/docs/_posts/bunyamin-polat/2022-06-27-bert_token_classifier_ner_living_species_it_3_0.md +++ b/docs/_posts/bunyamin-polat/2022-06-27-bert_token_classifier_ner_living_species_it_3_0.md @@ -73,11 +73,9 @@ pipeline = Pipeline(stages=[ ner_converter ]) -model = pipeline.fit(spark.createDataFrame([[""]]).toDF("text")) - data = spark.createDataFrame([["""Una donna di 74 anni è stata ricoverata con dolore addominale diffuso, ipossia e astenia di 2 settimane di evoluzione. La sua storia personale includeva ipertensione in trattamento con amiloride/idroclorotiazide e dislipidemia controllata con lovastatina. La sua storia familiare era: madre morta di cancro gastrico, fratello con cirrosi epatica di eziologia sconosciuta e sorella con carcinoma epatocellulare. Lo studio eziologico delle diverse cause di malattia epatica cronica comprendeva: virus epatotropi (HBV, HCV) e HIV, studio dell'autoimmunità, ceruloplasmina, ferritina e porfirine nelle urine, tutti risultati negativi. 
Il paziente è stato messo in trattamento anticoagulante con acenocumarolo e diuretici a tempo indeterminato."""]]).toDF("text") -result = model.transform(data) +result = pipeline.fit(data).transform(data) ``` ```scala val document_assembler = new DocumentAssembler() @@ -85,11 +83,11 @@ val document_assembler = new DocumentAssembler() .setOutputCol("document") val sentence_detector = SentenceDetectorDLModel.pretrained("sentence_detector_dl", "xx") - .setInputCols("document") + .setInputCols(Array("document")) .setOutputCol("sentence") val tokenizer = new Tokenizer() - .setInputCols("sentence") + .setInputCols(Array("sentence")) .setOutputCol("token") val ner_model = MedicalBertForTokenClassifier.pretrained("bert_token_classifier_ner_living_species", "it", "clinical/models") @@ -102,7 +100,7 @@ val ner_converter = new NerConverter() .setInputCols(Array("sentence", "token", "ner")) .setOutputCol("ner_chunk") -val pipeline = new PipelineModel().setStages(Array(document_assembler, +val pipeline = new Pipeline().setStages(Array(document_assembler, sentence_detector, tokenizer, ner_model, diff --git a/docs/_posts/bunyamin-polat/2022-06-27-bert_token_classifier_ner_living_species_pt_3_0.md b/docs/_posts/bunyamin-polat/2022-06-27-bert_token_classifier_ner_living_species_pt_3_0.md index 5fda1df7e0f96..87c15965465e3 100644 --- a/docs/_posts/bunyamin-polat/2022-06-27-bert_token_classifier_ner_living_species_pt_3_0.md +++ b/docs/_posts/bunyamin-polat/2022-06-27-bert_token_classifier_ner_living_species_pt_3_0.md @@ -73,11 +73,9 @@ ner_model, ner_converter ]) -model = pipeline.fit(spark.createDataFrame([[""]]).toDF("text")) - data = spark.createDataFrame([["""Uma rapariga de 16 anos com um historial pessoal de asma apresentou ao departamento de dermatologia com lesões cutâneas assintomáticas que tinham estado presentes durante 2 meses. 
A paciente tinha sido tratada com creme corticosteróide devido a uma suspeita inicial de eczema atópico, apesar do qual apresentava um crescimento progressivo marcado das lesões. Tinha um gato doméstico que ela nunca tinha levado ao veterinário. O exame físico revelou placas em forma de anel com uma borda periférica activa na parte superior das costas e nos aspectos laterais do pescoço e da face. Cultura local obtida por raspagem de tapete isolado Trichophyton rubrum. Com base em dados clínicos e cultura, foi estabelecido o diagnóstico de tinea incognito."""]]).toDF("text") -result = model.transform(data) +result = pipeline.fit(data).transform(data) ``` ```scala val document_assembler = new DocumentAssembler() @@ -85,11 +83,11 @@ val document_assembler = new DocumentAssembler() .setOutputCol("document") val sentence_detector = SentenceDetectorDLModel.pretrained("sentence_detector_dl", "xx") -.setInputCols("document") +.setInputCols(Array("document")) .setOutputCol("sentence") val tokenizer = new Tokenizer() -.setInputCols("sentence") +.setInputCols(Array("sentence")) .setOutputCol("token") val ner_model = MedicalBertForTokenClassifier.pretrained("bert_token_classifier_ner_living_species", "pt", "clinical/models") @@ -102,7 +100,7 @@ val ner_converter = new NerConverter() .setInputCols(Array("sentence", "token", "ner")) .setOutputCol("ner_chunk") -val pipeline = new PipelineModel().setStages(Array(document_assembler, +val pipeline = new Pipeline().setStages(Array(document_assembler, sentence_detector, tokenizer, ner_model, diff --git a/docs/_posts/bunyamin-polat/2022-07-25-bert_token_classifier_ner_anatem_en_3_0.md b/docs/_posts/bunyamin-polat/2022-07-25-bert_token_classifier_ner_anatem_en_3_0.md index dc6dcde07411e..420d8738e7ff5 100644 --- a/docs/_posts/bunyamin-polat/2022-07-25-bert_token_classifier_ner_anatem_en_3_0.md +++ b/docs/_posts/bunyamin-polat/2022-07-25-bert_token_classifier_ner_anatem_en_3_0.md @@ -68,11 +68,9 @@ pipeline = Pipeline(stages=[ 
ner_converter ]) -model = pipeline.fit(spark.createDataFrame([[""]]).toDF("text")) - data = spark.createDataFrame([["""Malignant cells often display defects in autophagy, an evolutionarily conserved pathway for degrading long-lived proteins and cytoplasmic organelles. However, as yet, there is no genetic evidence for a role of autophagy genes in tumor suppression. The beclin 1 autophagy gene is monoallelically deleted in 40 - 75 % of cases of human sporadic breast, ovarian, and prostate cancer."""]]).toDF("text") -result = model.transform(data) +result = pipeline.fit(data).transform(data) ``` ```scala val document_assembler = new DocumentAssembler() @@ -80,11 +78,11 @@ val document_assembler = new DocumentAssembler() .setOutputCol("document") val sentence_detector = SentenceDetectorDLModel.pretrained("sentence_detector_dl_healthcare", "en", "clinical/models") - .setInputCols("document") + .setInputCols(Array("document")) .setOutputCol("sentence") val tokenizer = new Tokenizer() - .setInputCols("sentence") + .setInputCols(Array("sentence")) .setOutputCol("token") val ner_model = MedicalBertForTokenClassifier.pretrained("bert_token_classifier_ner_anatem", "en", "clinical/models") @@ -97,7 +95,7 @@ val ner_converter = new NerConverter() .setInputCols(Array("sentence", "token", "ner")) .setOutputCol("ner_chunk") -val pipeline = new PipelineModel().setStages(Array(document_assembler, +val pipeline = new Pipeline().setStages(Array(document_assembler, sentence_detector, tokenizer, ner_model, @@ -105,7 +103,7 @@ val pipeline = new PipelineModel().setStages(Array(document_assembler, val data = Seq("""Malignant cells often display defects in autophagy, an evolutionarily conserved pathway for degrading long-lived proteins and cytoplasmic organelles. However, as yet, there is no genetic evidence for a role of autophagy genes in tumor suppression. 
The beclin 1 autophagy gene is monoallelically deleted in 40 - 75 % of cases of human sporadic breast, ovarian, and prostate cancer.""").toDS.toDF("text") -val result = model.fit(data).transform(data) +val result = pipeline.fit(data).transform(data) ``` diff --git a/docs/_posts/bunyamin-polat/2022-07-25-bert_token_classifier_ner_bc2gm_gene_en_3_0.md b/docs/_posts/bunyamin-polat/2022-07-25-bert_token_classifier_ner_bc2gm_gene_en_3_0.md index 648ad5966a458..17e35e57cb939 100644 --- a/docs/_posts/bunyamin-polat/2022-07-25-bert_token_classifier_ner_bc2gm_gene_en_3_0.md +++ b/docs/_posts/bunyamin-polat/2022-07-25-bert_token_classifier_ner_bc2gm_gene_en_3_0.md @@ -68,11 +68,9 @@ pipeline = Pipeline(stages=[ ner_converter ]) -model = pipeline.fit(spark.createDataFrame([[""]]).toDF("text")) - data = spark.createDataFrame([["""ROCK-I, Kinectin, and mDia2 can bind the wild type forms of both RhoA and Cdc42 in a GTP-dependent manner in vitro. These results support the hypothesis that in the presence of tryptophan the ribosome translating tnaC blocks Rho ' s access to the boxA and rut sites, thereby preventing transcription termination."""]]).toDF("text") -result = model.transform(data) +result = pipeline.fit(data).transform(data) ``` ```scala val document_assembler = new DocumentAssembler() @@ -80,11 +78,11 @@ val document_assembler = new DocumentAssembler() .setOutputCol("document") val sentence_detector = SentenceDetectorDLModel.pretrained("sentence_detector_dl_healthcare", "en", "clinical/models") - .setInputCols("document") + .setInputCols(Array("document")) .setOutputCol("sentence") val tokenizer = new Tokenizer() - .setInputCols("sentence") + .setInputCols(Array("sentence")) .setOutputCol("token") val ner_model = MedicalBertForTokenClassifier.pretrained("bert_token_classifier_ner_bc2gm_gene", "en", "clinical/models") @@ -97,7 +95,7 @@ val ner_converter = new NerConverter() .setInputCols(Array("sentence", "token", "ner")) .setOutputCol("ner_chunk") -val pipeline = new 
PipelineModel().setStages(Array(document_assembler, +val pipeline = new Pipeline().setStages(Array(document_assembler, sentence_detector, tokenizer, ner_model, @@ -105,7 +103,7 @@ val pipeline = new PipelineModel().setStages(Array(document_assembler, val data = Seq("""ROCK-I, Kinectin, and mDia2 can bind the wild type forms of both RhoA and Cdc42 in a GTP-dependent manner in vitro. These results support the hypothesis that in the presence of tryptophan the ribosome translating tnaC blocks Rho ' s access to the boxA and rut sites, thereby preventing transcription termination.""").toDS.toDF("text") -val result = model.fit(data).transform(data) +val result = pipeline.fit(data).transform(data) ``` diff --git a/docs/_posts/bunyamin-polat/2022-07-25-bert_token_classifier_ner_bc4chemd_chemicals_en_3_0.md b/docs/_posts/bunyamin-polat/2022-07-25-bert_token_classifier_ner_bc4chemd_chemicals_en_3_0.md index f1baf3b5fdd65..b69406e3fc18b 100644 --- a/docs/_posts/bunyamin-polat/2022-07-25-bert_token_classifier_ner_bc4chemd_chemicals_en_3_0.md +++ b/docs/_posts/bunyamin-polat/2022-07-25-bert_token_classifier_ner_bc4chemd_chemicals_en_3_0.md @@ -68,11 +68,9 @@ pipeline = Pipeline(stages=[ ner_converter ]) -model = pipeline.fit(spark.createDataFrame([[""]]).toDF("text")) - data = spark.createDataFrame([["""The main isolated compounds were triterpenes (alpha - amyrin, beta - amyrin, lupeol, betulin, betulinic acid, uvaol, erythrodiol and oleanolic acid) and phenolic acid derivatives from 4 - hydroxybenzoic acid (gallic and protocatechuic acids and isocorilagin)."""]]).toDF("text") -result = model.transform(data) +result = pipeline.fit(data).transform(data) ``` ```scala val document_assembler = new DocumentAssembler() @@ -80,11 +78,11 @@ val document_assembler = new DocumentAssembler() .setOutputCol("document") val sentence_detector = SentenceDetectorDLModel.pretrained("sentence_detector_dl_healthcare", "en", "clinical/models") - .setInputCols("document") + 
.setInputCols(Array("document")) .setOutputCol("sentence") val tokenizer = new Tokenizer() - .setInputCols("sentence") + .setInputCols(Array("sentence")) .setOutputCol("token") val ner_model = MedicalBertForTokenClassifier.pretrained("bert_token_classifier_ner_bc4chemd_chemicals", "en", "clinical/models") @@ -97,7 +95,7 @@ val ner_converter = new NerConverter() .setInputCols(Array("sentence", "token", "ner")) .setOutputCol("ner_chunk") -val pipeline = new PipelineModel().setStages(Array(document_assembler, +val pipeline = new Pipeline().setStages(Array(document_assembler, sentence_detector, tokenizer, ner_model, @@ -105,7 +103,7 @@ val pipeline = new PipelineModel().setStages(Array(document_assembler, val data = Seq("""The main isolated compounds were triterpenes (alpha - amyrin, beta - amyrin, lupeol, betulin, betulinic acid, uvaol, erythrodiol and oleanolic acid) and phenolic acid derivatives from 4 - hydroxybenzoic acid (gallic and protocatechuic acids and isocorilagin).""").toDS.toDF("text") -val result = model.fit(data).transform(data) +val result = pipeline.fit(data).transform(data) ``` diff --git a/docs/_posts/bunyamin-polat/2022-07-25-bert_token_classifier_ner_bc5cdr_chemicals_en_3_0.md b/docs/_posts/bunyamin-polat/2022-07-25-bert_token_classifier_ner_bc5cdr_chemicals_en_3_0.md index b3428c04f9dd8..fd0273034f636 100644 --- a/docs/_posts/bunyamin-polat/2022-07-25-bert_token_classifier_ner_bc5cdr_chemicals_en_3_0.md +++ b/docs/_posts/bunyamin-polat/2022-07-25-bert_token_classifier_ner_bc5cdr_chemicals_en_3_0.md @@ -68,11 +68,9 @@ pipeline = Pipeline(stages=[ ner_converter ]) -model = pipeline.fit(spark.createDataFrame([[""]]).toDF("text")) - data = spark.createDataFrame([["""The possibilities that these cardiovascular findings might be the result of non-selective inhibition of monoamine oxidase or of amphetamine and metamphetamine are discussed. 
The results have shown that the degradation product p-choloroaniline is not a significant factor in chlorhexidine-digluconate associated erosive cystitis. A high percentage of kanamycin - colistin and povidone-iodine irrigations were associated with erosive cystitis and suggested a possible complication with human usage."""]]).toDF("text") -result = model.transform(data) +result = pipeline.fit(data).transform(data) ``` ```scala val document_assembler = new DocumentAssembler() @@ -80,11 +78,11 @@ val document_assembler = new DocumentAssembler() .setOutputCol("document") val sentence_detector = SentenceDetectorDLModel.pretrained("sentence_detector_dl_healthcare", "en", "clinical/models") - .setInputCols("document") + .setInputCols(Array("document")) .setOutputCol("sentence") val tokenizer = new Tokenizer() - .setInputCols("sentence") + .setInputCols(Array("sentence")) .setOutputCol("token") val ner_model = MedicalBertForTokenClassifier.pretrained("bert_token_classifier_ner_bc5cdr_chemicals", "en", "clinical/models") @@ -97,7 +95,7 @@ val ner_converter = new NerConverter() .setInputCols(Array("sentence", "token", "ner")) .setOutputCol("ner_chunk") -val pipeline = new PipelineModel().setStages(Array(document_assembler, +val pipeline = new Pipeline().setStages(Array(document_assembler, sentence_detector, tokenizer, ner_model, @@ -105,7 +103,7 @@ val pipeline = new PipelineModel().setStages(Array(document_assembler, val data = Seq("""The possibilities that these cardiovascular findings might be the result of non-selective inhibition of monoamine oxidase or of amphetamine and metamphetamine are discussed. The results have shown that the degradation product p-choloroaniline is not a significant factor in chlorhexidine-digluconate associated erosive cystitis. 
A high percentage of kanamycin - colistin and povidone-iodine irrigations were associated with erosive cystitis and suggested a possible complication with human usage.""").toDS.toDF("text") -val result = model.fit(data).transform(data) +val result = pipeline.fit(data).transform(data) ``` diff --git a/docs/_posts/bunyamin-polat/2022-07-25-bert_token_classifier_ner_bc5cdr_disease_en_3_0.md b/docs/_posts/bunyamin-polat/2022-07-25-bert_token_classifier_ner_bc5cdr_disease_en_3_0.md index 812fefed45e31..7f0643c76b82c 100644 --- a/docs/_posts/bunyamin-polat/2022-07-25-bert_token_classifier_ner_bc5cdr_disease_en_3_0.md +++ b/docs/_posts/bunyamin-polat/2022-07-25-bert_token_classifier_ner_bc5cdr_disease_en_3_0.md @@ -68,11 +68,10 @@ pipeline = Pipeline(stages=[ ner_converter ]) -model = pipeline.fit(spark.createDataFrame([[""]]).toDF("text")) data = spark.createDataFrame([["""Indomethacin resulted in histopathologic findings typical of interstitial cystitis, such as leaky bladder epithelium and mucosal mastocytosis. The true incidence of nonsteroidal anti-inflammatory drug-induced cystitis in humans must be clarified by prospective clinical trials. 
An open-label phase II study of low-dose thalidomide in androgen-independent prostate cancer."""]]).toDF("text") -result = model.transform(data) +result = pipeline.fit(data).transform(data) ``` ```scala val document_assembler = new DocumentAssembler() @@ -80,11 +79,11 @@ val document_assembler = new DocumentAssembler() .setOutputCol("document") val sentence_detector = SentenceDetectorDLModel.pretrained("sentence_detector_dl_healthcare", "en", "clinical/models") - .setInputCols("document") + .setInputCols(Array("document")) .setOutputCol("sentence") val tokenizer = new Tokenizer() - .setInputCols("sentence") + .setInputCols(Array("sentence")) .setOutputCol("token") val ner_model = MedicalBertForTokenClassifier.pretrained("bert_token_classifier_ner_bc5cdr_disease", "en", "clinical/models") @@ -97,7 +96,7 @@ val ner_converter = new NerConverter() .setInputCols(Array("sentence", "token", "ner")) .setOutputCol("ner_chunk") -val pipeline = new PipelineModel().setStages(Array(document_assembler, +val pipeline = new Pipeline().setStages(Array(document_assembler, sentence_detector, tokenizer, ner_model, @@ -105,7 +104,7 @@ val pipeline = new PipelineModel().setStages(Array(document_assembler, val data = Seq("""Indomethacin resulted in histopathologic findings typical of interstitial cystitis, such as leaky bladder epithelium and mucosal mastocytosis. The true incidence of nonsteroidal anti-inflammatory drug-induced cystitis in humans must be clarified by prospective clinical trials. 
An open-label phase II study of low-dose thalidomide in androgen-independent prostate cancer.""").toDS.toDF("text") -val result = model.fit(data).transform(data) +val result = pipeline.fit(data).transform(data) ``` diff --git a/docs/_posts/bunyamin-polat/2022-07-25-bert_token_classifier_ner_jnlpba_cellular_en_3_0.md b/docs/_posts/bunyamin-polat/2022-07-25-bert_token_classifier_ner_jnlpba_cellular_en_3_0.md index 2ee96462881ad..7d67aa30e0441 100644 --- a/docs/_posts/bunyamin-polat/2022-07-25-bert_token_classifier_ner_jnlpba_cellular_en_3_0.md +++ b/docs/_posts/bunyamin-polat/2022-07-25-bert_token_classifier_ner_jnlpba_cellular_en_3_0.md @@ -66,11 +66,9 @@ pipeline = Pipeline(stages=[ ner_converter ]) -model = pipeline.fit(spark.createDataFrame([[""]]).toDF("text")) - data = spark.createDataFrame([["""The results suggest that activation of protein kinase C, but not new protein synthesis, is required for IL-2 induction of IFN-gamma and GM-CSF cytoplasmic mRNA. It also was observed that suppression of cytokine gene expression by these agents was independent of the inhibition of proliferation. These data indicate that IL-2 and IL-12 may have distinct signaling pathways leading to the induction of IFN-gammaand GM-CSFgene expression, andthatthe NK3.3 cell line may serve as a novel model for dissecting the biochemical and molecular events involved in these pathways. A functional T-cell receptor signaling pathway is required for p95vav activity. Stimulation of the T-cell antigen receptor ( TCR ) induces activation of multiple tyrosine kinases, resulting in phosphorylation of numerous intracellular substrates. 
One substrate is p95vav, which is expressed exclusively in hematopoietic and trophoblast cells."""]]).toDF("text") -result = model.transform(data) +result = pipeline.fit(data).transform(data) ``` ```scala val document_assembler = new DocumentAssembler() @@ -78,11 +76,11 @@ val document_assembler = new DocumentAssembler() .setOutputCol("document") val sentence_detector = SentenceDetectorDLModel.pretrained("sentence_detector_dl_healthcare", "en", "clinical/models") - .setInputCols("document") + .setInputCols(Array("document")) .setOutputCol("sentence") val tokenizer = new Tokenizer() - .setInputCols("sentence") + .setInputCols(Array("sentence")) .setOutputCol("token") val ner_model = MedicalBertForTokenClassifier.pretrained("bert_token_classifier_ner_jnlpba_cellular", "en", "clinical/models") @@ -95,7 +93,7 @@ val ner_converter = new NerConverter() .setInputCols(Array("sentence", "token", "ner")) .setOutputCol("ner_chunk") -val pipeline = new PipelineModel().setStages(Array(document_assembler, +val pipeline = new Pipeline().setStages(Array(document_assembler, sentence_detector, tokenizer, ner_model, @@ -103,7 +101,7 @@ val pipeline = new PipelineModel().setStages(Array(document_assembler, val data = Seq("""The results suggest that activation of protein kinase C, but not new protein synthesis, is required for IL-2 induction of IFN-gamma and GM-CSF cytoplasmic mRNA. It also was observed that suppression of cytokine gene expression by these agents was independent of the inhibition of proliferation. These data indicate that IL-2 and IL-12 may have distinct signaling pathways leading to the induction of IFN-gammaand GM-CSFgene expression, andthatthe NK3.3 cell line may serve as a novel model for dissecting the biochemical and molecular events involved in these pathways. A functional T-cell receptor signaling pathway is required for p95vav activity. 
Stimulation of the T-cell antigen receptor ( TCR ) induces activation of multiple tyrosine kinases, resulting in phosphorylation of numerous intracellular substrates. One substrate is p95vav, which is expressed exclusively in hematopoietic and trophoblast cells.""").toDS.toDF("text") -val result = model.fit(data).transform(data) +val result = pipeline.fit(data).transform(data) ``` diff --git a/docs/_posts/bunyamin-polat/2022-07-25-bert_token_classifier_ner_linnaeus_species_en_3_0.md b/docs/_posts/bunyamin-polat/2022-07-25-bert_token_classifier_ner_linnaeus_species_en_3_0.md index 99efc61e9edfa..26b12c9c1f28a 100644 --- a/docs/_posts/bunyamin-polat/2022-07-25-bert_token_classifier_ner_linnaeus_species_en_3_0.md +++ b/docs/_posts/bunyamin-polat/2022-07-25-bert_token_classifier_ner_linnaeus_species_en_3_0.md @@ -66,11 +66,9 @@ pipeline = Pipeline(stages=[ ner_converter ]) -model = pipeline.fit(spark.createDataFrame([[""]]).toDF("text")) - data = spark.createDataFrame([["""First identified in chicken, vigilin homologues have now been found in human (6), Xenopus laevis (7), Drosophila melanogaster (8) and Schizosaccharomyces pombe."""]]).toDF("text") -result = model.transform(data) +result = pipeline.fit(data).transform(data) ``` ```scala val document_assembler = new DocumentAssembler() @@ -78,11 +76,11 @@ val document_assembler = new DocumentAssembler() .setOutputCol("document") val sentence_detector = SentenceDetectorDLModel.pretrained("sentence_detector_dl_healthcare", "en", "clinical/models") - .setInputCols("document") + .setInputCols(Array("document")) .setOutputCol("sentence") val tokenizer = new Tokenizer() - .setInputCols("sentence") + .setInputCols(Array("sentence")) .setOutputCol("token") val ner_model = MedicalBertForTokenClassifier.pretrained("bert_token_classifier_ner_linnaeus_species", "en", "clinical/models") @@ -95,7 +93,7 @@ val ner_converter = new NerConverter() .setInputCols(Array("sentence", "token", "ner")) .setOutputCol("ner_chunk") -val pipeline 
= new PipelineModel().setStages(Array(document_assembler, +val pipeline = new Pipeline().setStages(Array(document_assembler, sentence_detector, tokenizer, ner_model, @@ -103,7 +101,7 @@ val pipeline = new PipelineModel().setStages(Array(document_assembler, val data = Seq("""First identified in chicken, vigilin homologues have now been found in human (6), Xenopus laevis (7), Drosophila melanogaster (8) and Schizosaccharomyces pombe.""").toDS.toDF("text") -val result = model.fit(data).transform(data) +val result = pipeline.fit(data).transform(data) ``` diff --git a/docs/_posts/bunyamin-polat/2022-07-25-bert_token_classifier_ner_ncbi_disease_en_3_0.md b/docs/_posts/bunyamin-polat/2022-07-25-bert_token_classifier_ner_ncbi_disease_en_3_0.md index 9e671afe33ff2..b0d30437233f8 100644 --- a/docs/_posts/bunyamin-polat/2022-07-25-bert_token_classifier_ner_ncbi_disease_en_3_0.md +++ b/docs/_posts/bunyamin-polat/2022-07-25-bert_token_classifier_ner_ncbi_disease_en_3_0.md @@ -66,11 +66,9 @@ pipeline = Pipeline(stages=[ ner_converter ]) -model = pipeline.fit(spark.createDataFrame([[""]]).toDF("text")) - data = spark.createDataFrame([["""Kniest dysplasia is a moderately severe type II collagenopathy, characterized by short trunk and limbs, kyphoscoliosis, midface hypoplasia, severe myopia, and hearing loss."""]]).toDF("text") -result = model.transform(data) +result = pipeline.fit(data).transform(data) ``` ```scala val document_assembler = new DocumentAssembler() @@ -78,11 +76,11 @@ val document_assembler = new DocumentAssembler() .setOutputCol("document") val sentence_detector = SentenceDetectorDLModel.pretrained("sentence_detector_dl_healthcare", "en", "clinical/models") - .setInputCols("document") + .setInputCols(Array("document")) .setOutputCol("sentence") val tokenizer = new Tokenizer() - .setInputCols("sentence") + .setInputCols(Array("sentence")) .setOutputCol("token") val ner_model = MedicalBertForTokenClassifier.pretrained("bert_token_classifier_ner_ncbi_disease", "en", 
"clinical/models") @@ -95,7 +93,7 @@ val ner_converter = new NerConverter() .setInputCols(Array("sentence", "token", "ner")) .setOutputCol("ner_chunk") -val pipeline = new PipelineModel().setStages(Array(document_assembler, +val pipeline = new Pipeline().setStages(Array(document_assembler, sentence_detector, tokenizer, ner_model, @@ -103,7 +101,7 @@ val pipeline = new PipelineModel().setStages(Array(document_assembler, val data = Seq("""Kniest dysplasia is a moderately severe type II collagenopathy, characterized by short trunk and limbs, kyphoscoliosis, midface hypoplasia, severe myopia, and hearing loss.""").toDS.toDF("text") -val result = model.fit(data).transform(data) +val result = pipeline.fit(data).transform(data) ``` diff --git a/docs/_posts/bunyamin-polat/2022-07-25-bert_token_classifier_ner_species_en_3_0.md b/docs/_posts/bunyamin-polat/2022-07-25-bert_token_classifier_ner_species_en_3_0.md index cf1a28a8e7b53..537a5be3d360a 100644 --- a/docs/_posts/bunyamin-polat/2022-07-25-bert_token_classifier_ner_species_en_3_0.md +++ b/docs/_posts/bunyamin-polat/2022-07-25-bert_token_classifier_ner_species_en_3_0.md @@ -68,11 +68,9 @@ pipeline = Pipeline(stages=[ ner_converter ]) -model = pipeline.fit(spark.createDataFrame([[""]]).toDF("text")) - data = spark.createDataFrame([["""As determined by 16S rRNA gene sequence analysis, strain 6C (T) represents a distinct species belonging to the class Betaproteobacteria and is most closely related to Thiomonas intermedia DSM 18155 (T) and Thiomonas perometabolis DSM 18570 (T) ."""]]).toDF("text") -result = model.transform(data) +result = pipeline.fit(data).transform(data) ``` ```scala val document_assembler = new DocumentAssembler() @@ -80,11 +78,11 @@ val document_assembler = new DocumentAssembler() .setOutputCol("document") val sentence_detector = SentenceDetectorDLModel.pretrained("sentence_detector_dl_healthcare", "en", "clinical/models") - .setInputCols("document") + .setInputCols(Array("document")) 
.setOutputCol("sentence") val tokenizer = new Tokenizer() - .setInputCols("sentence") + .setInputCols(Array("sentence")) .setOutputCol("token") val ner_model = MedicalBertForTokenClassifier.pretrained("bert_token_classifier_ner_species", "en", "clinical/models") @@ -97,7 +95,7 @@ val ner_converter = new NerConverter() .setInputCols(Array("sentence", "token", "ner")) .setOutputCol("ner_chunk") -val pipeline = new PipelineModel().setStages(Array(document_assembler, +val pipeline = new Pipeline().setStages(Array(document_assembler, sentence_detector, tokenizer, ner_model, @@ -105,7 +103,7 @@ val pipeline = new PipelineModel().setStages(Array(document_assembler, val data = Seq("""As determined by 16S rRNA gene sequence analysis, strain 6C (T) represents a distinct species belonging to the class Betaproteobacteria and is most closely related to Thiomonas intermedia DSM 18155 (T) and Thiomonas perometabolis DSM 18570 (T) .""").toDS.toDF("text") -val result = model.fit(data).transform(data) +val result = pipeline.fit(data).transform(data) ``` diff --git a/docs/_posts/bunyamin-polat/2022-10-25-legclf_bert_swiss_judgements_de.md b/docs/_posts/bunyamin-polat/2022-10-25-legclf_bert_swiss_judgements_de.md index b831fab9f8d59..8ac17b33d45e4 100644 --- a/docs/_posts/bunyamin-polat/2022-10-25-legclf_bert_swiss_judgements_de.md +++ b/docs/_posts/bunyamin-polat/2022-10-25-legclf_bert_swiss_judgements_de.md @@ -56,11 +56,9 @@ clf_pipeline = Pipeline(stages=[ clf_model ]) -model = pipeline.fit(spark.createDataFrame([[""]]).toDF("text")) - data = spark.createDataFrame([["""Sachverhalt: A. Mit Strafbefehl vom 30. Juli 2015 sprach die Staatsanwaltschaft Lenzburg-Aarau gegen X._ eine bedingte Geldstrafe von 150 Tagessätzen zu Fr. 150.-- (Probezeit vier Jahre) sowie eine Busse von Fr. 4'500.-- aus wegen Führens eines Motorfahrzeugs in angetrunkenem Zustand sowie wegen mehrfacher Anstiftung zu falschem Zeugnis. Die Staatsanwaltschaft legte X._ unter anderem zur Last, am 5. 
Juli 2013 nach Aussage von Zeugen sein Auto mit einem Blutalkoholgehalt von mindestens 2,12 Promille bestiegen und von Lenzburg an seinen Wohnort in Z._ gelenkt zu haben. Das nach Einsprache von X._ mit der Sache befasste Bezirksgericht Lenzburg sprach ihn vom Vorwurf der mehrfachen Anstiftung zu falschem Zeugnis frei und verurteilte ihn wegen Führens eines Motorfahrzeugs in angetrunkenem Zustand zu einer bedingten Geldstrafe von 105 Tagessätzen zu Fr. 210.-- (Probezeit zwei Jahre) und zu einer Busse von Fr. 4'400.-- (Urteil vom 15. August 2016). B. X._ erhob Berufung. Das Obergericht des Kantons Aargau wies das Rechtsmittel ab (Urteil vom 3. Juli 2017). C. Mit Beschwerde in Strafsachen beantragt X._, das angefochtene Urteil sei aufzuheben und er von Schuld und Strafe freizusprechen."""]]).toDF("text") -result = model.transform(data) +result = clf_pipeline.fit(data).transform(data) ``` diff --git a/docs/_posts/bunyamin-polat/2022-10-25-legclf_bert_swiss_judgements_en.md b/docs/_posts/bunyamin-polat/2022-10-25-legclf_bert_swiss_judgements_en.md index 8a0861b08d074..50022cc52ff67 100644 --- a/docs/_posts/bunyamin-polat/2022-10-25-legclf_bert_swiss_judgements_en.md +++ b/docs/_posts/bunyamin-polat/2022-10-25-legclf_bert_swiss_judgements_en.md @@ -55,11 +55,10 @@ clf_pipeline = Pipeline(stages=[ tokenizer, clf_model ]) -model = pipeline.fit(spark.createDataFrame([[""]]).toDF("text")) data = spark.createDataFrame([["""Facts of fact: A. The Canton Police arrested X._ on 2. January 2007 due to suspicion of having committed an intrusive bull. In the trial of the trial 3. In January 2007, he agreed to have, together with a complicient, carried out a rubbish steel in a Jeans store in the fountain. After that, the investigative judge opened to him orally, he took him into investigative detention for the risk of collusion and continuation. X._ renounced a written and justified order, but desired a review of the investigation by the president of the Canton Court. by 4. 
In January 2007, the investigative judge submitted the documents to the president of the Canton Court with the request to withdraw the complaint and maintain the investigative detention. X._ requested to withdraw the investigative detention and immediately release him into freedom. He may be released under conditions or conditions. At its disposal of 5. In January 2007, the president of the Canton Court stated that the urgent offence was suspected in relation to the authorized invasion of the Jeans business and other invasions already occurred during a previous imprisonment. The risk of collusion is not accepted, but the recurrence forecast is extremely disadvantaged, therefore there is a risk of continuation. This is the request of the investigative judge - this is according to the instructions of 23. May 2006 (GG 2006 2; www.kgsz.ch) was not authorized to order investigative detention - to carry out and to confirm the investigative detention. At its disposal of 5. In January 2007, the president of the Canton Court stated that the urgent offence was suspected in relation to the authorized invasion of the Jeans business and other invasions already occurred during a previous imprisonment. The risk of collusion is not accepted, but the recurrence forecast is extremely disadvantaged, therefore there is a risk of continuation. This is the request of the investigative judge - this is according to the instructions of 23. May 2006 (GG 2006 2; www.kgsz.ch) was not authorized to order investigative detention - to carry out and to confirm the investigative detention. B. With complaint in criminal cases of 5. February 2007 requested X._: 1. It should be noted that the order GP 2007 3 of the Canton Court President of the Canton of Schwyz of 5. January 2007 is invalid and the complainant must be immediately released from prison. 2nd Eventually the order GP 2007 3 of the Canton Court President of the Canton of Schwyz of 5. 
January 2007 shall be repealed and the complainant shall be immediately released from investigative detention. and 3. Subeventual is the complainant due to the violation of the cantonal Swiss law by the instructions of the Canton Court of Schwyz of 23. May 2006 immediately released from the detention. Fourth All under cost and compensation consequences at the expense of the complainant.” Fourth All under cost and compensation consequences at the expense of the complainant.” C. The investigative judge requires in his judgment that “there must be established that the investigative detention was ordered by the investigative authority in accordance with the law and that the appeal submitted by the Court of Appeal with the approval of the request for responsibility and the confirmation of the investigative detention (Decree of the President of the Canton Court of 5 January 2007) has been legally rejected.” Insofar as X._ requires his immediate release, the complaint must be rejected. The President of the Canton Court asks to reject the complaint insofar as it is necessary. X._ requires unpaid legal assistance and defence and completes in its response to the complaint."""]]).toDF("text") -result = model.transform(data) +result = clf_pipeline.fit(data).transform(data) ``` diff --git a/docs/_posts/bunyamin-polat/2022-10-25-legclf_bert_swiss_judgements_fr.md b/docs/_posts/bunyamin-polat/2022-10-25-legclf_bert_swiss_judgements_fr.md index aa751112b3b95..bcac3d2796f4b 100644 --- a/docs/_posts/bunyamin-polat/2022-10-25-legclf_bert_swiss_judgements_fr.md +++ b/docs/_posts/bunyamin-polat/2022-10-25-legclf_bert_swiss_judgements_fr.md @@ -56,11 +56,9 @@ clf_pipeline = Pipeline(stages=[ clf_model ]) -model = pipeline.fit(spark.createDataFrame([[""]]).toDF("text")) - data = spark.createDataFrame([["""Résumé : A. X. 1948) et Z. Ils se sont mariés à la xxxx 1992. Le mariage est resté sans enfants. T._ est, cependant, le père des enfants divorcés S._ et T._ (geb. 2004 et 2006). 
Après la suppression du budget commun, la vie séparée a dû être réglée. Disponible du 17. En décembre 2010, le président de la Cour de justice, Dorneck-Thierstein, a autorisé les époux à se séparer. Dans la mesure où cela est encore important, le juge a obligé le mari, pour l'année 2010 encore Fr. 3'000.-- à payer l'entretien de sa femme (Ziff. 3 ) De même, Z._ a été condamné, X._ à partir de janvier 2011 pour la durée ultérieure de la séparation une contribution de subsistance mensuelle de Fr. 7'085.-- de vous dépenser et de vous payer, en outre, la moitié du bonus net versé à chacun immédiatement après sa destination (Ziff. 4 ) En outre, le président de la Cour a ordonné la séparation des marchandises (Ziff. 5), dispose de la compétition du parti ou Les frais d’avocat (Ziff. 9) et impose les frais judiciaires à la moitié des deux parties (Ziff. 10 ) B. À l’encontre de cette décision, X._ a fait appel à la Cour suprême du canton de Solothurn. Elle a demandé de supprimer les paragraphes 3, 4, 5, 9 et 10 de la décision de première instance, et a présenté les demandes juridiques suivantes: Le mari est tenu de l'engager pour la période à partir de 21. Septembre 2009 à la fin du mois de décembre 2010 une contribution supplémentaire de Fr. 34'400.-- pour rembourser; pour la vie séparée à partir de janvier 2011, elle est dotée d'une contribution de subsistance de Fr. 10'000.-- pour recevoir par mois. La distribution des marchandises est de 21. Déposer en septembre 2010. En conclusion, le conjoint doit payer une contribution de parti raisonnable d'au moins Fr. 6'000.-- et pour payer tous les frais de justice. La Cour suprême du canton de Solothurn a déposé le recours à l'arrêt du 18. en mai 2011. C. À ce titre, X._ (ci-après dénommée « plaignante ») procède à la Cour fédérale. Dans sa plainte du 20. En juin 2011, elle présente la demande, la décision de la Cour suprême du canton Solothurn du 18. 
annuler en mai 2011 et répéter les demandes légales qu’elle a présentées devant la Cour suprême (cf. Bst. B ) En outre, il demande que la séparation des marchandises soit plus égalitaire par 7. Décembre 2010 à ordonner. Aucune consultation n’a été faite, mais les actes préjudiciels ont été reçus."""]]).toDF("text") -result = model.transform(data) +result = clf_pipeline.fit(data).transform(data) ``` diff --git a/docs/_posts/bunyamin-polat/2022-10-25-legclf_bert_swiss_judgements_it.md b/docs/_posts/bunyamin-polat/2022-10-25-legclf_bert_swiss_judgements_it.md index ef2fb511348b4..e560e1643fefd 100644 --- a/docs/_posts/bunyamin-polat/2022-10-25-legclf_bert_swiss_judgements_it.md +++ b/docs/_posts/bunyamin-polat/2022-10-25-legclf_bert_swiss_judgements_it.md @@ -56,11 +56,9 @@ clf_pipeline = Pipeline(stages=[ clf_model ]) -model = pipeline.fit(spark.createDataFrame([[""]]).toDF("text")) - data = spark.createDataFrame([["""Attualità: A. Disponibile dal 21. Nell'ottobre del 2004, l'Allianza di assicurazioni svizzere (in prosieguo: Allianz) ha messo in atto il R._ (geb. 1965) per le conseguenze di un incidente del 23. Nel mese di marzo del 2001 le prestazioni sono ritornate al 31. Nel mese di marzo del 2004 si è presentato la decisione del 6. Nel luglio del 2005 è stato arrestato. A. A disposizione del 21. Nell'ottobre del 2004, l'Allianza di assicurazioni svizzere (in prosieguo: Allianz) ha messo in atto il R._ (geb. 1965) per le conseguenze di un incidente del 23. Nel mese di marzo del 2001 le prestazioni sono ritornate al 31. Nel mese di marzo del 2004 si è presentato la decisione del 6. Nel luglio del 2005 è stato arrestato. di B. Il 7. Nel novembre 2005 R._ ha presentato una denuncia contro la decisione di interrogatorio al Tribunale amministrativo del Cantone di Schwyz. Con la lettera del 9. 
Nel novembre del 2005, il vicepresidente del Tribunale amministrativo ha informato gli assicurati che la denuncia è stata presentata in ritardo secondo la legge cantonale massiccia, il motivo per cui non è possibile procedere, e gli ha dato l'opportunità di pronunciarsi. Con l’ingresso del 15. Nel novembre 2005 R._ ha presentato una richiesta di ripristino del termine di reclamo. Con la decisione del 6. Nel dicembre 2005 il Tribunale amministrativo non ha presentato la denuncia. di B. Il 7. Nel novembre 2005 R._ ha presentato una denuncia contro la decisione di interrogatorio al Tribunale amministrativo del Cantone di Schwyz. Con la lettera del 9. Nel novembre del 2005, il vicepresidente del Tribunale amministrativo ha informato gli assicurati che la denuncia è stata presentata in ritardo secondo la legge cantonale massiccia, il motivo per cui non è possibile procedere, e gli ha dato l'opportunità di pronunciarsi. Con l’ingresso del 15. Nel novembre 2005 R._ ha presentato una richiesta di ripristino del termine di reclamo. Con la decisione del 6. Nel dicembre 2005 il Tribunale amministrativo non ha presentato la denuncia. C. Con un ricorso al Tribunale amministrativo, R._ chiede alla causa principale che, annullando la decisione pregiudiziale, il tribunale cantonale sia obbligato a presentare il ricorso del 7. di entrare nel novembre 2005. Dal punto di vista procedurale, il giudice può presentare la richiesta giuridica di aderire agli atti pregiudiziali e di ordinare un secondo cambio di scrittura. Il Tribunale amministrativo del Cantone di Schwyz e l'Alleanza concludono il ricorso alla Corte amministrativa. 
L’Ufficio federale per la salute rinuncia ad una consultazione."""]]).toDF("text") -result = model.transform(data) +result = clf_pipeline.fit(data).transform(data) ``` diff --git a/docs/_posts/bunyamin-polat/2022-10-25-legner_indian_court_judgement_en.md b/docs/_posts/bunyamin-polat/2022-10-25-legner_indian_court_judgement_en.md index 5594f8f888ba0..35ac2ccf6e652 100644 --- a/docs/_posts/bunyamin-polat/2022-10-25-legner_indian_court_judgement_en.md +++ b/docs/_posts/bunyamin-polat/2022-10-25-legner_indian_court_judgement_en.md @@ -72,8 +72,6 @@ pipeline = Pipeline(stages=[ ner_converter ]) -model = pipeline.fit(spark.createDataFrame([[""]]).toDF("text")) - data = spark.createDataFrame([["""Let fresh bailable warrant of Rs.20,000/- (Rupees Twenty Thousand) be issued through Superintendent of Police, Dhar to the respondents No.1 Sikandar and No.2 Aziz for a date to be fixed by the Registry to secure the presence of the respondents No.1 and 2, made returnable within six weeks. P.K.Jaiswal) Judge (Jarat Kumar Jain) Judge ns. 
@@ -81,7 +79,7 @@ W.P.No.1361/2013 14/12/2015 Parties through their Counsel."""]]) -result = model.transform(data) +result = pipeline.fit(data).transform(data) ``` ```scala val document_assembler = new DocumentAssembler() @@ -90,11 +88,11 @@ val document_assembler = new DocumentAssembler() .setCleanupMode("shrink") val sentence_detector = SentenceDetectorDLModel.pretrained("sentence_detector_dl", "en") - .setInputCols("document") + .setInputCols(Array("document")) .setOutputCol("sentence") val tokenizer = new Tokenizer() - .setInputCols("sentence") + .setInputCols(Array("sentence")) .setOutputCol("token") val embeddings = BertEmbeddings.pretrained("bert_base_cased", "en") @@ -111,7 +109,7 @@ val ner_converter = new NerConverter() .setInputCols(Array("sentence", "token", "ner")) .setOutputCol("ner_chunk") -val pipeline = new PipelineModel().setStages(Array( +val pipeline = new Pipeline().setStages(Array( document_assembler, sentence_detector, tokenizer, diff --git a/docs/_posts/bunyamin-polat/2022-10-25-legner_indian_court_preamble_en.md b/docs/_posts/bunyamin-polat/2022-10-25-legner_indian_court_preamble_en.md index 56c0543a27468..ed02f8e654c2d 100644 --- a/docs/_posts/bunyamin-polat/2022-10-25-legner_indian_court_preamble_en.md +++ b/docs/_posts/bunyamin-polat/2022-10-25-legner_indian_court_preamble_en.md @@ -72,7 +72,6 @@ pipeline = Pipeline(stages=[ ner_converter ]) -model = pipeline.fit(spark.createDataFrame([[""]]).toDF("text")) data = spark.createDataFrame([["""In The High Court Of Judicature At Madras @@ -107,7 +106,7 @@ For Petitioner : Mr. K.Sudarsanam for M/s. Surithi Associates For Respondents: Mr. 
Mohd.Fiary Hussain for R1"""]]) -result = model.transform(data) +result = pipeline.fit(data).transform(data) ``` ```scala val document_assembler = new DocumentAssembler() @@ -116,11 +115,11 @@ val document_assembler = new DocumentAssembler() .setCleanupMode("shrink") val sentence_detector = SentenceDetectorDLModel.pretrained("sentence_detector_dl", "en") - .setInputCols("document") + .setInputCols(Array("document")) .setOutputCol("sentence") val tokenizer = new Tokenizer() - .setInputCols("sentence") + .setInputCols(Array("sentence")) .setOutputCol("token") val embeddings = BertEmbeddings.pretrained("bert_base_cased", "en") @@ -137,7 +136,7 @@ val ner_converter = new NerConverter() .setInputCols(Array("sentence", "token", "ner")) .setOutputCol("ner_chunk") -val pipeline = new PipelineModel().setStages(Array( +val pipeline = new Pipeline().setStages(Array( document_assembler, sentence_detector, tokenizer, diff --git a/docs/_posts/bunyamin-polat/2022-10-27-legclf_bert_swiss_judgements_de.md b/docs/_posts/bunyamin-polat/2022-10-27-legclf_bert_swiss_judgements_de.md index 21d9db2e7bf80..81d1d8d8618f7 100644 --- a/docs/_posts/bunyamin-polat/2022-10-27-legclf_bert_swiss_judgements_de.md +++ b/docs/_posts/bunyamin-polat/2022-10-27-legclf_bert_swiss_judgements_de.md @@ -56,11 +56,9 @@ clf_pipeline = Pipeline(stages=[ clf_model ]) -model = clf_pipeline.fit(spark.createDataFrame([[""]]).toDF("text")) - data = spark.createDataFrame([["""Sachverhalt: A. Mit Strafbefehl vom 30. Juli 2015 sprach die Staatsanwaltschaft Lenzburg-Aarau gegen X._ eine bedingte Geldstrafe von 150 Tagessätzen zu Fr. 150.-- (Probezeit vier Jahre) sowie eine Busse von Fr. 4'500.-- aus wegen Führens eines Motorfahrzeugs in angetrunkenem Zustand sowie wegen mehrfacher Anstiftung zu falschem Zeugnis. Die Staatsanwaltschaft legte X._ unter anderem zur Last, am 5. 
Juli 2013 nach Aussage von Zeugen sein Auto mit einem Blutalkoholgehalt von mindestens 2,12 Promille bestiegen und von Lenzburg an seinen Wohnort in Z._ gelenkt zu haben. Das nach Einsprache von X._ mit der Sache befasste Bezirksgericht Lenzburg sprach ihn vom Vorwurf der mehrfachen Anstiftung zu falschem Zeugnis frei und verurteilte ihn wegen Führens eines Motorfahrzeugs in angetrunkenem Zustand zu einer bedingten Geldstrafe von 105 Tagessätzen zu Fr. 210.-- (Probezeit zwei Jahre) und zu einer Busse von Fr. 4'400.-- (Urteil vom 15. August 2016). B. X._ erhob Berufung. Das Obergericht des Kantons Aargau wies das Rechtsmittel ab (Urteil vom 3. Juli 2017). C. Mit Beschwerde in Strafsachen beantragt X._, das angefochtene Urteil sei aufzuheben und er von Schuld und Strafe freizusprechen."""]]).toDF("text") -result = model.transform(data) +result = clf_pipeline.fit(data).transform(data) ``` diff --git a/docs/_posts/bunyamin-polat/2022-10-27-legclf_bert_swiss_judgements_en.md b/docs/_posts/bunyamin-polat/2022-10-27-legclf_bert_swiss_judgements_en.md index 29d2a02544f74..7f290f1b76c79 100644 --- a/docs/_posts/bunyamin-polat/2022-10-27-legclf_bert_swiss_judgements_en.md +++ b/docs/_posts/bunyamin-polat/2022-10-27-legclf_bert_swiss_judgements_en.md @@ -55,11 +55,10 @@ clf_pipeline = Pipeline(stages=[ tokenizer, clf_model ]) -model = clf_pipeline.fit(spark.createDataFrame([[""]]).toDF("text")) data = spark.createDataFrame([["""Facts of fact: A. The Canton Police arrested X._ on 2. January 2007 due to suspicion of having committed an intrusive bull. In the trial of the trial 3. In January 2007, he agreed to have, together with a complicient, carried out a rubbish steel in a Jeans store in the fountain. After that, the investigative judge opened to him orally, he took him into investigative detention for the risk of collusion and continuation. X._ renounced a written and justified order, but desired a review of the investigation by the president of the Canton Court. by 4. 
In January 2007, the investigative judge submitted the documents to the president of the Canton Court with the request to withdraw the complaint and maintain the investigative detention. X._ requested to withdraw the investigative detention and immediately release him into freedom. He may be released under conditions or conditions. At its disposal of 5. In January 2007, the president of the Canton Court stated that the urgent offence was suspected in relation to the authorized invasion of the Jeans business and other invasions already occurred during a previous imprisonment. The risk of collusion is not accepted, but the recurrence forecast is extremely disadvantaged, therefore there is a risk of continuation. This is the request of the investigative judge - this is according to the instructions of 23. May 2006 (GG 2006 2; www.kgsz.ch) was not authorized to order investigative detention - to carry out and to confirm the investigative detention. At its disposal of 5. In January 2007, the president of the Canton Court stated that the urgent offence was suspected in relation to the authorized invasion of the Jeans business and other invasions already occurred during a previous imprisonment. The risk of collusion is not accepted, but the recurrence forecast is extremely disadvantaged, therefore there is a risk of continuation. This is the request of the investigative judge - this is according to the instructions of 23. May 2006 (GG 2006 2; www.kgsz.ch) was not authorized to order investigative detention - to carry out and to confirm the investigative detention. B. With complaint in criminal cases of 5. February 2007 requested X._: 1. It should be noted that the order GP 2007 3 of the Canton Court President of the Canton of Schwyz of 5. January 2007 is invalid and the complainant must be immediately released from prison. 2nd Eventually the order GP 2007 3 of the Canton Court President of the Canton of Schwyz of 5. 
January 2007 shall be repealed and the complainant shall be immediately released from investigative detention. and 3. Subeventual is the complainant due to the violation of the cantonal Swiss law by the instructions of the Canton Court of Schwyz of 23. May 2006 immediately released from the detention. Fourth All under cost and compensation consequences at the expense of the complainant.” Fourth All under cost and compensation consequences at the expense of the complainant.” C. The investigative judge requires in his judgment that “there must be established that the investigative detention was ordered by the investigative authority in accordance with the law and that the appeal submitted by the Court of Appeal with the approval of the request for responsibility and the confirmation of the investigative detention (Decree of the President of the Canton Court of 5 January 2007) has been legally rejected.” Insofar as X._ requires his immediate release, the complaint must be rejected. The President of the Canton Court asks to reject the complaint insofar as it is necessary. X._ requires unpaid legal assistance and defence and completes in its response to the complaint."""]]).toDF("text") -result = model.transform(data) +result = clf_pipeline.fit(data).transform(data) ``` diff --git a/docs/_posts/bunyamin-polat/2022-10-27-legclf_bert_swiss_judgements_fr.md b/docs/_posts/bunyamin-polat/2022-10-27-legclf_bert_swiss_judgements_fr.md index 0377ae614120c..a18d9fdc02337 100644 --- a/docs/_posts/bunyamin-polat/2022-10-27-legclf_bert_swiss_judgements_fr.md +++ b/docs/_posts/bunyamin-polat/2022-10-27-legclf_bert_swiss_judgements_fr.md @@ -56,11 +56,9 @@ clf_pipeline = Pipeline(stages=[ clf_model ]) -model = clf_pipeline.fit(spark.createDataFrame([[""]]).toDF("text")) - data = spark.createDataFrame([["""Résumé : A. X. 1948) et Z. Ils se sont mariés à la xxxx 1992. Le mariage est resté sans enfants. T._ est, cependant, le père des enfants divorcés S._ et T._ (geb. 2004 et 2006). 
Après la suppression du budget commun, la vie séparée a dû être réglée. Disponible du 17. En décembre 2010, le président de la Cour de justice, Dorneck-Thierstein, a autorisé les époux à se séparer. Dans la mesure où cela est encore important, le juge a obligé le mari, pour l'année 2010 encore Fr. 3'000.-- à payer l'entretien de sa femme (Ziff. 3 ) De même, Z._ a été condamné, X._ à partir de janvier 2011 pour la durée ultérieure de la séparation une contribution de subsistance mensuelle de Fr. 7'085.-- de vous dépenser et de vous payer, en outre, la moitié du bonus net versé à chacun immédiatement après sa destination (Ziff. 4 ) En outre, le président de la Cour a ordonné la séparation des marchandises (Ziff. 5), dispose de la compétition du parti ou Les frais d’avocat (Ziff. 9) et impose les frais judiciaires à la moitié des deux parties (Ziff. 10 ) B. À l’encontre de cette décision, X._ a fait appel à la Cour suprême du canton de Solothurn. Elle a demandé de supprimer les paragraphes 3, 4, 5, 9 et 10 de la décision de première instance, et a présenté les demandes juridiques suivantes: Le mari est tenu de l'engager pour la période à partir de 21. Septembre 2009 à la fin du mois de décembre 2010 une contribution supplémentaire de Fr. 34'400.-- pour rembourser; pour la vie séparée à partir de janvier 2011, elle est dotée d'une contribution de subsistance de Fr. 10'000.-- pour recevoir par mois. La distribution des marchandises est de 21. Déposer en septembre 2010. En conclusion, le conjoint doit payer une contribution de parti raisonnable d'au moins Fr. 6'000.-- et pour payer tous les frais de justice. La Cour suprême du canton de Solothurn a déposé le recours à l'arrêt du 18. en mai 2011. C. À ce titre, X._ (ci-après dénommée « plaignante ») procède à la Cour fédérale. Dans sa plainte du 20. En juin 2011, elle présente la demande, la décision de la Cour suprême du canton Solothurn du 18. 
annuler en mai 2011 et répéter les demandes légales qu’elle a présentées devant la Cour suprême (cf. Bst. B ) En outre, il demande que la séparation des marchandises soit plus égalitaire par 7. Décembre 2010 à ordonner. Aucune consultation n’a été faite, mais les actes préjudiciels ont été reçus."""]]).toDF("text") -result = model.transform(data) +result = clf_pipeline.fit(data).transform(data) ``` diff --git a/docs/_posts/bunyamin-polat/2022-10-27-legclf_bert_swiss_judgements_it.md b/docs/_posts/bunyamin-polat/2022-10-27-legclf_bert_swiss_judgements_it.md index 3ddeab5585fd8..a4fb25d592e37 100644 --- a/docs/_posts/bunyamin-polat/2022-10-27-legclf_bert_swiss_judgements_it.md +++ b/docs/_posts/bunyamin-polat/2022-10-27-legclf_bert_swiss_judgements_it.md @@ -56,11 +56,9 @@ clf_pipeline = Pipeline(stages=[ clf_model ]) -model = clf_pipeline.fit(spark.createDataFrame([[""]]).toDF("text")) - data = spark.createDataFrame([["""Attualità: A. Disponibile dal 21. Nell'ottobre del 2004, l'Allianza di assicurazioni svizzere (in prosieguo: Allianz) ha messo in atto il R._ (geb. 1965) per le conseguenze di un incidente del 23. Nel mese di marzo del 2001 le prestazioni sono ritornate al 31. Nel mese di marzo del 2004 si è presentato la decisione del 6. Nel luglio del 2005 è stato arrestato. A. A disposizione del 21. Nell'ottobre del 2004, l'Allianza di assicurazioni svizzere (in prosieguo: Allianz) ha messo in atto il R._ (geb. 1965) per le conseguenze di un incidente del 23. Nel mese di marzo del 2001 le prestazioni sono ritornate al 31. Nel mese di marzo del 2004 si è presentato la decisione del 6. Nel luglio del 2005 è stato arrestato. di B. Il 7. Nel novembre 2005 R._ ha presentato una denuncia contro la decisione di interrogatorio al Tribunale amministrativo del Cantone di Schwyz. Con la lettera del 9. 
Nel novembre del 2005, il vicepresidente del Tribunale amministrativo ha informato gli assicurati che la denuncia è stata presentata in ritardo secondo la legge cantonale massiccia, il motivo per cui non è possibile procedere, e gli ha dato l'opportunità di pronunciarsi. Con l’ingresso del 15. Nel novembre 2005 R._ ha presentato una richiesta di ripristino del termine di reclamo. Con la decisione del 6. Nel dicembre 2005 il Tribunale amministrativo non ha presentato la denuncia. di B. Il 7. Nel novembre 2005 R._ ha presentato una denuncia contro la decisione di interrogatorio al Tribunale amministrativo del Cantone di Schwyz. Con la lettera del 9. Nel novembre del 2005, il vicepresidente del Tribunale amministrativo ha informato gli assicurati che la denuncia è stata presentata in ritardo secondo la legge cantonale massiccia, il motivo per cui non è possibile procedere, e gli ha dato l'opportunità di pronunciarsi. Con l’ingresso del 15. Nel novembre 2005 R._ ha presentato una richiesta di ripristino del termine di reclamo. Con la decisione del 6. Nel dicembre 2005 il Tribunale amministrativo non ha presentato la denuncia. C. Con un ricorso al Tribunale amministrativo, R._ chiede alla causa principale che, annullando la decisione pregiudiziale, il tribunale cantonale sia obbligato a presentare il ricorso del 7. di entrare nel novembre 2005. Dal punto di vista procedurale, il giudice può presentare la richiesta giuridica di aderire agli atti pregiudiziali e di ordinare un secondo cambio di scrittura. Il Tribunale amministrativo del Cantone di Schwyz e l'Alleanza concludono il ricorso alla Corte amministrativa. 
L’Ufficio federale per la salute rinuncia ad una consultazione."""]]).toDF("text") -result = model.transform(data) +result = clf_pipeline.fit(data).transform(data) ``` diff --git a/docs/_posts/dcecchini/2021-01-29-ner_drugs_large_en.md b/docs/_posts/dcecchini/2021-01-29-ner_drugs_large_en.md index 0a16cfcc631cd..72fe3c223dc9d 100644 --- a/docs/_posts/dcecchini/2021-01-29-ner_drugs_large_en.md +++ b/docs/_posts/dcecchini/2021-01-29-ner_drugs_large_en.md @@ -1,150 +1,151 @@ ---- -layout: model -title: Detect Drug Chemicals -author: John Snow Labs -name: ner_drugs_large_en -date: 2021-01-29 -task: Named Entity Recognition -language: en -edition: Healthcare NLP 2.7.1 -spark_version: 2.4 -tags: [ner, en, licensed, clinical] -supported: true -article_header: - type: cover -use_language_switcher: "Python-Scala-Java" ---- -{:.h2_title} -## Description - -Pretrained named entity recognition deep learning model for Drugs. The model combines dosage, strength, form, and route into a single entity: Drug. The SparkNLP deep learning model (NerDL) is inspired by a former state of the art model for NER: Chiu & Nicols, Named Entity Recognition with Bidirectional LSTM-CNN. - -{:.h2_title} -## Predicted Entities -`DRUG` - - -{:.btn-box} -[Live Demo](https://demo.johnsnowlabs.com/healthcare/NER_POSOLOGY/){:.button.button-orange} -[Open in Colab](https://githubtocolab.com/JohnSnowLabs/spark-nlp-workshop/blob/master/tutorials/Certification_Trainings/Healthcare/1.Clinical_Named_Entity_Recognition_Model.ipynb){:.button.button-orange.button-orange-trans.co.button-icon} -[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/clinical/models/ner_drugs_large_en_2.6.0_2.4_1603915964112.zip){:.button.button-orange.button-orange-trans.arr.button-icon} - - -## How to use - -Use as part of an nlp pipeline with the following stages: DocumentAssembler, SentenceDetector, Tokenizer, WordEmbeddingsModel, NerDLModel. 
Add the NerConverter to the end of the pipeline to convert entity tokens into full entity chunks. - -
- -{% include programmingLanguageSelectScalaPython.html %} - - -```python -documentAssembler = DocumentAssembler()\ - .setInputCol("text")\ - .setOutputCol("document") - -sentenceDetector = SentenceDetector()\ - .setInputCols(["document"])\ - .setOutputCol("sentence") - -tokenizer = Tokenizer()\ - .setInputCols(["sentence"])\ - .setOutputCol("token") - -# Clinical word embeddings trained on PubMED dataset -word_embeddings = WordEmbeddingsModel.pretrained("embeddings_clinical", "en", "clinical/models")\ - .setInputCols(["sentence", "token"])\ - .setOutputCol("embeddings") - -clinical_ner = NerDLModel.pretrained("ner_drugs_large", "en", "clinical/models") \ - .setInputCols(["sentence", "token", "embeddings"]) \ - .setOutputCol("ner") - -ner_converter = NerConverter() \ - .setInputCols(["sentence", "token", "ner"]) \ - .setOutputCol("ner_chunk") - -nlpPipeline = Pipeline(stages=[document_assembler, sentence_detector, tokenizer, word_embeddings, clinical_ner, ner_converter]) - -model = nlpPipeline.fit(spark.createDataFrame(["The patient is a 40-year-old white male who presents with a chief complaint of "chest pain". The patient is diabetic and has a prior history of coronary artery disease. The patient presents today stating that his chest pain started yesterday evening and has been somewhat intermittent. He has been advised Aspirin 81 milligrams QDay. Humulin N. insulin 50 units in a.m. HCTZ 50 mg QDay. 
Nitroglycerin 1/150 sublingually PRN chest pain."]).toDF("text")) - -results = model.transform(data) - -``` - -```scala -val documentAssembler = DocumentAssembler() - .setInputCol("text") - .setOutputCol("document") - -val sentenceDetector = SentenceDetector() - .setInputCols(["document"]) - .setOutputCol("sentence") - -val tokenizer = Tokenizer() - .setInputCols(["sentence"]) - .setOutputCol("token") - -# Clinical word embeddings trained on PubMED dataset -val word_embeddings = WordEmbeddingsModel.pretrained("embeddings_clinical", "en", "clinical/models") - .setInputCols(["sentence", "token"]) - .setOutputCol("embeddings") - -val ner = NerDLModel.pretrained("ner_drugs_large", "en", "clinical/models") - .setInputCols("sentence", "token", "embeddings") - .setOutputCol("ner") - -val ner_converter = NerConverter() - .setInputCols(["sentence", "token", "ner"]) - .setOutputCol("ner_chunk") - -val pipeline = new Pipeline().setStages(Array(document_assembler, sentence_detector, tokenizer, word_embeddings, ner, ner_converter)) - -val data = Seq("The patient is a 40-year-old white male who presents with a chief complaint of "chest pain". The patient is diabetic and has a prior history of coronary artery disease. The patient presents today stating that his chest pain started yesterday evening and has been somewhat intermittent. He has been advised Aspirin 81 milligrams QDay. Humulin N. insulin 50 units in a.m. HCTZ 50 mg QDay. Nitroglycerin 1/150 sublingually PRN chest pain.").toDF("text") -val result = pipeline.fit(data).transform(data) -``` - -
- -{:.h2_title} -## Results -The output is a dataframe with a sentence per row and a ``"ner"`` column containing all of the entity labels in the sentence, entity character indices, and other metadata. - -```bash - -+--------------------------------+---------+ -|chunk |ner_label| -+--------------------------------+---------+ -|Aspirin 81 milligrams |DRUG | -|Humulin N |DRUG | -|insulin 50 units |DRUG | -|HCTZ 50 mg |DRUG | -|Nitroglycerin 1/150 sublingually|DRUG | -+--------------------------------+---------+ -``` - -{:.model-param} -## Model Information - -{:.table-model} -|---|---| -|Model Name:|ner_drugs_large_en_2.6.0_2.4| -|Type:|ner| -|Compatibility:|Spark NLP 2.7.1+| -|Edition:|Official| -|License:|Licensed| -|Input Labels:|[sentence,token, embeddings]| -|Output Labels:|[ner]| -|Language:|[en]| -|Case sensitive:|false| - -{:.h2_title} -## Data Source -Trained on i2b2_med7 + FDA with 'embeddings_clinical'. -https://www.i2b2.org/NLP/Medication - -{:.h2_title} -## Benchmarking - -Since this NER model is crafted from `ner_posology` but reduced to single entity, no benchmark is applicable. +--- +layout: model +title: Detect Drug Chemicals +author: John Snow Labs +name: ner_drugs_large_en +date: 2021-01-29 +task: Named Entity Recognition +language: en +edition: Spark NLP for Healthcare 2.7.1 +spark_version: 2.4 +tags: [ner, en, licensed, clinical] +supported: true +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- +{:.h2_title} +## Description + +Pretrained named entity recognition deep learning model for Drugs. The model combines dosage, strength, form, and route into a single entity: Drug. The SparkNLP deep learning model (NerDL) is inspired by a former state of the art model for NER: Chiu & Nichols, Named Entity Recognition with Bidirectional LSTM-CNN. 
+ +{:.h2_title} +## Predicted Entities +`DRUG` + + +{:.btn-box} +[Live Demo](https://demo.johnsnowlabs.com/healthcare/NER_POSOLOGY/){:.button.button-orange} +[Open in Colab](https://githubtocolab.com/JohnSnowLabs/spark-nlp-workshop/blob/master/tutorials/Certification_Trainings/Healthcare/1.Clinical_Named_Entity_Recognition_Model.ipynb){:.button.button-orange.button-orange-trans.co.button-icon} +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/clinical/models/ner_drugs_large_en_2.6.0_2.4_1603915964112.zip){:.button.button-orange.button-orange-trans.arr.button-icon} + + +## How to use + +Use as part of an nlp pipeline with the following stages: DocumentAssembler, SentenceDetector, Tokenizer, WordEmbeddingsModel, NerDLModel. Add the NerConverter to the end of the pipeline to convert entity tokens into full entity chunks. + +
+ +{% include programmingLanguageSelectScalaPython.html %} + + +```python +document_assembler = DocumentAssembler()\ + .setInputCol("text")\ + .setOutputCol("document") + +sentence_detector = SentenceDetector()\ + .setInputCols(["document"])\ + .setOutputCol("sentence") + +tokenizer = Tokenizer()\ + .setInputCols(["sentence"])\ + .setOutputCol("token") + +# Clinical word embeddings trained on PubMED dataset +word_embeddings = WordEmbeddingsModel.pretrained("embeddings_clinical", "en", "clinical/models")\ + .setInputCols(["sentence", "token"])\ + .setOutputCol("embeddings") + +clinical_ner = NerDLModel.pretrained("ner_drugs_large", "en", "clinical/models") \ + .setInputCols(["sentence", "token", "embeddings"]) \ + .setOutputCol("ner") + +ner_converter = NerConverter() \ + .setInputCols(["sentence", "token", "ner"]) \ + .setOutputCol("ner_chunk") + +nlpPipeline = Pipeline(stages=[document_assembler, sentence_detector, tokenizer, word_embeddings, clinical_ner, ner_converter]) + +data = spark.createDataFrame([["""The patient is a 40-year-old white male who presents with a chief complaint of 'chest pain'. The patient is diabetic and has a prior history of coronary artery disease. The patient presents today stating that his chest pain started yesterday evening and has been somewhat intermittent. He has been advised Aspirin 81 milligrams QDay. Humulin N. insulin 50 units in a.m. HCTZ 50 mg QDay. 
Nitroglycerin 1/150 sublingually PRN chest pain."""]]).toDF("text") + +result = nlpPipeline.fit(data).transform(data) + +``` + +```scala +val document_assembler = new DocumentAssembler() + .setInputCol("text") + .setOutputCol("document") + +val sentence_detector = new SentenceDetector() + .setInputCols("document") + .setOutputCol("sentence") + +val tokenizer = new Tokenizer() + .setInputCols("sentence") + .setOutputCol("token") + +// Clinical word embeddings trained on PubMED dataset +val word_embeddings = WordEmbeddingsModel.pretrained("embeddings_clinical", "en", "clinical/models") + .setInputCols(Array("sentence", "token")) + .setOutputCol("embeddings") + +val ner = NerDLModel.pretrained("ner_drugs_large", "en", "clinical/models") + .setInputCols(Array("sentence", "token", "embeddings")) + .setOutputCol("ner") + +val ner_converter = new NerConverter() + .setInputCols(Array("sentence", "token", "ner")) + .setOutputCol("ner_chunk") + +val pipeline = new Pipeline().setStages(Array(document_assembler, sentence_detector, tokenizer, word_embeddings, ner, ner_converter)) + +val data = Seq("""The patient is a 40-year-old white male who presents with a chief complaint of 'chest pain'. The patient is diabetic and has a prior history of coronary artery disease. The patient presents today stating that his chest pain started yesterday evening and has been somewhat intermittent. He has been advised Aspirin 81 milligrams QDay. Humulin N. insulin 50 units in a.m. HCTZ 50 mg QDay. Nitroglycerin 1/150 sublingually PRN chest pain.""").toDS.toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + +
+ +{:.h2_title} +## Results +The output is a dataframe with a sentence per row and a ``"ner"`` column containing all of the entity labels in the sentence, entity character indices, and other metadata. + +```bash + ++--------------------------------+---------+ +|chunk |ner_label| ++--------------------------------+---------+ +|Aspirin 81 milligrams |DRUG | +|Humulin N |DRUG | +|insulin 50 units |DRUG | +|HCTZ 50 mg |DRUG | +|Nitroglycerin 1/150 sublingually|DRUG | ++--------------------------------+---------+ +``` + +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|ner_drugs_large_en_2.6.0_2.4| +|Type:|ner| +|Compatibility:|Spark NLP 2.7.1+| +|Edition:|Official| +|License:|Licensed| +|Input Labels:|[sentence,token, embeddings]| +|Output Labels:|[ner]| +|Language:|[en]| +|Case sensitive:|false| + +{:.h2_title} +## Data Source +Trained on i2b2_med7 + FDA with 'embeddings_clinical'. +https://www.i2b2.org/NLP/Medication + +{:.h2_title} +## Benchmarking + +Since this NER model is crafted from `ner_posology` but reduced to single entity, no benchmark is applicable. diff --git a/docs/_posts/dcecchini/2021-07-23-cantemist_scielowiki_es.md b/docs/_posts/dcecchini/2021-07-23-cantemist_scielowiki_es.md index 4d7b953acb4de..f8421359f05a1 100644 --- a/docs/_posts/dcecchini/2021-07-23-cantemist_scielowiki_es.md +++ b/docs/_posts/dcecchini/2021-07-23-cantemist_scielowiki_es.md @@ -1,343 +1,338 @@ ---- -layout: model -title: Detect tumor morphology in Spanish texts -author: John Snow Labs -name: cantemist_scielowiki -date: 2021-07-23 -tags: [ner, licensed, oncology, es] -task: Named Entity Recognition -language: es -edition: Healthcare NLP 3.1.2 -spark_version: 3.0 -supported: true -article_header: - type: cover -use_language_switcher: "Python-Scala-Java" ---- - - -## Description - - -Detect tumor morphology entities in Spanish text. 
- - -## Predicted Entities - - -`MORFOLOGIA_NEOPLASIA`, `O` - - -{:.btn-box} -[Live Demo](https://demo.johnsnowlabs.com/healthcare/NER_TUMOR_ES/){:.button.button-orange} -[Open in Colab](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp-workshop/blob/master/tutorials/Certification_Trainings/Healthcare/1.Clinical_Named_Entity_Recognition_Model.ipynb){:.button.button-orange.button-orange-trans.co.button-icon} -[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/clinical/models/cantemist_scielowiki_es_3.1.2_3.0_1627080305994.zip){:.button.button-orange.button-orange-trans.arr.button-icon} - - -## How to use - - - - - - -
-{% include programmingLanguageSelectScalaPythonNLU.html %} - -```python -document_assembler = DocumentAssembler()\ - .setInputCol('text')\ - .setOutputCol('document') - -sentence = SentenceDetector() \ - .setInputCols(["document"]) \ - .setOutputCol("sentence") - -tokenizer = Tokenizer() \ - .setInputCols(["sentence"]) \ - .setOutputCol("token") - -embedings_stage = WordEmbeddingsModel.pretrained("embeddings_scielowiki_300d", "es", "clinical/models")\ - .setInputCols(["document","token"])\ - .setOutputCol("word_embeddings") - -clinical_ner = MedicalNerModel.pretrained("cantemist_scielowiki", "es", "clinical/models")\ - .setInputCols(["sentence", "token", "embeddings"])\ - .setOutputCol("ner") - -ner_converter = NerConverter() \ - .setInputCols(['document', 'token', 'ner']) \ - .setOutputCol('ner_chunk') - -pipeline = Pipeline(stages=[ - document_assembler, - sentence, - tokenizer, - embedings_stage, - clinical_ner, - ner_converter -]) - -example = spark.createDataFrame(pd.DataFrame({'text': ["""Anamnesis Paciente de 37 años de edad sin antecedentes patológicos ni quirúrgicos de interés. En diciembre de 2012 consultó al Servicio de Urgencias por un cuadro de cefalea aguda e hipostesia del hemicuerpo izquierdo de 15 días de evolución refractario a tratamiento. Exploración neurológica sin focalidad; fondo de ojo: papiledema unilateral. Se solicitaron una TC del SNC, que objetiva una LOE frontal derecha con afectación aparente del cuerpo calloso, y una RM del SNC, que muestra un extenso proceso expansivo intraparenquimatoso frontal derecho que infiltra la rodilla del cuerpo calloso, mal delimitada y sin componente necrótico. Tras la administración de contraste se apreciaban diferentes realces parcheados en la lesión, pero sin definirse una cápsula con aumento del flujo sanguíneo en la lesión, características compatibles con linfoma o astrocitoma anaplásico . 
El 3 de enero de 2013 se efectúa biopsia intraoperatoria, con diagnóstico histológico de astrocitoma anaplásico GIII"""]})) - -result = pipeline.fit(example).transform(example) -``` -```scala -val document_assembler = new DocumentAssembler() - .setInputCol('text') - .setOutputCol('document') - -val sentence = new SentenceDetector() - .setInputCols(["document"]) - .setOutputCol("sentence") - -val tokenizer = new Tokenizer() - .setInputCols(["sentence"]) - .setOutputCol("token") - -val embedings_stage = WordEmbeddingsModel.pretrained("embeddings_scielowiki_300d", "es", "clinical/models") - .setInputCols(["document","token"]) - .setOutputCol("word_embeddings") - -val clinical_ner = MedicalNerModel.pretrained("cantemist_scielowiki", "es", "clinical/models") - .setInputCols(["sentence", "token", "embeddings"]) - .setOutputCol("ner") - -val ner_converter = new NerConverter() - .setInputCols(['document', 'token', 'ner']) - .setOutputCol('ner_chunk') - -val pipeline = new Pipeline().setStages(Array(document_assembler, sentence, tokenizer, embedings_stage, clinical_ner, ner_converter)) - -val data = Seq("""Anamnesis Paciente de 37 años de edad sin antecedentes patológicos ni quirúrgicos de interés. En diciembre de 2012 consultó al Servicio de Urgencias por un cuadro de cefalea aguda e hipostesia del hemicuerpo izquierdo de 15 días de evolución refractario a tratamiento. Exploración neurológica sin focalidad; fondo de ojo: papiledema unilateral. Se solicitaron una TC del SNC, que objetiva una LOE frontal derecha con afectación aparente del cuerpo calloso, y una RM del SNC, que muestra un extenso proceso expansivo intraparenquimatoso frontal derecho que infiltra la rodilla del cuerpo calloso, mal delimitada y sin componente necrótico. Tras la administración de contraste se apreciaban diferentes realces parcheados en la lesión, pero sin definirse una cápsula con aumento del flujo sanguíneo en la lesión, características compatibles con linfoma o astrocitoma anaplásico . 
El 3 de enero de 2013 se efectúa biopsia intraoperatoria, con diagnóstico histológico de astrocitoma anaplásico GIII""").toDS.toDF("text") - -val result = pipeline.fit(data).transform(data) -``` -
- - -## Results - - -```bash -+---------------------+----------------------+ -| token | prediction | -+---------------------+----------------------+ -| Anamnesis | O | -| Paciente | O | -| de | O | -| 37 | O | -| años | O | -| de | O | -| edad | O | -| sin | O | -| antecedentes | O | -| patológicos | O | -| ni | O | -| quirúrgicos | O | -| de | O | -| interés | O | -| . | O | -| En | O | -| diciembre | O | -| de | O | -| 2012 | O | -| consultó | O | -| al | O | -| Servicio | O | -| de | O | -| Urgencias | O | -| por | O | -| un | O | -| cuadro | O | -| de | O | -| cefalea | O | -| aguda | O | -| e | O | -| hipostesia | O | -| del | O | -| hemicuerpo | O | -| izquierdo | O | -| de | O | -| 15 | O | -| días | O | -| de | O | -| evolución | O | -| refractario | O | -| a | O | -| tratamiento | O | -| . | O | -| Exploración | O | -| neurológica | O | -| sin | O | -| focalidad | O | -| ; | O | -| fondo | O | -| de | O | -| ojo | O | -| : | O | -| papiledema | O | -| unilateral | O | -| . | O | -| Se | O | -| solicitaron | O | -| una | O | -| TC | O | -| del | O | -| SNC | B-MORFOLOGIA_NEOP... | -| , | O | -| que | O | -| objetiva | O | -| una | O | -| LOE | O | -| frontal | O | -| derecha | O | -| con | O | -| afectación | B-MORFOLOGIA_NEOP... | -| aparente | I-MORFOLOGIA_NEOP... | -| del | I-MORFOLOGIA_NEOP... | -| cuerpo | I-MORFOLOGIA_NEOP... | -| calloso | I-MORFOLOGIA_NEOP... | -| , | O | -| y | O | -| una | O | -| RM | B-MORFOLOGIA_NEOP... | -| del | I-MORFOLOGIA_NEOP... | -| SNC | I-MORFOLOGIA_NEOP... | -| , | O | -| que | O | -| muestra | O | -| un | O | -| extenso | O | -| proceso | B-MORFOLOGIA_NEOP... | -| expansivo | I-MORFOLOGIA_NEOP... | -| intraparenquimatoso | I-MORFOLOGIA_NEOP... | -| frontal | I-MORFOLOGIA_NEOP... | -| derecho | I-MORFOLOGIA_NEOP... | -| que | I-MORFOLOGIA_NEOP... | -| infiltra | I-MORFOLOGIA_NEOP... | -| la | I-MORFOLOGIA_NEOP... | -| rodilla | I-MORFOLOGIA_NEOP... | -| del | I-MORFOLOGIA_NEOP... | -| cuerpo | I-MORFOLOGIA_NEOP... 
| -| calloso | I-MORFOLOGIA_NEOP... | -| , | O | -| mal | O | -| delimitada | O | -| y | O | -| sin | O | -| componente | O | -| necrótico | O | -| . | O | -| Tras | O | -| la | O | -| administración | O | -| de | O | -| contraste | O | -| se | O | -| apreciaban | O | -| diferentes | O | -| realces | O | -| parcheados | O | -| en | O | -| la | O | -| lesión | O | -| , | O | -| pero | O | -| sin | O | -| definirse | O | -| una | O | -| cápsula | O | -| con | O | -| aumento | O | -| del | O | -| flujo | O | -| sanguíneo | O | -| en | O | -| la | O | -| lesión | O | -| , | O | -| características | O | -| compatibles | O | -| con | O | -| linfoma | O | -| o | O | -| astrocitoma | B-MORFOLOGIA_NEOP... | -| anaplásico | I-MORFOLOGIA_NEOP... | -| . | O | -| El | O | -| 3 | O | -| de | O | -| enero | O | -| de | O | -| 2013 | O | -| se | O | -| efectúa | O | -| biopsia | O | -| intraoperatoria | O | -| , | O | -| con | O | -| diagnóstico | O | -| histológico | O | -| de | O | -| astrocitoma | B-MORFOLOGIA_NEOP... | -| anaplásico | I-MORFOLOGIA_NEOP... | -| GIII | I-MORFOLOGIA_NEOP... | -+---------------------+----------------------+ -``` - - -{:.model-param} -## Model Information - - -{:.table-model} -|---|---| -|Model Name:|cantemist_scielowiki| -|Compatibility:|Healthcare NLP 3.1.2+| -|License:|Licensed| -|Edition:|Official| -|Input Labels:|[sentence, token, word_embeddings]| -|Output Labels:|[ner]| -|Language:|es| -|Dependencies:|embeddings_scielowiki_300d| - - -## Data Source - - -The model was trained with the [CANTEMIST](https://temu.bsc.es/cantemist/) data set: - - -> CANTEMIST is an annotated data set for oncology analysis in the Spanish language containing 1301 oncological clinical case reports with a total of 63,016 sentences and 1093,501 tokens. All documents of the corpus have been manually annotated by clinical experts with -mentions of tumor morphology (in Spanish, “morfología de neoplasia”). 
There are 16,030 tumor morphology mentions mapped to an eCIE-O code (850 unique codes) - - - - -References: - - - - -1. P. Ruas, A. Neves, V. D. Andrade, F. M. Couto, Lasigebiotm at cantemist: Named entity recognition and normalization of tumour morphology entities and clinical coding of Spanish health-related documents, in: Proceedings of the Iberian Languages Evaluation Forum (IberLEF 2020), CEUR Workshop Proceedings, 2020 - - -2. Antonio Miranda-Escalada, Eulàlia Farré-Maduell, Martin Krallinger. Named Entity Recognition, Concept Normalization and Clinical Coding: Overview of the Cantemist Track for Cancer Text Mining in Spanish, Corpus, Guidelines, Methods and Results. Proceedings of the Iberian Languages Evaluation Forum (IberLEF 2020), CEUR Workshop Proceedings. 303-323 (2020). - - -## Benchmarking - - -```bash -label precision recall f1-score support -B-MORFOLOGIA_NEOPLASIA 0.94 0.73 0.83 2474 -I-MORFOLOGIA_NEOPLASIA 0.81 0.74 0.77 3169 -O 0.99 1.00 1.00 283006 -accuracy - - 0.99 288649 -macro-avg 0.92 0.82 0.87 288649 -weighted-avg 0.99 0.99 0.99 288649 -``` - \ No newline at end of file +--- +layout: model +title: Detect tumor morphology in Spanish texts +author: John Snow Labs +name: cantemist_scielowiki +date: 2021-07-23 +tags: [ner, licensed, oncology, es] +task: Named Entity Recognition +language: es +edition: Spark NLP for Healthcare 3.1.2 +spark_version: 3.0 +supported: true +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + + +## Description + +Detect tumor morphology entities in Spanish text. 
+ + +## Predicted Entities + +`MORFOLOGIA_NEOPLASIA`, `O` + + +{:.btn-box} +[Live Demo](https://demo.johnsnowlabs.com/healthcare/NER_TUMOR_ES/){:.button.button-orange} +[Open in Colab](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp-workshop/blob/master/tutorials/Certification_Trainings/Healthcare/1.Clinical_Named_Entity_Recognition_Model.ipynb){:.button.button-orange.button-orange-trans.co.button-icon} +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/clinical/models/cantemist_scielowiki_es_3.1.2_3.0_1627080305994.zip){:.button.button-orange.button-orange-trans.arr.button-icon} + + +## How to use + + + + + + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} + +```python +document_assembler = DocumentAssembler()\ + .setInputCol('text')\ + .setOutputCol('document') + +sentence = SentenceDetector() \ + .setInputCols(["document"]) \ + .setOutputCol("sentence") + +tokenizer = Tokenizer() \ + .setInputCols(["sentence"]) \ + .setOutputCol("token") + +embedings_stage = WordEmbeddingsModel.pretrained("embeddings_scielowiki_300d", "es", "clinical/models")\ + .setInputCols(["document","token"])\ + .setOutputCol("word_embeddings") + +clinical_ner = MedicalNerModel.pretrained("cantemist_scielowiki", "es", "clinical/models")\ + .setInputCols(["sentence", "token", "word_embeddings"])\ + .setOutputCol("ner") + +ner_converter = NerConverter() \ + .setInputCols(['document', 'token', 'ner']) \ + .setOutputCol('ner_chunk') + +pipeline = Pipeline(stages=[ + document_assembler, + sentence, + tokenizer, + embedings_stage, + clinical_ner, + ner_converter +]) + +data = spark.createDataFrame([["""Anamnesis Paciente de 37 años de edad sin antecedentes patológicos ni quirúrgicos de interés. En diciembre de 2012 consultó al Servicio de Urgencias por un cuadro de cefalea aguda e hipostesia del hemicuerpo izquierdo de 15 días de evolución refractario a tratamiento. Exploración neurológica sin focalidad; fondo de ojo: papiledema unilateral. Se solicitaron una TC del SNC, que objetiva una LOE frontal derecha con afectación aparente del cuerpo calloso, y una RM del SNC, que muestra un extenso proceso expansivo intraparenquimatoso frontal derecho que infiltra la rodilla del cuerpo calloso, mal delimitada y sin componente necrótico. Tras la administración de contraste se apreciaban diferentes realces parcheados en la lesión, pero sin definirse una cápsula con aumento del flujo sanguíneo en la lesión, características compatibles con linfoma o astrocitoma anaplásico . 
El 3 de enero de 2013 se efectúa biopsia intraoperatoria, con diagnóstico histológico de astrocitoma anaplásico GIII"""]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val document_assembler = new DocumentAssembler() + .setInputCol("text") + .setOutputCol("document") + +val sentence = new SentenceDetector() + .setInputCols("document") + .setOutputCol("sentence") + +val tokenizer = new Tokenizer() + .setInputCols("sentence") + .setOutputCol("token") + +val embedings_stage = WordEmbeddingsModel.pretrained("embeddings_scielowiki_300d", "es", "clinical/models") + .setInputCols(Array("document","token")) + .setOutputCol("word_embeddings") + +val clinical_ner = MedicalNerModel.pretrained("cantemist_scielowiki", "es", "clinical/models") + .setInputCols(Array("sentence", "token", "word_embeddings")) + .setOutputCol("ner") + +val ner_converter = new NerConverter() + .setInputCols(Array("document", "token", "ner")) + .setOutputCol("ner_chunk") + +val pipeline = new Pipeline().setStages(Array(document_assembler, sentence, tokenizer, embedings_stage, clinical_ner, ner_converter)) + +val data = Seq("""Anamnesis Paciente de 37 años de edad sin antecedentes patológicos ni quirúrgicos de interés. En diciembre de 2012 consultó al Servicio de Urgencias por un cuadro de cefalea aguda e hipostesia del hemicuerpo izquierdo de 15 días de evolución refractario a tratamiento. Exploración neurológica sin focalidad; fondo de ojo: papiledema unilateral. Se solicitaron una TC del SNC, que objetiva una LOE frontal derecha con afectación aparente del cuerpo calloso, y una RM del SNC, que muestra un extenso proceso expansivo intraparenquimatoso frontal derecho que infiltra la rodilla del cuerpo calloso, mal delimitada y sin componente necrótico. 
Tras la administración de contraste se apreciaban diferentes realces parcheados en la lesión, pero sin definirse una cápsula con aumento del flujo sanguíneo en la lesión, características compatibles con linfoma o astrocitoma anaplásico . El 3 de enero de 2013 se efectúa biopsia intraoperatoria, con diagnóstico histológico de astrocitoma anaplásico GIII""").toDS.toDF("text") + +val result = pipeline.fit(data).transform(data) +``` +
+ + +## Results + + +```bash ++---------------------+----------------------+ +| token | prediction | ++---------------------+----------------------+ +| Anamnesis | O | +| Paciente | O | +| de | O | +| 37 | O | +| años | O | +| de | O | +| edad | O | +| sin | O | +| antecedentes | O | +| patológicos | O | +| ni | O | +| quirúrgicos | O | +| de | O | +| interés | O | +| . | O | +| En | O | +| diciembre | O | +| de | O | +| 2012 | O | +| consultó | O | +| al | O | +| Servicio | O | +| de | O | +| Urgencias | O | +| por | O | +| un | O | +| cuadro | O | +| de | O | +| cefalea | O | +| aguda | O | +| e | O | +| hipostesia | O | +| del | O | +| hemicuerpo | O | +| izquierdo | O | +| de | O | +| 15 | O | +| días | O | +| de | O | +| evolución | O | +| refractario | O | +| a | O | +| tratamiento | O | +| . | O | +| Exploración | O | +| neurológica | O | +| sin | O | +| focalidad | O | +| ; | O | +| fondo | O | +| de | O | +| ojo | O | +| : | O | +| papiledema | O | +| unilateral | O | +| . | O | +| Se | O | +| solicitaron | O | +| una | O | +| TC | O | +| del | O | +| SNC | B-MORFOLOGIA_NEOP... | +| , | O | +| que | O | +| objetiva | O | +| una | O | +| LOE | O | +| frontal | O | +| derecha | O | +| con | O | +| afectación | B-MORFOLOGIA_NEOP... | +| aparente | I-MORFOLOGIA_NEOP... | +| del | I-MORFOLOGIA_NEOP... | +| cuerpo | I-MORFOLOGIA_NEOP... | +| calloso | I-MORFOLOGIA_NEOP... | +| , | O | +| y | O | +| una | O | +| RM | B-MORFOLOGIA_NEOP... | +| del | I-MORFOLOGIA_NEOP... | +| SNC | I-MORFOLOGIA_NEOP... | +| , | O | +| que | O | +| muestra | O | +| un | O | +| extenso | O | +| proceso | B-MORFOLOGIA_NEOP... | +| expansivo | I-MORFOLOGIA_NEOP... | +| intraparenquimatoso | I-MORFOLOGIA_NEOP... | +| frontal | I-MORFOLOGIA_NEOP... | +| derecho | I-MORFOLOGIA_NEOP... | +| que | I-MORFOLOGIA_NEOP... | +| infiltra | I-MORFOLOGIA_NEOP... | +| la | I-MORFOLOGIA_NEOP... | +| rodilla | I-MORFOLOGIA_NEOP... | +| del | I-MORFOLOGIA_NEOP... | +| cuerpo | I-MORFOLOGIA_NEOP... 
| +| calloso | I-MORFOLOGIA_NEOP... | +| , | O | +| mal | O | +| delimitada | O | +| y | O | +| sin | O | +| componente | O | +| necrótico | O | +| . | O | +| Tras | O | +| la | O | +| administración | O | +| de | O | +| contraste | O | +| se | O | +| apreciaban | O | +| diferentes | O | +| realces | O | +| parcheados | O | +| en | O | +| la | O | +| lesión | O | +| , | O | +| pero | O | +| sin | O | +| definirse | O | +| una | O | +| cápsula | O | +| con | O | +| aumento | O | +| del | O | +| flujo | O | +| sanguíneo | O | +| en | O | +| la | O | +| lesión | O | +| , | O | +| características | O | +| compatibles | O | +| con | O | +| linfoma | O | +| o | O | +| astrocitoma | B-MORFOLOGIA_NEOP... | +| anaplásico | I-MORFOLOGIA_NEOP... | +| . | O | +| El | O | +| 3 | O | +| de | O | +| enero | O | +| de | O | +| 2013 | O | +| se | O | +| efectúa | O | +| biopsia | O | +| intraoperatoria | O | +| , | O | +| con | O | +| diagnóstico | O | +| histológico | O | +| de | O | +| astrocitoma | B-MORFOLOGIA_NEOP... | +| anaplásico | I-MORFOLOGIA_NEOP... | +| GIII | I-MORFOLOGIA_NEOP... | ++---------------------+----------------------+ +``` + + +{:.model-param} +## Model Information + + +{:.table-model} +|---|---| +|Model Name:|cantemist_scielowiki| +|Compatibility:|Spark NLP for Healthcare 3.1.2+| +|License:|Licensed| +|Edition:|Official| +|Input Labels:|[sentence, token, word_embeddings]| +|Output Labels:|[ner]| +|Language:|es| +|Dependencies:|embeddings_scielowiki_300d| + + +## Data Source + + +The model was trained with the [CANTEMIST](https://temu.bsc.es/cantemist/) data set: + + +> CANTEMIST is an annotated data set for oncology analysis in the Spanish language containing 1301 oncological clinical case reports with a total of 63,016 sentences and 1093,501 tokens. All documents of the corpus have been manually annotated by clinical experts with +mentions of tumor morphology (in Spanish, “morfología de neoplasia”). 
There are 16,030 tumor morphology mentions mapped to an eCIE-O code (850 unique codes) + + + + +References: + + + + +1. P. Ruas, A. Neves, V. D. Andrade, F. M. Couto, Lasigebiotm at cantemist: Named entity recognition and normalization of tumour morphology entities and clinical coding of Spanish health-related documents, in: Proceedings of the Iberian Languages Evaluation Forum (IberLEF 2020), CEUR Workshop Proceedings, 2020 + + +2. Antonio Miranda-Escalada, Eulàlia Farré-Maduell, Martin Krallinger. Named Entity Recognition, Concept Normalization and Clinical Coding: Overview of the Cantemist Track for Cancer Text Mining in Spanish, Corpus, Guidelines, Methods and Results. Proceedings of the Iberian Languages Evaluation Forum (IberLEF 2020), CEUR Workshop Proceedings. 303-323 (2020). + + +## Benchmarking + + +```bash +label precision recall f1-score support +B-MORFOLOGIA_NEOPLASIA 0.94 0.73 0.83 2474 +I-MORFOLOGIA_NEOPLASIA 0.81 0.74 0.77 3169 +O 0.99 1.00 1.00 283006 +accuracy - - 0.99 288649 +macro-avg 0.92 0.82 0.87 288649 +weighted-avg 0.99 0.99 0.99 288649 +``` \ No newline at end of file diff --git a/docs/_posts/gpirge/2022-07-28-bert_token_classifier_disease_mentions_tweet_es_3_0.md b/docs/_posts/gpirge/2022-07-28-bert_token_classifier_disease_mentions_tweet_es_3_0.md index 0aafdc139dbab..cdf0e8547d200 100644 --- a/docs/_posts/gpirge/2022-07-28-bert_token_classifier_disease_mentions_tweet_es_3_0.md +++ b/docs/_posts/gpirge/2022-07-28-bert_token_classifier_disease_mentions_tweet_es_3_0.md @@ -65,11 +65,12 @@ pipeline = Pipeline(stages=[ tokenClassifier, ner_converter]) -model = pipeline.fit(spark.createDataFrame(pd.DataFrame({'text': ['']}))) + +data = spark.createDataFrame([["""El diagnóstico fueron varios. Principal: Neumonía en el pulmón derecho. Sinusitis de caballo, Faringitis aguda e infección de orina, también elevada. Gripe No. 
Estuvo hablando conmigo, sin exagerar, mas de media hora, dándome ánimo y fuerza y que sabe, porque ha visto."""]]).toDF("text") -data = spark.createDataFrame(["El diagnóstico fueron varios. Principal: Neumonía en el pulmón derecho. Sinusitis de caballo, Faringitis aguda e infección de orina, también elevada. Gripe No. Estuvo hablando conmigo, sin exagerar, mas de media hora, dándome ánimo y fuerza y que sabe, porque ha visto"], StringType()).toDF("text") -result = model.transform(data) +result = pipeline.fit(data).transform(data) + ``` ```scala val documentAssembler = new DocumentAssembler() @@ -103,7 +104,7 @@ val pipeline = new Pipeline().setStages(Array( val data = Seq(Array("El diagnóstico fueron varios. Principal: Neumonía en el pulmón derecho. Sinusitis de caballo, Faringitis aguda e infección de orina, también elevada. Gripe No. Estuvo hablando conmigo, sin exagerar, mas de media hora, dándome ánimo y fuerza y que sabe, porque ha visto")).toDS().toDF("text") -val result = model.fit(data).transform(data) +val result = pipeline.fit(data).transform(data) ``` diff --git a/docs/_posts/gpirge/2022-07-28-bert_token_classifier_ner_pathogen_en_3_0.md b/docs/_posts/gpirge/2022-07-28-bert_token_classifier_ner_pathogen_en_3_0.md index a9c91dfb68c0f..4b750aba68005 100644 --- a/docs/_posts/gpirge/2022-07-28-bert_token_classifier_ner_pathogen_en_3_0.md +++ b/docs/_posts/gpirge/2022-07-28-bert_token_classifier_ner_pathogen_en_3_0.md @@ -66,11 +66,10 @@ pipeline = Pipeline(stages=[ ]) -model = pipeline.fit(spark.createDataFrame(pd.DataFrame({'text': ['']}))) +data = spark.createDataFrame([["""Racecadotril is an antisecretory medication and it has better tolerability than loperamide. Diarrhea is the condition of having loose, liquid or watery bowel movements each day. Signs of dehydration often begin with loss of the normal stretchiness of the skin. 
This can progress to loss of skin color, a fast heart rate as it becomes more severe; while it has been speculated that rabies virus, Lyssavirus and Ephemerovirus could be transmitted through aerosols, studies have concluded that this is only feasible in limited conditions."""]]).toDF("text") -data = spark.createDataFrame(["""Racecadotril is an antisecretory medication and it has better tolerability than loperamide. Diarrhea is the condition of having loose, liquid or watery bowel movements each day. Signs of dehydration often begin with loss of the normal stretchiness of the skin. This can progress to loss of skin color, a fast heart rate as it becomes more severe; while it has been speculated that rabies virus, Lyssavirus and Ephemerovirus could be transmitted through aerosols, studies have concluded that this is only feasible in limited conditions."""], StringType()).toDF("text") +result = pipeline.fit(data).transform(data) -result = model.transform(data) ``` ```scala val documenter = new DocumentAssembler() @@ -87,7 +86,7 @@ val tokenizer = new Tokenizer() val tokenClassifier = MedicalBertForTokenClassifier.pretrained("bert_token_classifier_ner_pathogen", "en", "clinical/models") .setInputCols(Array("token", 'sentence')) - .setOutputCol("label")\ + .setOutputCol("label") .setCaseSensitive(True) val ner_converter = new NerConverter() @@ -152,4 +151,4 @@ MedicalCondition 0.73 0.78 0.75 49 micro-avg 0.82 0.86 0.84 98 macro-avg 0.82 0.88 0.84 98 weighted-avg 0.82 0.86 0.84 98 -``` \ No newline at end of file +``` diff --git a/docs/_posts/gpirge/2022-08-11-bert_token_classifier_pharmacology_es_3_0.md b/docs/_posts/gpirge/2022-08-11-bert_token_classifier_pharmacology_es_3_0.md index 479f5fc93ebc2..3d4716a7f0547 100644 --- a/docs/_posts/gpirge/2022-08-11-bert_token_classifier_pharmacology_es_3_0.md +++ b/docs/_posts/gpirge/2022-08-11-bert_token_classifier_pharmacology_es_3_0.md @@ -66,12 +66,11 @@ pipeline = Pipeline(stages=[ tokenClassifier, ner_converter]) -model = 
pipeline.fit(spark.createDataFrame(pd.DataFrame({'text': ['']}))) + +data = spark.createDataFrame([["""Se realiza analítica destacando creatinkinasa 736 UI, LDH 545 UI, urea 63 mg/dl, CA 19.9 64,1 U/ml. Inmunofenotípicamente el tumor expresó vimentina, S-100, HMB-45 y actina. Se instauró el tratamiento con quimioterapia (Cisplatino, Interleukina II, Dacarbacina e Interferon alfa)."""]]).toDF("text") +result = pipeline.fit(data).transform(data) -data = spark.createDataFrame(["Se realiza analítica destacando creatinkinasa 736 UI, LDH 545 UI, urea 63 mg/dl, CA 19.9 64,1 U/ml. Inmunofenotípicamente el tumor expresó vimentina, S-100, HMB-45 y actina. Se instauró el tratamiento con quimioterapia (Cisplatino, Interleukina II, Dacarbacina e Interferon alfa)."], StringType()).toDF("text") - -result = model.transform(data) ``` ```scala val documentAssembler = new DocumentAssembler() @@ -105,7 +104,7 @@ val pipeline = new Pipeline().setStages(Array( val data = Seq(Array("Se realiza analítica destacando creatinkinasa 736 UI, LDH 545 UI, urea 63 mg/dl, CA 19.9 64,1 U/ml. Inmunofenotípicamente el tumor expresó vimentina, S-100, HMB-45 y actina. 
Se instauró el tratamiento con quimioterapia (Cisplatino, Interleukina II, Dacarbacina e Interferon alfa).")).toDS().toDF("text") -val result = model.fit(data).transform(data) +val result = pipeline.fit(data).transform(data) ``` diff --git a/docs/_posts/gpirge/2022-08-13-ner_negation_uncertainty_es_3_0.md b/docs/_posts/gpirge/2022-08-13-ner_negation_uncertainty_es_3_0.md index a928c9f6b2fe2..6d7f5cb2e2cff 100644 --- a/docs/_posts/gpirge/2022-08-13-ner_negation_uncertainty_es_3_0.md +++ b/docs/_posts/gpirge/2022-08-13-ner_negation_uncertainty_es_3_0.md @@ -61,7 +61,7 @@ ner_converter = NerConverter()\ .setInputCols(["sentence", "token", "ner"])\ .setOutputCol("ner_chunk") -pipeline = pipeline(stages=[ +pipeline = Pipeline(stages=[ document_assembler, sentenceDetectorDL, tokenizer, @@ -75,8 +75,8 @@ result = pipeline.fit(data).transform(data) ``` ```scala val documenter = new DocumentAssembler() - .setInputCol("text") - .setOutputCol("document") + .setInputCol("text") + .setOutputCol("document") val sentenceDetector = SentenceDetectorDLModel.pretrained() .setInputCols("document") @@ -87,16 +87,16 @@ val tokenizer = new Tokenizer() .setOutputCol("token") val word_embeddings = RoBertaEmbeddings.pretrained("roberta_base_biomedical", "es") - .setInputCols(["sentence","token"]) - .setOutputCol("embeddings") + .setInputCols(Array("sentence","token")) + .setOutputCol("embeddings") val ner_model = MedicalNerModel.pretrained("ner_negation_uncertainty", "es", "clinical/models") - .setInputCols(Array("sentence", "token", "embeddings")) - .setOutputCol("ner") + .setInputCols(Array("sentence", "token", "embeddings")) + .setOutputCol("ner") val ner_converter = new NerConverter() - .setInputCols(Array("sentence", "token", "ner")) - .setOutputCol("ner_chunk") + .setInputCols(Array("sentence", "token", "ner")) + .setOutputCol("ner_chunk") val pipeline = new Pipeline().setStages(Array(documenter, sentenceDetector, tokenizer, word_embeddings, ner_model, ner_converter)) diff --git 
a/docs/_posts/gpirge/2022-08-13-ner_pharmacology_es_3_0.md b/docs/_posts/gpirge/2022-08-13-ner_pharmacology_es_3_0.md index 863bacf470d07..d9763accc8ac8 100644 --- a/docs/_posts/gpirge/2022-08-13-ner_pharmacology_es_3_0.md +++ b/docs/_posts/gpirge/2022-08-13-ner_pharmacology_es_3_0.md @@ -60,7 +60,7 @@ ner_converter = NerConverter()\ .setInputCols(["sentence", "token", "ner"])\ .setOutputCol("ner_chunk") -pipeline = pipeline(stages=[ +pipeline = Pipeline(stages=[ document_assembler, sentenceDetectorDL, tokenizer, @@ -74,8 +74,8 @@ result = pipeline.fit(data).transform(data) ``` ```scala val documenter = new DocumentAssembler() - .setInputCol("text") - .setOutputCol("document") + .setInputCol("text") + .setOutputCol("document") val sentenceDetector = SentenceDetectorDLModel.pretrained() .setInputCols("document") @@ -86,16 +86,16 @@ val tokenizer = new Tokenizer() .setOutputCol("token") val word_embeddings = RoBertaEmbeddings.pretrained("roberta_base_biomedical", "es") - .setInputCols(["sentence","token"]) - .setOutputCol("embeddings") + .setInputCols(Array("sentence","token")) + .setOutputCol("embeddings") val ner_model = MedicalNerModel.pretrained("ner_pharmacology", "es", "clinical/models") - .setInputCols(Array("sentence", "token", "embeddings")) - .setOutputCol("ner") + .setInputCols(Array("sentence", "token", "embeddings")) + .setOutputCol("ner") val ner_converter = new NerConverter() - .setInputCols(Array("sentence", "token", "ner")) - .setOutputCol("ner_chunk") + .setInputCols(Array("sentence", "token", "ner")) + .setOutputCol("ner_chunk") val pipeline = new Pipeline().setStages(Array(documenter, sentenceDetector, tokenizer, word_embeddings, ner_model, ner_converter)) diff --git a/docs/_posts/gpirge/2022-08-14-disease_mentions_tweet_es_3_0.md b/docs/_posts/gpirge/2022-08-14-disease_mentions_tweet_es_3_0.md index 96da4594e242e..23c9e59c6ce77 100644 --- a/docs/_posts/gpirge/2022-08-14-disease_mentions_tweet_es_3_0.md +++ 
b/docs/_posts/gpirge/2022-08-14-disease_mentions_tweet_es_3_0.md @@ -61,7 +61,7 @@ ner_converter = NerConverter()\ .setInputCols(["sentence", "token", "ner"])\ .setOutputCol("ner_chunk") -pipeline = pipeline(stages=[ +pipeline = Pipeline(stages=[ document_assembler, sentenceDetectorDL, tokenizer, @@ -79,16 +79,16 @@ val documenter = new DocumentAssembler() .setOutputCol("document") val sentenceDetector = SentenceDetectorDLModel.pretrained() - .setInputCols("document") - .setOutputCol("sentence") + .setInputCols("document") + .setOutputCol("sentence") val tokenizer = new Tokenizer() - .setInputCols("sentence") - .setOutputCol("token") + .setInputCols("sentence") + .setOutputCol("token") val word_embeddings = WordEmbeddingsModel.pretrained("embeddings_scielo_300d","es","clinical/models") - .setInputCols(["sentence","token"]) - .setOutputCol("embeddings") + .setInputCols(Array("sentence","token")) + .setOutputCol("embeddings") val ner_model = MedicalNerModel.pretrained("disease_mentions_tweet", "es", "clinical/models") .setInputCols(Array("sentence", "token", "embeddings")) diff --git a/docs/_posts/mauro-nievoff/2022-07-25-assertion_oncology_treatment_binary_wip_en_3_0.md b/docs/_posts/mauro-nievoff/2022-07-25-assertion_oncology_treatment_binary_wip_en_3_0.md index cf185c1f4825d..e120fd6f0befe 100644 --- a/docs/_posts/mauro-nievoff/2022-07-25-assertion_oncology_treatment_binary_wip_en_3_0.md +++ b/docs/_posts/mauro-nievoff/2022-07-25-assertion_oncology_treatment_binary_wip_en_3_0.md @@ -84,11 +84,11 @@ val documentAssembler = new DocumentAssembler() .setOutputCol("document") val sentenceDetector = SentenceDetectorDLModel.pretrained("sentence_detector_dl_healthcare", "en", "clinical/models") - .setInputCols("document") + .setInputCols(Array("document")) .setOutputCol("sentence") val tokenizer = new Tokenizer() - .setInputCols("sentence") + .setInputCols(Array("sentence")) .setOutputCol("token") val embeddings = WordEmbeddingsModel.pretrained("embeddings_clinical", 
"en", "clinical/models") diff --git a/docs/_posts/mauro-nievoff/2022-07-25-ner_oncology_wip_en_3_0.md b/docs/_posts/mauro-nievoff/2022-07-25-ner_oncology_wip_en_3_0.md index ca17b3f149111..6e3cc6a7ebb1d 100644 --- a/docs/_posts/mauro-nievoff/2022-07-25-ner_oncology_wip_en_3_0.md +++ b/docs/_posts/mauro-nievoff/2022-07-25-ner_oncology_wip_en_3_0.md @@ -78,11 +78,11 @@ val documentAssembler = new DocumentAssembler() .setOutputCol("document") val sentenceDetector = SentenceDetectorDLModel.pretrained("sentence_detector_dl_healthcare", "en", "clinical/models") - .setInputCols("document") + .setInputCols(Array("document")) .setOutputCol("sentence") val tokenizer = new Tokenizer() - .setInputCols("sentence") + .setInputCols(Array("sentence")) .setOutputCol("token") val embeddings = WordEmbeddingsModel.pretrained("embeddings_clinical", "en", "clinical/models") diff --git a/docs/_posts/mauro-nievoff/2022-09-26-re_oncology_size_wip_en.md b/docs/_posts/mauro-nievoff/2022-09-26-re_oncology_size_wip_en.md index af0218fa8e8b1..58b9b44255573 100644 --- a/docs/_posts/mauro-nievoff/2022-09-26-re_oncology_size_wip_en.md +++ b/docs/_posts/mauro-nievoff/2022-09-26-re_oncology_size_wip_en.md @@ -93,11 +93,11 @@ val document_assembler = new DocumentAssembler() .setOutputCol("document") val sentence_detector = SentenceDetectorDLModel.pretrained("sentence_detector_dl_healthcare","en","clinical/models") - .setInputCols("document") + .setInputCols(Array("document")) .setOutputCol("sentence") val tokenizer = new Tokenizer() - .setInputCols("sentence") + .setInputCols(Array("sentence")) .setOutputCol("token") val word_embeddings = WordEmbeddingsModel().pretrained("embeddings_clinical", "en", "clinical/models") diff --git a/docs/_posts/mauro-nievoff/2022-09-27-re_oncology_biomarker_result_wip_en.md b/docs/_posts/mauro-nievoff/2022-09-27-re_oncology_biomarker_result_wip_en.md index 788970694e539..468acaad37a42 100644 --- 
a/docs/_posts/mauro-nievoff/2022-09-27-re_oncology_biomarker_result_wip_en.md +++ b/docs/_posts/mauro-nievoff/2022-09-27-re_oncology_biomarker_result_wip_en.md @@ -93,11 +93,11 @@ val document_assembler = new DocumentAssembler() .setOutputCol("document") val sentence_detector = SentenceDetectorDLModel.pretrained("sentence_detector_dl_healthcare","en","clinical/models") - .setInputCols("document") + .setInputCols(Array("document")) .setOutputCol("sentence") val tokenizer = new Tokenizer() - .setInputCols("sentence") + .setInputCols(Array("sentence")) .setOutputCol("token") val word_embeddings = WordEmbeddingsModel().pretrained("embeddings_clinical", "en", "clinical/models") diff --git a/docs/_posts/mauro-nievoff/2022-09-27-re_oncology_granular_wip_en.md b/docs/_posts/mauro-nievoff/2022-09-27-re_oncology_granular_wip_en.md index 350e2f275915e..6571f4251b2be 100644 --- a/docs/_posts/mauro-nievoff/2022-09-27-re_oncology_granular_wip_en.md +++ b/docs/_posts/mauro-nievoff/2022-09-27-re_oncology_granular_wip_en.md @@ -93,11 +93,11 @@ val document_assembler = new DocumentAssembler() .setOutputCol("document") val sentence_detector = SentenceDetectorDLModel.pretrained("sentence_detector_dl_healthcare","en","clinical/models") - .setInputCols("document") + .setInputCols(Array("document")) .setOutputCol("sentence") val tokenizer = new Tokenizer() - .setInputCols("sentence") + .setInputCols(Array("sentence")) .setOutputCol("token") val word_embeddings = WordEmbeddingsModel().pretrained("embeddings_clinical", "en", "clinical/models") diff --git a/docs/_posts/mauro-nievoff/2022-09-27-re_oncology_location_wip_en.md b/docs/_posts/mauro-nievoff/2022-09-27-re_oncology_location_wip_en.md index 62ab8af55dbee..65be0ffd8178d 100644 --- a/docs/_posts/mauro-nievoff/2022-09-27-re_oncology_location_wip_en.md +++ b/docs/_posts/mauro-nievoff/2022-09-27-re_oncology_location_wip_en.md @@ -94,11 +94,11 @@ val document_assembler = new DocumentAssembler() .setOutputCol("document") val 
sentence_detector = SentenceDetectorDLModel.pretrained("sentence_detector_dl_healthcare","en","clinical/models") - .setInputCols("document") + .setInputCols(Array("document")) .setOutputCol("sentence") val tokenizer = new Tokenizer() - .setInputCols("sentence") + .setInputCols(Array("sentence")) .setOutputCol("token") val word_embeddings = WordEmbeddingsModel().pretrained("embeddings_clinical", "en", "clinical/models") diff --git a/docs/_posts/mauro-nievoff/2022-09-27-re_oncology_temporal_wip_en.md b/docs/_posts/mauro-nievoff/2022-09-27-re_oncology_temporal_wip_en.md index 461cee0864a47..5dd9eb9ed1498 100644 --- a/docs/_posts/mauro-nievoff/2022-09-27-re_oncology_temporal_wip_en.md +++ b/docs/_posts/mauro-nievoff/2022-09-27-re_oncology_temporal_wip_en.md @@ -93,11 +93,11 @@ val document_assembler = new DocumentAssembler() .setOutputCol("document") val sentence_detector = SentenceDetectorDLModel.pretrained("sentence_detector_dl_healthcare","en","clinical/models") - .setInputCols("document") + .setInputCols(Array("document")) .setOutputCol("sentence") val tokenizer = new Tokenizer() - .setInputCols("sentence") + .setInputCols(Array("sentence")) .setOutputCol("token") val word_embeddings = WordEmbeddingsModel().pretrained("embeddings_clinical", "en", "clinical/models") diff --git a/docs/_posts/mauro-nievoff/2022-09-27-re_oncology_test_result_wip_en.md b/docs/_posts/mauro-nievoff/2022-09-27-re_oncology_test_result_wip_en.md index 156fcb8141d52..6c0ebdf0cd3d6 100644 --- a/docs/_posts/mauro-nievoff/2022-09-27-re_oncology_test_result_wip_en.md +++ b/docs/_posts/mauro-nievoff/2022-09-27-re_oncology_test_result_wip_en.md @@ -93,11 +93,11 @@ val document_assembler = new DocumentAssembler() .setOutputCol("document") val sentence_detector = SentenceDetectorDLModel.pretrained("sentence_detector_dl_healthcare","en","clinical/models") - .setInputCols("document") + .setInputCols(Array("document")) .setOutputCol("sentence") val tokenizer = new Tokenizer() - .setInputCols("sentence") 
+ .setInputCols(Array("sentence")) .setOutputCol("token") val word_embeddings = WordEmbeddingsModel().pretrained("embeddings_clinical", "en", "clinical/models") diff --git a/docs/_posts/mauro-nievoff/2022-09-27-re_oncology_wip_en.md b/docs/_posts/mauro-nievoff/2022-09-27-re_oncology_wip_en.md index fdb341dde6b9a..5b23dda0034e3 100644 --- a/docs/_posts/mauro-nievoff/2022-09-27-re_oncology_wip_en.md +++ b/docs/_posts/mauro-nievoff/2022-09-27-re_oncology_wip_en.md @@ -94,11 +94,11 @@ val document_assembler = new DocumentAssembler() .setOutputCol("document") val sentence_detector = SentenceDetectorDLModel.pretrained("sentence_detector_dl_healthcare","en","clinical/models") - .setInputCols("document") + .setInputCols(Array("document")) .setOutputCol("sentence") val tokenizer = new Tokenizer() - .setInputCols("sentence") + .setInputCols(Array("sentence")) .setOutputCol("token") val word_embeddings = WordEmbeddingsModel().pretrained("embeddings_clinical", "en", "clinical/models") diff --git a/docs/_posts/mauro-nievoff/2022-09-28-redl_oncology_size_biobert_wip_en.md b/docs/_posts/mauro-nievoff/2022-09-28-redl_oncology_size_biobert_wip_en.md index 6e2e94d0e2a3d..9ca6590453079 100644 --- a/docs/_posts/mauro-nievoff/2022-09-28-redl_oncology_size_biobert_wip_en.md +++ b/docs/_posts/mauro-nievoff/2022-09-28-redl_oncology_size_biobert_wip_en.md @@ -92,11 +92,11 @@ val document_assembler = new DocumentAssembler() .setOutputCol("document") val sentence_detector = SentenceDetectorDLModel.pretrained("sentence_detector_dl_healthcare","en","clinical/models") - .setInputCols("document") + .setInputCols(Array("document")) .setOutputCol("sentence") val tokenizer = new Tokenizer() - .setInputCols("sentence") + .setInputCols(Array("sentence")) .setOutputCol("token") val word_embeddings = WordEmbeddingsModel().pretrained("embeddings_clinical", "en", "clinical/models") diff --git a/docs/_posts/mauro-nievoff/2022-09-29-redl_oncology_biobert_wip_en.md 
b/docs/_posts/mauro-nievoff/2022-09-29-redl_oncology_biobert_wip_en.md index ee10a7483c247..3c186e0e3f8ad 100644 --- a/docs/_posts/mauro-nievoff/2022-09-29-redl_oncology_biobert_wip_en.md +++ b/docs/_posts/mauro-nievoff/2022-09-29-redl_oncology_biobert_wip_en.md @@ -92,11 +92,11 @@ val document_assembler = new DocumentAssembler() .setOutputCol("document") val sentence_detector = SentenceDetectorDLModel.pretrained("sentence_detector_dl_healthcare","en","clinical/models") - .setInputCols("document") + .setInputCols(Array("document")) .setOutputCol("sentence") val tokenizer = new Tokenizer() - .setInputCols("sentence") + .setInputCols(Array("sentence")) .setOutputCol("token") val word_embeddings = WordEmbeddingsModel().pretrained("embeddings_clinical", "en", "clinical/models") diff --git a/docs/_posts/mauro-nievoff/2022-09-29-redl_oncology_biomarker_result_biobert_wip_en.md b/docs/_posts/mauro-nievoff/2022-09-29-redl_oncology_biomarker_result_biobert_wip_en.md index 473d8ca55c30e..938439efd7221 100644 --- a/docs/_posts/mauro-nievoff/2022-09-29-redl_oncology_biomarker_result_biobert_wip_en.md +++ b/docs/_posts/mauro-nievoff/2022-09-29-redl_oncology_biomarker_result_biobert_wip_en.md @@ -99,11 +99,11 @@ val document_assembler = new DocumentAssembler() .setOutputCol("document") val sentence_detector = SentenceDetectorDLModel.pretrained("sentence_detector_dl_healthcare","en","clinical/models") - .setInputCols("document") + .setInputCols(Array("document")) .setOutputCol("sentence") val tokenizer = new Tokenizer() - .setInputCols("sentence") + .setInputCols(Array("sentence")) .setOutputCol("token") val word_embeddings = WordEmbeddingsModel().pretrained("embeddings_clinical", "en", "clinical/models") diff --git a/docs/_posts/mauro-nievoff/2022-09-29-redl_oncology_granular_biobert_wip_en.md b/docs/_posts/mauro-nievoff/2022-09-29-redl_oncology_granular_biobert_wip_en.md index c8f8e2886ae05..b5ea60a586afc 100644 --- 
a/docs/_posts/mauro-nievoff/2022-09-29-redl_oncology_granular_biobert_wip_en.md +++ b/docs/_posts/mauro-nievoff/2022-09-29-redl_oncology_granular_biobert_wip_en.md @@ -92,11 +92,11 @@ val document_assembler = new DocumentAssembler() .setOutputCol("document") val sentence_detector = SentenceDetectorDLModel.pretrained("sentence_detector_dl_healthcare","en","clinical/models") - .setInputCols("document") + .setInputCols(Array("document")) .setOutputCol("sentence") val tokenizer = new Tokenizer() - .setInputCols("sentence") + .setInputCols(Array("sentence")) .setOutputCol("token") val word_embeddings = WordEmbeddingsModel().pretrained("embeddings_clinical", "en", "clinical/models") diff --git a/docs/_posts/mauro-nievoff/2022-09-29-redl_oncology_location_biobert_wip_en.md b/docs/_posts/mauro-nievoff/2022-09-29-redl_oncology_location_biobert_wip_en.md index c5c11970f67ef..160f7c4d253c0 100644 --- a/docs/_posts/mauro-nievoff/2022-09-29-redl_oncology_location_biobert_wip_en.md +++ b/docs/_posts/mauro-nievoff/2022-09-29-redl_oncology_location_biobert_wip_en.md @@ -34,6 +34,7 @@ Use relation pairs to include only the combinations of entities that are relevan
{% include programmingLanguageSelectScalaPythonNLU.html %} + ```python document_assembler = DocumentAssembler()\ .setInputCol("text")\ @@ -71,7 +72,7 @@ re_ner_chunk_filter = RENerChunksFilter()\ .setInputCols(["ner_chunk", "dependencies"])\ .setOutputCol("re_ner_chunk")\ .setMaxSyntacticDistance(10)\ - .setRelationPairs([""Tumor_Finding-Site_Breast", "Site_Breast-Tumor_Finding", "Tumor_Finding-Anatomical_Site", "Anatomical_Site-Tumor_Finding""]) + .setRelationPairs(["Tumor_Finding-Site_Breast", "Site_Breast-Tumor_Finding", "Tumor_Finding-Anatomical_Site", "Anatomical_Site-Tumor_Finding"]) re_model = RelationExtractionDLModel.pretrained("redl_oncology_location_biobert_wip", "en", "clinical/models")\ .setInputCols(["re_ner_chunk", "sentence"])\ @@ -98,11 +99,11 @@ val document_assembler = new DocumentAssembler() .setOutputCol("document") val sentence_detector = SentenceDetectorDLModel.pretrained("sentence_detector_dl_healthcare","en","clinical/models") - .setInputCols("document") + .setInputCols(Array("document")) .setOutputCol("sentence") val tokenizer = new Tokenizer() - .setInputCols("sentence") + .setInputCols(Array("sentence")) .setOutputCol("token") val word_embeddings = WordEmbeddingsModel().pretrained("embeddings_clinical", "en", "clinical/models") @@ -126,14 +127,14 @@ val dependency_parser = DependencyParserModel.pretrained("dependency_conllu", "e .setOutputCol("dependencies") val re_ner_chunk_filter = new RENerChunksFilter() - .setInputCols("ner_chunk", "dependencies") + .setInputCols(Array("ner_chunk", "dependencies")) .setOutputCol("re_ner_chunk") .setMaxSyntacticDistance(10) .setRelationPairs(Array("Tumor_Finding-Site_Breast", "Site_Breast-Tumor_Finding","Tumor_Finding-Anatomical_Site", "Anatomical_Site-Tumor_Finding")) val re_model = RelationExtractionDLModel.pretrained("redl_oncology_location_biobert_wip", "en", "clinical/models") .setPredictionThreshold(0.5f) - .setInputCols("re_ner_chunk", "sentence") + .setInputCols(Array("re_ner_chunk", 
"sentence")) .setOutputCol("relation_extraction") val pipeline = new Pipeline().setStages(Array(document_assembler, @@ -147,7 +148,7 @@ val pipeline = new Pipeline().setStages(Array(document_assembler, re_ner_chunk_filter, re_model)) -val data = Seq("In April 2011, she first noticed a lump in her right breast.").toDS.toDF("text") +val data = Seq("""In April 2011, she first noticed a lump in her right breast.""").toDS.toDF("text") val result = pipeline.fit(data).transform(data) ``` diff --git a/docs/_posts/mauro-nievoff/2022-09-29-redl_oncology_temporal_biobert_wip_en.md b/docs/_posts/mauro-nievoff/2022-09-29-redl_oncology_temporal_biobert_wip_en.md index e09f0fd9625dc..19dff4ef71bde 100644 --- a/docs/_posts/mauro-nievoff/2022-09-29-redl_oncology_temporal_biobert_wip_en.md +++ b/docs/_posts/mauro-nievoff/2022-09-29-redl_oncology_temporal_biobert_wip_en.md @@ -99,11 +99,11 @@ val document_assembler = new DocumentAssembler() .setOutputCol("document") val sentence_detector = SentenceDetectorDLModel.pretrained("sentence_detector_dl_healthcare","en","clinical/models") - .setInputCols("document") + .setInputCols(Array("document")) .setOutputCol("sentence") val tokenizer = new Tokenizer() - .setInputCols("sentence") + .setInputCols(Array("sentence")) .setOutputCol("token") val word_embeddings = WordEmbeddingsModel().pretrained("embeddings_clinical", "en", "clinical/models") diff --git a/docs/_posts/mauro-nievoff/2022-09-29-redl_oncology_test_result_biobert_wip_en.md b/docs/_posts/mauro-nievoff/2022-09-29-redl_oncology_test_result_biobert_wip_en.md index 4d7949d1d59e8..2740c0099a649 100644 --- a/docs/_posts/mauro-nievoff/2022-09-29-redl_oncology_test_result_biobert_wip_en.md +++ b/docs/_posts/mauro-nievoff/2022-09-29-redl_oncology_test_result_biobert_wip_en.md @@ -98,11 +98,11 @@ val document_assembler = new DocumentAssembler() .setOutputCol("document") val sentence_detector = SentenceDetectorDLModel.pretrained("sentence_detector_dl_healthcare","en","clinical/models") - 
.setInputCols("document") + .setInputCols(Array("document")) .setOutputCol("sentence") val tokenizer = new Tokenizer() - .setInputCols("sentence") + .setInputCols(Array("sentence")) .setOutputCol("token") val word_embeddings = WordEmbeddingsModel().pretrained("embeddings_clinical", "en", "clinical/models") diff --git a/docs/_posts/mauro-nievoff/2022-09-30-ner_oncology_anatomy_general_wip_en.md b/docs/_posts/mauro-nievoff/2022-09-30-ner_oncology_anatomy_general_wip_en.md index 00748b9a503bc..1cb14bf26ec8b 100644 --- a/docs/_posts/mauro-nievoff/2022-09-30-ner_oncology_anatomy_general_wip_en.md +++ b/docs/_posts/mauro-nievoff/2022-09-30-ner_oncology_anatomy_general_wip_en.md @@ -76,11 +76,11 @@ val document_assembler = new DocumentAssembler() .setOutputCol("document") val sentence_detector = SentenceDetectorDLModel.pretrained("sentence_detector_dl_healthcare","en","clinical/models") - .setInputCols("document") + .setInputCols(Array("document")) .setOutputCol("sentence") val tokenizer = new Tokenizer() - .setInputCols("sentence") + .setInputCols(Array("sentence")) .setOutputCol("token") val word_embeddings = WordEmbeddingsModel().pretrained("embeddings_clinical", "en", "clinical/models") diff --git a/docs/_posts/mauro-nievoff/2022-09-30-ner_oncology_demographics_wip_en.md b/docs/_posts/mauro-nievoff/2022-09-30-ner_oncology_demographics_wip_en.md index 0a02cf3289bb6..a817a2e0f6354 100644 --- a/docs/_posts/mauro-nievoff/2022-09-30-ner_oncology_demographics_wip_en.md +++ b/docs/_posts/mauro-nievoff/2022-09-30-ner_oncology_demographics_wip_en.md @@ -76,11 +76,11 @@ val document_assembler = new DocumentAssembler() .setOutputCol("document") val sentence_detector = SentenceDetectorDLModel.pretrained("sentence_detector_dl_healthcare","en","clinical/models") - .setInputCols("document") + .setInputCols(Array("document")) .setOutputCol("sentence") val tokenizer = new Tokenizer() - .setInputCols("sentence") + .setInputCols(Array("sentence")) .setOutputCol("token") 
.setSplitChars(["-"]) diff --git a/docs/_posts/mauro-nievoff/2022-09-30-ner_oncology_diagnosis_wip_en.md b/docs/_posts/mauro-nievoff/2022-09-30-ner_oncology_diagnosis_wip_en.md index f41b314591762..ec187be9cc99b 100644 --- a/docs/_posts/mauro-nievoff/2022-09-30-ner_oncology_diagnosis_wip_en.md +++ b/docs/_posts/mauro-nievoff/2022-09-30-ner_oncology_diagnosis_wip_en.md @@ -34,6 +34,7 @@ This model extracts entities related to cancer diagnosis, such as Metastasis, Hi
{% include programmingLanguageSelectScalaPythonNLU.html %} + ```python document_assembler = DocumentAssembler()\ .setInputCol("text")\ @@ -65,8 +66,7 @@ pipeline = Pipeline(stages=[document_assembler, ner, ner_converter]) -data = spark.createDataFrame([["Two years ago, the patient presented with a tumor in her left breast and adenopathies. She was diagnosed with invasive ductal carcinoma. -Last week she was also found to have a lung metastasis."]]).toDF("text") +data = spark.createDataFrame([["Two years ago, the patient presented with a tumor in her left breast and adenopathies. She was diagnosed with invasive ductal carcinoma. Last week she was also found to have a lung metastasis."]]).toDF("text") result = pipeline.fit(data).transform(data) ``` @@ -76,11 +76,11 @@ val document_assembler = new DocumentAssembler() .setOutputCol("document") val sentence_detector = SentenceDetectorDLModel.pretrained("sentence_detector_dl_healthcare","en","clinical/models") - .setInputCols("document") + .setInputCols(Array("document")) .setOutputCol("sentence") val tokenizer = new Tokenizer() - .setInputCols("sentence") + .setInputCols(Array("sentence")) .setOutputCol("token") val word_embeddings = WordEmbeddingsModel().pretrained("embeddings_clinical", "en", "clinical/models") @@ -160,4 +160,4 @@ Performance_Status 50.0 8.0 50.0 100.0 0.86 0.50 0.63 Grade 149.0 20.0 49.0 198.0 0.88 0.75 0.81 macro_avg 4022.0 549.0 1075.0 5097.0 0.87 0.73 0.79 micro_avg NaN NaN NaN NaN 0.88 0.79 0.83 -``` \ No newline at end of file +``` diff --git a/docs/_posts/mauro-nievoff/2022-09-30-ner_oncology_test_wip_en.md b/docs/_posts/mauro-nievoff/2022-09-30-ner_oncology_test_wip_en.md index 12d0ab1a1edcc..1c44de05933a4 100644 --- a/docs/_posts/mauro-nievoff/2022-09-30-ner_oncology_test_wip_en.md +++ b/docs/_posts/mauro-nievoff/2022-09-30-ner_oncology_test_wip_en.md @@ -75,11 +75,11 @@ val document_assembler = new DocumentAssembler() .setOutputCol("document") val sentence_detector = 
SentenceDetectorDLModel.pretrained("sentence_detector_dl_healthcare","en","clinical/models") - .setInputCols("document") + .setInputCols(Array("document")) .setOutputCol("sentence") val tokenizer = new Tokenizer() - .setInputCols("sentence") + .setInputCols(Array("sentence")) .setOutputCol("token") val word_embeddings = WordEmbeddingsModel().pretrained("embeddings_clinical", "en", "clinical/models") diff --git a/docs/_posts/mauro-nievoff/2022-09-30-ner_oncology_therapy_wip_en.md b/docs/_posts/mauro-nievoff/2022-09-30-ner_oncology_therapy_wip_en.md index 4a8578c07af19..c2eba7ddb08eb 100644 --- a/docs/_posts/mauro-nievoff/2022-09-30-ner_oncology_therapy_wip_en.md +++ b/docs/_posts/mauro-nievoff/2022-09-30-ner_oncology_therapy_wip_en.md @@ -65,9 +65,7 @@ pipeline = Pipeline(stages=[document_assembler, ner, ner_converter]) -data = spark.createDataFrame([["The had previously undergone a left mastectomy and an axillary lymph node dissection for a left breast cancer twenty years ago. -The tumor was positive for ER and PR. Postoperatively, radiotherapy was administered to her breast. -The cancer recurred as a right lung metastasis 13 years later. The patient underwent a regimen consisting of adriamycin (60 mg/m2) and cyclophosphamide (600 mg/m2) over six courses, as first line therapy."]]).toDF("text") +data = spark.createDataFrame([["The had previously undergone a left mastectomy and an axillary lymph node dissection for a left breast cancer twenty years ago. The tumor was positive for ER and PR. Postoperatively, radiotherapy was administered to her breast. The cancer recurred as a right lung metastasis 13 years later. 
The patient underwent a regimen consisting of adriamycin (60 mg/m2) and cyclophosphamide (600 mg/m2) over six courses, as first line therapy."]]).toDF("text") result = pipeline.fit(data).transform(data) @@ -78,11 +76,11 @@ val document_assembler = new DocumentAssembler() .setOutputCol("document") val sentence_detector = SentenceDetectorDLModel.pretrained("sentence_detector_dl_healthcare","en","clinical/models") - .setInputCols("document") + .setInputCols(Array("document")) .setOutputCol("sentence") val tokenizer = new Tokenizer() - .setInputCols("sentence") + .setInputCols(Array("sentence")) .setOutputCol("token") val word_embeddings = WordEmbeddingsModel().pretrained("embeddings_clinical", "en", "clinical/models") diff --git a/docs/_posts/mauro-nievoff/2022-09-30-ner_oncology_tnm_wip_en.md b/docs/_posts/mauro-nievoff/2022-09-30-ner_oncology_tnm_wip_en.md index 9bed4f7e002b0..d7c85f612bb23 100644 --- a/docs/_posts/mauro-nievoff/2022-09-30-ner_oncology_tnm_wip_en.md +++ b/docs/_posts/mauro-nievoff/2022-09-30-ner_oncology_tnm_wip_en.md @@ -75,11 +75,11 @@ val document_assembler = new DocumentAssembler() .setOutputCol("document") val sentence_detector = SentenceDetectorDLModel.pretrained("sentence_detector_dl_healthcare","en","clinical/models") - .setInputCols("document") + .setInputCols(Array("document")) .setOutputCol("sentence") val tokenizer = new Tokenizer() - .setInputCols("sentence") + .setInputCols(Array("sentence")) .setOutputCol("token") val word_embeddings = WordEmbeddingsModel().pretrained("embeddings_clinical", "en", "clinical/models") diff --git a/docs/_posts/mauro-nievoff/2022-09-30-ner_oncology_unspecific_posology_wip_en.md b/docs/_posts/mauro-nievoff/2022-09-30-ner_oncology_unspecific_posology_wip_en.md index a861c04f2b21e..e7ba0df6243c7 100644 --- a/docs/_posts/mauro-nievoff/2022-09-30-ner_oncology_unspecific_posology_wip_en.md +++ b/docs/_posts/mauro-nievoff/2022-09-30-ner_oncology_unspecific_posology_wip_en.md @@ -75,11 +75,11 @@ val 
document_assembler = new DocumentAssembler() .setOutputCol("document") val sentence_detector = SentenceDetectorDLModel.pretrained("sentence_detector_dl_healthcare","en","clinical/models") - .setInputCols("document") + .setInputCols(Array("document")) .setOutputCol("sentence") val tokenizer = new Tokenizer() - .setInputCols("sentence") + .setInputCols(Array("sentence")) .setOutputCol("token") val word_embeddings = WordEmbeddingsModel().pretrained("embeddings_clinical", "en", "clinical/models") diff --git a/docs/_posts/mauro-nievoff/2022-09-30-ner_oncology_wip_en.md b/docs/_posts/mauro-nievoff/2022-09-30-ner_oncology_wip_en.md index 4d4a7e86fd2c5..17b39b7c600b3 100644 --- a/docs/_posts/mauro-nievoff/2022-09-30-ner_oncology_wip_en.md +++ b/docs/_posts/mauro-nievoff/2022-09-30-ner_oncology_wip_en.md @@ -65,9 +65,7 @@ pipeline = Pipeline(stages=[document_assembler, ner, ner_converter]) -data = spark.createDataFrame([["The had previously undergone a left mastectomy and an axillary lymph node dissection for a left breast cancer twenty years ago. -The tumor was positive for ER and PR. Postoperatively, radiotherapy was administered to her breast. -The cancer recurred as a right lung metastasis 13 years later. The patient underwent a regimen consisting of adriamycin (60 mg/m2) and cyclophosphamide (600 mg/m2) over six courses, as first line therapy."]]).toDF("text") +data = spark.createDataFrame([["The had previously undergone a left mastectomy and an axillary lymph node dissection for a left breast cancer twenty years ago. The tumor was positive for ER and PR. Postoperatively, radiotherapy was administered to her breast. The cancer recurred as a right lung metastasis 13 years later. 
The patient underwent a regimen consisting of adriamycin (60 mg/m2) and cyclophosphamide (600 mg/m2) over six courses, as first line therapy."]]).toDF("text") result = pipeline.fit(data).transform(data) ``` @@ -77,11 +75,11 @@ val document_assembler = new DocumentAssembler() .setOutputCol("document") val sentence_detector = SentenceDetectorDLModel.pretrained("sentence_detector_dl_healthcare","en","clinical/models") - .setInputCols("document") + .setInputCols(Array("document")) .setOutputCol("sentence") val tokenizer = new Tokenizer() - .setInputCols("sentence") + .setInputCols(Array("sentence")) .setOutputCol("token") val word_embeddings = WordEmbeddingsModel().pretrained("embeddings_clinical", "en", "clinical/models") diff --git a/docs/_posts/mauro-nievoff/2022-10-01-assertion_oncology_demographic_binary_wip_en.md b/docs/_posts/mauro-nievoff/2022-10-01-assertion_oncology_demographic_binary_wip_en.md index e14cc3b40fc6c..ade8e9c7a4112 100644 --- a/docs/_posts/mauro-nievoff/2022-10-01-assertion_oncology_demographic_binary_wip_en.md +++ b/docs/_posts/mauro-nievoff/2022-10-01-assertion_oncology_demographic_binary_wip_en.md @@ -34,6 +34,7 @@ This model detects if a demographic entity refers to the patient or to someone e
{% include programmingLanguageSelectScalaPythonNLU.html %} + ```python document_assembler = DocumentAssembler()\ .setInputCol("text")\ @@ -57,7 +58,8 @@ ner = MedicalNerModel.pretrained("ner_oncology_wip", "en", "clinical/models") \ ner_converter = NerConverter() \ .setInputCols(["sentence", "token", "ner"]) \ - .setOutputCol("ner_chunk") .setWhiteList(["Age"", "Gender""]) + .setOutputCol("ner_chunk")\ + .setWhiteList(["Age", "Gender"]) assertion = AssertionDLModel.pretrained("assertion_oncology_demographic_binary_wip", "en", "clinical/models") \ .setInputCols(["sentence", "ner_chunk", "embeddings"]) \ @@ -81,11 +83,11 @@ val document_assembler = new DocumentAssembler() .setOutputCol("document") val sentence_detector = SentenceDetectorDLModel.pretrained("sentence_detector_dl_healthcare","en","clinical/models") - .setInputCols("document") + .setInputCols(Array("document")) .setOutputCol("sentence") val tokenizer = new Tokenizer() - .setInputCols("sentence") + .setInputCols(Array("sentence")) .setOutputCol("token") val word_embeddings = WordEmbeddingsModel().pretrained("embeddings_clinical", "en", "clinical/models") @@ -99,10 +101,10 @@ val ner = MedicalNerModel.pretrained("ner_oncology_wip", "en", "clinical/models" val ner_converter = new NerConverter() .setInputCols(Array("sentence", "token", "ner")) .setOutputCol("ner_chunk") - .setWhiteList(Array("Age"", "Gender"")) + .setWhiteList(Array("Age", "Gender")) val clinical_assertion = AssertionDLModel.pretrained("assertion_oncology_demographic_binary_wip","en","clinical/models") - .setInputCols("sentence","ner_chunk","embeddings") + .setInputCols(Array("sentence","ner_chunk","embeddings")) .setOutputCol("assertion") val pipeline = new Pipeline().setStages(Array(document_assembler, @@ -153,6 +155,6 @@ In-house annotated oncology case reports. 
label precision recall f1-score support Patient 0.94 0.94 0.94 32.0 Someone_Else 0.92 0.92 0.92 24.0 - macro avg 0.93 0.93 0.93 56.0 -weighted avg 0.93 0.93 0.93 56.0 -``` \ No newline at end of file + macro-avg 0.93 0.93 0.93 56.0 +weighted-avg 0.93 0.93 0.93 56.0 +``` diff --git a/docs/_posts/mauro-nievoff/2022-10-01-assertion_oncology_family_history_wip_en.md b/docs/_posts/mauro-nievoff/2022-10-01-assertion_oncology_family_history_wip_en.md index 787820754574c..4741206c62532 100644 --- a/docs/_posts/mauro-nievoff/2022-10-01-assertion_oncology_family_history_wip_en.md +++ b/docs/_posts/mauro-nievoff/2022-10-01-assertion_oncology_family_history_wip_en.md @@ -34,6 +34,7 @@ This model detects entities refering to the family history.
{% include programmingLanguageSelectScalaPythonNLU.html %} + ```python document_assembler = DocumentAssembler()\ .setInputCol("text")\ @@ -57,7 +58,7 @@ ner = MedicalNerModel.pretrained("ner_oncology_wip", "en", "clinical/models") \ ner_converter = NerConverter() \ .setInputCols(["sentence", "token", "ner"]) \ - .setOutputCol("ner_chunk") .setWhiteList(["Cancer_Dx""]) + .setOutputCol("ner_chunk") .setWhiteList(["Cancer_Dx"]) assertion = AssertionDLModel.pretrained("assertion_oncology_family_history_wip", "en", "clinical/models") \ .setInputCols(["sentence", "ner_chunk", "embeddings"]) \ @@ -71,7 +72,7 @@ pipeline = Pipeline(stages=[document_assembler, ner_converter, assertion]) -data = spark.createDataFrame([["Her family history is positive for breast cancer in her maternal aunt."]]).toDF("text") +data = spark.createDataFrame([["""Her family history is positive for breast cancer in her maternal aunt."""]]).toDF("text") result = pipeline.fit(data).transform(data) ``` @@ -81,11 +82,11 @@ val document_assembler = new DocumentAssembler() .setOutputCol("document") val sentence_detector = SentenceDetectorDLModel.pretrained("sentence_detector_dl_healthcare","en","clinical/models") - .setInputCols("document") + .setInputCols(Array("document")) .setOutputCol("sentence") val tokenizer = new Tokenizer() - .setInputCols("sentence") + .setInputCols(Array("sentence")) .setOutputCol("token") val word_embeddings = WordEmbeddingsModel().pretrained("embeddings_clinical", "en", "clinical/models") @@ -99,10 +100,10 @@ val ner = MedicalNerModel.pretrained("ner_oncology_wip", "en", "clinical/models" val ner_converter = new NerConverter() .setInputCols(Array("sentence", "token", "ner")) .setOutputCol("ner_chunk") - .setWhiteList(Array("Cancer_Dx"")) + .setWhiteList(Array("Cancer_Dx")) val clinical_assertion = AssertionDLModel.pretrained("assertion_oncology_family_history_wip","en","clinical/models") - .setInputCols("sentence","ner_chunk","embeddings") + 
.setInputCols(Array("sentence","ner_chunk","embeddings")) .setOutputCol("assertion") val pipeline = new Pipeline().setStages(Array(document_assembler, @@ -113,7 +114,7 @@ val pipeline = new Pipeline().setStages(Array(document_assembler, ner_converter, assertion)) -val data = Seq("Her family history is positive for breast cancer in her maternal aunt.").toDF("text") +val data = Seq("""Her family history is positive for breast cancer in her maternal aunt.""").toDF("text") val result = pipeline.fit(data).transform(data) ``` @@ -151,6 +152,6 @@ In-house annotated oncology case reports. label precision recall f1-score support Family_History 0.94 0.84 0.89 37.0 Other 0.91 0.97 0.94 62.0 - macro avg 0.92 0.90 0.91 99.0 - weighted avg 0.92 0.92 0.92 99.0 -``` \ No newline at end of file + macro-avg 0.92 0.90 0.91 99.0 + weighted-avg 0.92 0.92 0.92 99.0 +``` diff --git a/docs/_posts/mauro-nievoff/2022-10-01-assertion_oncology_problem_wip_en.md b/docs/_posts/mauro-nievoff/2022-10-01-assertion_oncology_problem_wip_en.md index 7db54eb6fd744..b6ebafc4525a4 100644 --- a/docs/_posts/mauro-nievoff/2022-10-01-assertion_oncology_problem_wip_en.md +++ b/docs/_posts/mauro-nievoff/2022-10-01-assertion_oncology_problem_wip_en.md @@ -34,6 +34,7 @@ This model detects the assertion status of entities related to cancer diagnosis
{% include programmingLanguageSelectScalaPythonNLU.html %} + ```python document_assembler = DocumentAssembler()\ .setInputCol("text")\ @@ -57,7 +58,8 @@ ner = MedicalNerModel.pretrained("ner_oncology_wip", "en", "clinical/models") \ ner_converter = NerConverter() \ .setInputCols(["sentence", "token", "ner"]) \ - .setOutputCol("ner_chunk") .setWhiteList(["Cancer_Dx""]) + .setOutputCol("ner_chunk").setWhiteList(["Cancer_Dx"]) + assertion = AssertionDLModel.pretrained("assertion_oncology_problem_wip", "en", "clinical/models") \ .setInputCols(["sentence", "ner_chunk", "embeddings"]) \ @@ -71,7 +73,7 @@ pipeline = Pipeline(stages=[document_assembler, ner_converter, assertion]) -data = spark.createDataFrame([["The patient was diagnosed with breast cancer. Her family history is positive for other cancers."]]).toDF("text") +data = spark.createDataFrame([["""The patient was diagnosed with breast cancer. Her family history is positive for other cancers."""]]).toDF("text") result = pipeline.fit(data).transform(data) @@ -82,11 +84,11 @@ val document_assembler = new DocumentAssembler() .setOutputCol("document") val sentence_detector = SentenceDetectorDLModel.pretrained("sentence_detector_dl_healthcare","en","clinical/models") - .setInputCols("document") + .setInputCols(Array("document")) .setOutputCol("sentence") val tokenizer = new Tokenizer() - .setInputCols("sentence") + .setInputCols(Array("sentence")) .setOutputCol("token") val word_embeddings = WordEmbeddingsModel().pretrained("embeddings_clinical", "en", "clinical/models") @@ -100,10 +102,10 @@ val ner = MedicalNerModel.pretrained("ner_oncology_wip", "en", "clinical/models" val ner_converter = new NerConverter() .setInputCols(Array("sentence", "token", "ner")) .setOutputCol("ner_chunk") - .setWhiteList(Array("Cancer_Dx"")) + .setWhiteList(Array("Cancer_Dx")) val clinical_assertion = AssertionDLModel.pretrained("assertion_oncology_problem_wip","en","clinical/models") - .setInputCols("sentence","ner_chunk","embeddings") + 
.setInputCols(Array("sentence","ner_chunk","embeddings")) .setOutputCol("assertion") val pipeline = new Pipeline().setStages(Array(document_assembler, @@ -114,7 +116,7 @@ val pipeline = new Pipeline().setStages(Array(document_assembler, ner_converter, assertion)) -val data = Seq("The patient was diagnosed with breast cancer. Her family history is positive for other cancers.").toDF("text") +val data = Seq("""The patient was diagnosed with breast cancer. Her family history is positive for other cancers.""").toDF("text") val result = pipeline.fit(data).transform(data) ``` @@ -156,6 +158,6 @@ In-house annotated oncology case reports. Hypothetical 0.81 0.77 0.79 77.0 Possible 0.62 0.61 0.62 54.0 Present 0.78 0.79 0.78 155.0 - macro avg 0.75 0.81 0.77 448.0 -weighted avg 0.80 0.79 0.79 448.0 -``` \ No newline at end of file + macro-avg 0.75 0.81 0.77 448.0 +weighted-avg 0.80 0.79 0.79 448.0 +``` diff --git a/docs/_posts/mauro-nievoff/2022-10-01-assertion_oncology_response_to_treatment_wip_en.md b/docs/_posts/mauro-nievoff/2022-10-01-assertion_oncology_response_to_treatment_wip_en.md index f41c155c0b78d..08b69d8d82365 100644 --- a/docs/_posts/mauro-nievoff/2022-10-01-assertion_oncology_response_to_treatment_wip_en.md +++ b/docs/_posts/mauro-nievoff/2022-10-01-assertion_oncology_response_to_treatment_wip_en.md @@ -34,6 +34,7 @@ This model detects the assertion status of entities related to response to treat
{% include programmingLanguageSelectScalaPythonNLU.html %} + ```python document_assembler = DocumentAssembler()\ .setInputCol("text")\ @@ -57,7 +58,7 @@ ner = MedicalNerModel.pretrained("ner_oncology_wip", "en", "clinical/models") \ ner_converter = NerConverter() \ .setInputCols(["sentence", "token", "ner"]) \ - .setOutputCol("ner_chunk") .setWhiteList(["Response_To_Treatment""]) + .setOutputCol("ner_chunk") .setWhiteList(["Response_To_Treatment"]) assertion = AssertionDLModel.pretrained("assertion_oncology_response_to_treatment_wip", "en", "clinical/models") \ .setInputCols(["sentence", "ner_chunk", "embeddings"]) \ @@ -82,11 +83,11 @@ val document_assembler = new DocumentAssembler() .setOutputCol("document") val sentence_detector = SentenceDetectorDLModel.pretrained("sentence_detector_dl_healthcare","en","clinical/models") - .setInputCols("document") + .setInputCols(Array("document")) .setOutputCol("sentence") val tokenizer = new Tokenizer() - .setInputCols("sentence") + .setInputCols(Array("sentence")) .setOutputCol("token") val word_embeddings = WordEmbeddingsModel().pretrained("embeddings_clinical", "en", "clinical/models") @@ -100,10 +101,10 @@ val ner = MedicalNerModel.pretrained("ner_oncology_wip", "en", "clinical/models" val ner_converter = new NerConverter() .setInputCols(Array("sentence", "token", "ner")) .setOutputCol("ner_chunk") - .setWhiteList(Array("Response_To_Treatment"")) + .setWhiteList(Array("Response_To_Treatment")) val clinical_assertion = AssertionDLModel.pretrained("assertion_oncology_response_to_treatment_wip","en","clinical/models") - .setInputCols("sentence","ner_chunk","embeddings") + .setInputCols(Array("sentence","ner_chunk","embeddings")) .setOutputCol("assertion") val pipeline = new Pipeline().setStages(Array(document_assembler, @@ -114,7 +115,7 @@ val pipeline = new Pipeline().setStages(Array(document_assembler, ner_converter, assertion)) -val data = Seq("The patient presented no evidence of recurrence.").toDF("text") +val data = 
Seq("""The patient presented no evidence of recurrence.""").toDF("text") val result = pipeline.fit(data).transform(data) @@ -154,6 +155,6 @@ In-house annotated oncology case reports. label precision recall f1-score support Hypothetical_Or_Absent 0.83 0.96 0.89 46.0 Present_Or_Past 0.94 0.79 0.86 43.0 - macro avg 0.89 0.87 0.87 89.0 - weighted avg 0.89 0.88 0.88 89.0 -``` \ No newline at end of file + macro-avg 0.89 0.87 0.87 89.0 + weighted-avg 0.89 0.88 0.88 89.0 +``` diff --git a/docs/_posts/mauro-nievoff/2022-10-01-assertion_oncology_smoking_status_wip_en.md b/docs/_posts/mauro-nievoff/2022-10-01-assertion_oncology_smoking_status_wip_en.md index 9034a199bb70f..e5877d8bcb9e3 100644 --- a/docs/_posts/mauro-nievoff/2022-10-01-assertion_oncology_smoking_status_wip_en.md +++ b/docs/_posts/mauro-nievoff/2022-10-01-assertion_oncology_smoking_status_wip_en.md @@ -34,6 +34,7 @@ This model detects the assertion status of the Smoking_Status entity. It classif
{% include programmingLanguageSelectScalaPythonNLU.html %} + ```python document_assembler = DocumentAssembler()\ .setInputCol("text")\ @@ -57,7 +58,7 @@ ner = MedicalNerModel.pretrained("ner_oncology_wip", "en", "clinical/models") \ ner_converter = NerConverter() \ .setInputCols(["sentence", "token", "ner"]) \ - .setOutputCol("ner_chunk") .setWhiteList(["Smoking_Status""]) + .setOutputCol("ner_chunk") .setWhiteList(["Smoking_Status"]) assertion = AssertionDLModel.pretrained("assertion_oncology_smoking_status_wip", "en", "clinical/models") \ .setInputCols(["sentence", "ner_chunk", "embeddings"]) \ @@ -81,11 +82,11 @@ val document_assembler = new DocumentAssembler() .setOutputCol("document") val sentence_detector = SentenceDetectorDLModel.pretrained("sentence_detector_dl_healthcare","en","clinical/models") - .setInputCols("document") + .setInputCols(Array("document")) .setOutputCol("sentence") val tokenizer = new Tokenizer() - .setInputCols("sentence") + .setInputCols(Array("sentence")) .setOutputCol("token") val word_embeddings = WordEmbeddingsModel().pretrained("embeddings_clinical", "en", "clinical/models") @@ -99,10 +100,10 @@ val ner = MedicalNerModel.pretrained("ner_oncology_wip", "en", "clinical/models" val ner_converter = new NerConverter() .setInputCols(Array("sentence", "token", "ner")) .setOutputCol("ner_chunk") - .setWhiteList(Array("Smoking_Status"")) + .setWhiteList(Array("Smoking_Status")) val clinical_assertion = AssertionDLModel.pretrained("assertion_oncology_smoking_status_wip","en","clinical/models") - .setInputCols("sentence","ner_chunk","embeddings") + .setInputCols(Array("sentence","ner_chunk","embeddings")) .setOutputCol("assertion") val pipeline = new Pipeline().setStages(Array(document_assembler, @@ -113,7 +114,7 @@ val pipeline = new Pipeline().setStages(Array(document_assembler, ner_converter, assertion)) -val data = Seq("The patient quit smoking three years ago.").toDF("text") +val data = Seq("""The patient quit smoking three years 
ago.""").toDF("text") val result = pipeline.fit(data).transform(data) ``` @@ -152,6 +153,6 @@ In-house annotated oncology case reports. Absent 0.75 1.00 0.86 12.0 Past 0.78 0.93 0.85 15.0 Present 1.00 0.46 0.63 13.0 - macro avg 0.84 0.80 0.78 40.0 -weighted avg 0.84 0.80 0.78 40.0 -``` \ No newline at end of file + macro-avg 0.84 0.80 0.78 40.0 +weighted-avg 0.84 0.80 0.78 40.0 +``` diff --git a/docs/_posts/mauro-nievoff/2022-10-01-assertion_oncology_test_binary_wip_en.md b/docs/_posts/mauro-nievoff/2022-10-01-assertion_oncology_test_binary_wip_en.md index cb5363010d88c..e67a871b7f34d 100644 --- a/docs/_posts/mauro-nievoff/2022-10-01-assertion_oncology_test_binary_wip_en.md +++ b/docs/_posts/mauro-nievoff/2022-10-01-assertion_oncology_test_binary_wip_en.md @@ -34,6 +34,7 @@ This model detects the assertion status of oncology tests, such as Pathology_Tes
{% include programmingLanguageSelectScalaPythonNLU.html %} + ```python document_assembler = DocumentAssembler()\ .setInputCol("text")\ @@ -57,7 +58,7 @@ ner = MedicalNerModel.pretrained("ner_oncology_wip", "en", "clinical/models") \ ner_converter = NerConverter() \ .setInputCols(["sentence", "token", "ner"]) \ - .setOutputCol("ner_chunk") .setWhiteList(["Pathology_Test"", "Imaging_Test""]) + .setOutputCol("ner_chunk") .setWhiteList(["Pathology_Test", "Imaging_Test"]) assertion = AssertionDLModel.pretrained("assertion_oncology_test_binary_wip", "en", "clinical/models") \ .setInputCols(["sentence", "ner_chunk", "embeddings"]) \ @@ -81,11 +82,11 @@ val document_assembler = new DocumentAssembler() .setOutputCol("document") val sentence_detector = SentenceDetectorDLModel.pretrained("sentence_detector_dl_healthcare","en","clinical/models") - .setInputCols("document") + .setInputCols(Array("document")) .setOutputCol("sentence") val tokenizer = new Tokenizer() - .setInputCols("sentence") + .setInputCols(Array("sentence")) .setOutputCol("token") val word_embeddings = WordEmbeddingsModel().pretrained("embeddings_clinical", "en", "clinical/models") @@ -99,10 +100,10 @@ val ner = MedicalNerModel.pretrained("ner_oncology_wip", "en", "clinical/models" val ner_converter = new NerConverter() .setInputCols(Array("sentence", "token", "ner")) .setOutputCol("ner_chunk") - .setWhiteList(Array("Pathology_Test"", "Imaging_Test"")) + .setWhiteList(Array("Pathology_Test", "Imaging_Test")) val clinical_assertion = AssertionDLModel.pretrained("assertion_oncology_test_binary_wip","en","clinical/models") - .setInputCols("sentence","ner_chunk","embeddings") + .setInputCols(Array("sentence","ner_chunk","embeddings")) .setOutputCol("assertion") val pipeline = new Pipeline().setStages(Array(document_assembler, @@ -113,7 +114,7 @@ val pipeline = new Pipeline().setStages(Array(document_assembler, ner_converter, assertion)) -val data = Seq("The result of the biopsy was positive. 
We recommend to perform a CT scan.").toDF("text") +val data = Seq("""The result of the biopsy was positive. We recommend to perform a CT scan.""").toDF("text") val result = pipeline.fit(data).transform(data) ``` @@ -152,6 +153,6 @@ In-house annotated oncology case reports. label precision recall f1-score support Hypothetical_Or_Absent 0.79 0.81 0.80 37.0 Medical_History 0.80 0.78 0.79 36.0 - macro avg 0.79 0.79 0.79 73.0 - weighted avg 0.79 0.79 0.79 73.0 -``` \ No newline at end of file + macro-avg 0.79 0.79 0.79 73.0 + weighted-avg 0.79 0.79 0.79 73.0 +``` diff --git a/docs/_posts/mauro-nievoff/2022-10-01-assertion_oncology_treatment_binary_wip_en.md b/docs/_posts/mauro-nievoff/2022-10-01-assertion_oncology_treatment_binary_wip_en.md index b43589a03d325..2ed33ed9252d1 100644 --- a/docs/_posts/mauro-nievoff/2022-10-01-assertion_oncology_treatment_binary_wip_en.md +++ b/docs/_posts/mauro-nievoff/2022-10-01-assertion_oncology_treatment_binary_wip_en.md @@ -34,6 +34,7 @@ This model detects the assertion status of oncology treatment entities. The mode
{% include programmingLanguageSelectScalaPythonNLU.html %} + ```python document_assembler = DocumentAssembler()\ .setInputCol("text")\ @@ -57,7 +58,7 @@ ner = MedicalNerModel.pretrained("ner_oncology_wip", "en", "clinical/models") \ ner_converter = NerConverter() \ .setInputCols(["sentence", "token", "ner"]) \ - .setOutputCol("ner_chunk") .setWhiteList(["Cancer_Surgery"", "Chemotherapy""]) + .setOutputCol("ner_chunk") .setWhiteList(["Cancer_Surgery", "Chemotherapy"]) assertion = AssertionDLModel.pretrained("assertion_oncology_treatment_binary_wip", "en", "clinical/models") \ .setInputCols(["sentence", "ner_chunk", "embeddings"]) \ @@ -81,11 +82,11 @@ val document_assembler = new DocumentAssembler() .setOutputCol("document") val sentence_detector = SentenceDetectorDLModel.pretrained("sentence_detector_dl_healthcare","en","clinical/models") - .setInputCols("document") + .setInputCols(Array("document")) .setOutputCol("sentence") val tokenizer = new Tokenizer() - .setInputCols("sentence") + .setInputCols(Array("sentence")) .setOutputCol("token") val word_embeddings = WordEmbeddingsModel().pretrained("embeddings_clinical", "en", "clinical/models") @@ -99,10 +100,10 @@ val ner = MedicalNerModel.pretrained("ner_oncology_wip", "en", "clinical/models" val ner_converter = new NerConverter() .setInputCols(Array("sentence", "token", "ner")) .setOutputCol("ner_chunk") - .setWhiteList(Array("Cancer_Surgery"", "Chemotherapy"")) + .setWhiteList(Array("Cancer_Surgery", "Chemotherapy")) val clinical_assertion = AssertionDLModel.pretrained("assertion_oncology_treatment_binary_wip","en","clinical/models") - .setInputCols("sentence","ner_chunk","embeddings") + .setInputCols(Array("sentence","ner_chunk","embeddings")) .setOutputCol("assertion") val pipeline = new Pipeline().setStages(Array(document_assembler, @@ -113,7 +114,7 @@ val pipeline = new Pipeline().setStages(Array(document_assembler, ner_converter, assertion)) -val data = Seq("The patient underwent a mastectomy two years ago. 
We recommend to start chemotherapy.").toDF("text") +val data = Seq("""The patient underwent a mastectomy two years ago. We recommend to start chemotherapy.""").toDF("text") val result = pipeline.fit(data).transform(data) @@ -155,6 +156,6 @@ In-house annotated oncology case reports. label precision recall f1-score support Hypothetical_Or_Absent 0.76 0.77 0.76 128.0 Present_Or_Past 0.75 0.73 0.74 118.0 - macro avg 0.75 0.75 0.75 246.0 - weighted avg 0.75 0.75 0.75 246.0 -``` \ No newline at end of file + macro-avg 0.75 0.75 0.75 246.0 + weighted-avg 0.75 0.75 0.75 246.0 +``` diff --git a/docs/_posts/mauro-nievoff/2022-10-01-assertion_oncology_wip_en.md b/docs/_posts/mauro-nievoff/2022-10-01-assertion_oncology_wip_en.md index 18d6b95ade755..86118e727fb39 100644 --- a/docs/_posts/mauro-nievoff/2022-10-01-assertion_oncology_wip_en.md +++ b/docs/_posts/mauro-nievoff/2022-10-01-assertion_oncology_wip_en.md @@ -34,6 +34,7 @@ This model detects the assertion status of entities related to oncology (includi
{% include programmingLanguageSelectScalaPythonNLU.html %} + ```python document_assembler = DocumentAssembler()\ .setInputCol("text")\ @@ -57,7 +58,7 @@ ner = MedicalNerModel.pretrained("ner_oncology_wip", "en", "clinical/models") \ ner_converter = NerConverter() \ .setInputCols(["sentence", "token", "ner"]) \ - .setOutputCol("ner_chunk") .setWhiteList(["Cancer_Dx"", "Tumor_Finding"", "Cancer_Surgery"", "Chemotherapy"", "Pathology_Test"", "Imaging_Test""]) + .setOutputCol("ner_chunk") .setWhiteList(["Cancer_Dx", "Tumor_Finding", "Cancer_Surgery", "Chemotherapy", "Pathology_Test", "Imaging_Test"]) assertion = AssertionDLModel.pretrained("assertion_oncology_wip", "en", "clinical/models") \ .setInputCols(["sentence", "ner_chunk", "embeddings"]) \ @@ -82,11 +83,11 @@ val document_assembler = new DocumentAssembler() .setOutputCol("document") val sentence_detector = SentenceDetectorDLModel.pretrained("sentence_detector_dl_healthcare","en","clinical/models") - .setInputCols("document") + .setInputCols(Array("document")) .setOutputCol("sentence") val tokenizer = new Tokenizer() - .setInputCols("sentence") + .setInputCols(Array("sentence")) .setOutputCol("token") val word_embeddings = WordEmbeddingsModel().pretrained("embeddings_clinical", "en", "clinical/models") @@ -100,10 +101,10 @@ val ner = MedicalNerModel.pretrained("ner_oncology_wip", "en", "clinical/models" val ner_converter = new NerConverter() .setInputCols(Array("sentence", "token", "ner")) .setOutputCol("ner_chunk") - .setWhiteList(Array("Cancer_Dx"", "Tumor_Finding"", "Cancer_Surgery"", "Chemotherapy"", "Pathology_Test"", "Imaging_Test"")) + .setWhiteList(Array("Cancer_Dx", "Tumor_Finding", "Cancer_Surgery", "Chemotherapy", "Pathology_Test", "Imaging_Test")) val clinical_assertion = AssertionDLModel.pretrained("assertion_oncology_wip","en","clinical/models") - .setInputCols("sentence","ner_chunk","embeddings") + .setInputCols(Array("sentence","ner_chunk","embeddings")) .setOutputCol("assertion") val pipeline = 
new Pipeline().setStages(Array(document_assembler, @@ -114,7 +115,7 @@ val pipeline = new Pipeline().setStages(Array(document_assembler, ner_converter, assertion)) -val data = Seq("The patient is suspected to have breast cancer. Family history is positive for other cancers. The result of the biopsy was positive.").toDF("text") +val data = Seq("""The patient is suspected to have breast cancer. Family history is positive for other cancers. The result of the biopsy was positive.""").toDF("text") val result = pipeline.fit(data).transform(data) ``` @@ -158,6 +159,6 @@ Hypothetical 0.67 0.61 0.64 182.0 Past 0.91 0.93 0.92 1583.0 Possible 0.59 0.59 0.59 51.0 Present 0.89 0.89 0.89 1645.0 - macro avg 0.77 0.77 0.77 3759.0 -weighted avg 0.88 0.88 0.88 3759.0 -``` \ No newline at end of file + macro-avg 0.77 0.77 0.77 3759.0 +weighted-avg 0.88 0.88 0.88 3759.0 +``` diff --git a/docs/_posts/mauro-nievoff/2022-10-01-ner_oncology_anatomy_granular_wip_en.md b/docs/_posts/mauro-nievoff/2022-10-01-ner_oncology_anatomy_granular_wip_en.md index e9d880330e765..b3a48b886086d 100644 --- a/docs/_posts/mauro-nievoff/2022-10-01-ner_oncology_anatomy_granular_wip_en.md +++ b/docs/_posts/mauro-nievoff/2022-10-01-ner_oncology_anatomy_granular_wip_en.md @@ -75,11 +75,11 @@ val document_assembler = new DocumentAssembler() .setOutputCol("document") val sentence_detector = SentenceDetectorDLModel.pretrained("sentence_detector_dl_healthcare","en","clinical/models") - .setInputCols("document") + .setInputCols(Array("document")) .setOutputCol("sentence") val tokenizer = new Tokenizer() - .setInputCols("sentence") + .setInputCols(Array("sentence")) .setOutputCol("token") val word_embeddings = WordEmbeddingsModel().pretrained("embeddings_clinical", "en", "clinical/models") diff --git a/docs/_posts/mauro-nievoff/2022-10-01-ner_oncology_biomarker_wip_en.md b/docs/_posts/mauro-nievoff/2022-10-01-ner_oncology_biomarker_wip_en.md index 8d7ce683fd778..b329b2783eb2e 100644 --- 
a/docs/_posts/mauro-nievoff/2022-10-01-ner_oncology_biomarker_wip_en.md +++ b/docs/_posts/mauro-nievoff/2022-10-01-ner_oncology_biomarker_wip_en.md @@ -75,11 +75,11 @@ val document_assembler = new DocumentAssembler() .setOutputCol("document") val sentence_detector = SentenceDetectorDLModel.pretrained("sentence_detector_dl_healthcare","en","clinical/models") - .setInputCols("document") + .setInputCols(Array("document")) .setOutputCol("sentence") val tokenizer = new Tokenizer() - .setInputCols("sentence") + .setInputCols(Array("sentence")) .setOutputCol("token") val word_embeddings = WordEmbeddingsModel().pretrained("embeddings_clinical", "en", "clinical/models") diff --git a/docs/_posts/mauro-nievoff/2022-10-01-ner_oncology_posology_wip_en.md b/docs/_posts/mauro-nievoff/2022-10-01-ner_oncology_posology_wip_en.md index d376547c0ac95..3afa8ca00f615 100644 --- a/docs/_posts/mauro-nievoff/2022-10-01-ner_oncology_posology_wip_en.md +++ b/docs/_posts/mauro-nievoff/2022-10-01-ner_oncology_posology_wip_en.md @@ -76,11 +76,11 @@ val document_assembler = new DocumentAssembler() .setOutputCol("document") val sentence_detector = SentenceDetectorDLModel.pretrained("sentence_detector_dl_healthcare","en","clinical/models") - .setInputCols("document") + .setInputCols(Array("document")) .setOutputCol("sentence") val tokenizer = new Tokenizer() - .setInputCols("sentence") + .setInputCols(Array("sentence")) .setOutputCol("token") val word_embeddings = WordEmbeddingsModel().pretrained("embeddings_clinical", "en", "clinical/models") diff --git a/docs/_posts/mauro-nievoff/2022-10-01-ner_oncology_response_to_treatment_wip_en.md b/docs/_posts/mauro-nievoff/2022-10-01-ner_oncology_response_to_treatment_wip_en.md index ada0107e5b0f6..479bfbf8a90ff 100644 --- a/docs/_posts/mauro-nievoff/2022-10-01-ner_oncology_response_to_treatment_wip_en.md +++ b/docs/_posts/mauro-nievoff/2022-10-01-ner_oncology_response_to_treatment_wip_en.md @@ -75,11 +75,11 @@ val document_assembler = new 
DocumentAssembler() .setOutputCol("document") val sentence_detector = SentenceDetectorDLModel.pretrained("sentence_detector_dl_healthcare","en","clinical/models") - .setInputCols("document") + .setInputCols(Array("document")) .setOutputCol("sentence") val tokenizer = new Tokenizer() - .setInputCols("sentence") + .setInputCols(Array("sentence")) .setOutputCol("token") val word_embeddings = WordEmbeddingsModel().pretrained("embeddings_clinical", "en", "clinical/models") diff --git a/docs/_posts/mauro-nievoff/2022-10-11-assertion_oncology_demographic_binary_wip_en.md b/docs/_posts/mauro-nievoff/2022-10-11-assertion_oncology_demographic_binary_wip_en.md index a7bba289cb154..ac244ebc24530 100644 --- a/docs/_posts/mauro-nievoff/2022-10-11-assertion_oncology_demographic_binary_wip_en.md +++ b/docs/_posts/mauro-nievoff/2022-10-11-assertion_oncology_demographic_binary_wip_en.md @@ -34,6 +34,7 @@ This model detects if a demographic entity refers to the patient or to someone e
{% include programmingLanguageSelectScalaPythonNLU.html %} + ```python document_assembler = DocumentAssembler()\ .setInputCol("text")\ @@ -57,7 +58,7 @@ ner = MedicalNerModel.pretrained("ner_oncology_wip", "en", "clinical/models") \ ner_converter = NerConverter() \ .setInputCols(["sentence", "token", "ner"]) \ - .setOutputCol("ner_chunk") .setWhiteList(["Age"", "Gender""]) + .setOutputCol("ner_chunk") .setWhiteList(["Age", "Gender"]) assertion = AssertionDLModel.pretrained("assertion_oncology_demographic_binary_wip", "en", "clinical/models") \ .setInputCols(["sentence", "ner_chunk", "embeddings"]) \ @@ -81,11 +82,11 @@ val document_assembler = new DocumentAssembler() .setOutputCol("document") val sentence_detector = SentenceDetectorDLModel.pretrained("sentence_detector_dl_healthcare","en","clinical/models") - .setInputCols("document") + .setInputCols(Array("document")) .setOutputCol("sentence") val tokenizer = new Tokenizer() - .setInputCols("sentence") + .setInputCols(Array("sentence")) .setOutputCol("token") val word_embeddings = WordEmbeddingsModel().pretrained("embeddings_clinical", "en", "clinical/models") @@ -99,10 +100,10 @@ val ner = MedicalNerModel.pretrained("ner_oncology_wip", "en", "clinical/models" val ner_converter = new NerConverter() .setInputCols(Array("sentence", "token", "ner")) .setOutputCol("ner_chunk") - .setWhiteList(Array("Age"", "Gender"")) + .setWhiteList(Array("Age", "Gender")) val clinical_assertion = AssertionDLModel.pretrained("assertion_oncology_demographic_binary_wip","en","clinical/models") - .setInputCols("sentence","ner_chunk","embeddings") + .setInputCols(Array("sentence","ner_chunk","embeddings")) .setOutputCol("assertion") val pipeline = new Pipeline().setStages(Array(document_assembler, @@ -113,7 +114,7 @@ val pipeline = new Pipeline().setStages(Array(document_assembler, ner_converter, assertion)) -val data = Seq("One sister was diagnosed with breast cancer at the age of 40.").toDF("text") +val data = Seq("""One sister was 
diagnosed with breast cancer at the age of 40.""").toDF("text") val result = pipeline.fit(data).transform(data) @@ -153,6 +154,6 @@ In-house annotated oncology case reports. label precision recall f1-score support Patient 0.93 0.86 0.89 29.0 Someone_Else 0.88 0.93 0.90 30.0 - macro avg 0.90 0.90 0.90 59.0 -weighted avg 0.90 0.90 0.90 59.0 -``` \ No newline at end of file + macro-avg 0.90 0.90 0.90 59.0 +weighted-avg 0.90 0.90 0.90 59.0 +``` diff --git a/docs/_posts/mauro-nievoff/2022-10-11-assertion_oncology_family_history_wip_en.md b/docs/_posts/mauro-nievoff/2022-10-11-assertion_oncology_family_history_wip_en.md index 34d6814e9884b..936e70e523aa2 100644 --- a/docs/_posts/mauro-nievoff/2022-10-11-assertion_oncology_family_history_wip_en.md +++ b/docs/_posts/mauro-nievoff/2022-10-11-assertion_oncology_family_history_wip_en.md @@ -34,6 +34,7 @@ This model detects entities refering to the family history.
{% include programmingLanguageSelectScalaPythonNLU.html %} + ```python document_assembler = DocumentAssembler()\ .setInputCol("text")\ @@ -57,7 +58,7 @@ ner = MedicalNerModel.pretrained("ner_oncology_wip", "en", "clinical/models") \ ner_converter = NerConverter() \ .setInputCols(["sentence", "token", "ner"]) \ - .setOutputCol("ner_chunk") .setWhiteList(["Cancer_Dx""]) + .setOutputCol("ner_chunk") .setWhiteList(["Cancer_Dx"]) assertion = AssertionDLModel.pretrained("assertion_oncology_family_history_wip", "en", "clinical/models") \ .setInputCols(["sentence", "ner_chunk", "embeddings"]) \ @@ -81,11 +82,11 @@ val document_assembler = new DocumentAssembler() .setOutputCol("document") val sentence_detector = SentenceDetectorDLModel.pretrained("sentence_detector_dl_healthcare","en","clinical/models") - .setInputCols("document") + .setInputCols(Array("document")) .setOutputCol("sentence") val tokenizer = new Tokenizer() - .setInputCols("sentence") + .setInputCols(Array("sentence")) .setOutputCol("token") val word_embeddings = WordEmbeddingsModel().pretrained("embeddings_clinical", "en", "clinical/models") @@ -99,10 +100,10 @@ val ner = MedicalNerModel.pretrained("ner_oncology_wip", "en", "clinical/models" val ner_converter = new NerConverter() .setInputCols(Array("sentence", "token", "ner")) .setOutputCol("ner_chunk") - .setWhiteList(Array("Cancer_Dx"")) + .setWhiteList(Array("Cancer_Dx")) val clinical_assertion = AssertionDLModel.pretrained("assertion_oncology_family_history_wip","en","clinical/models") - .setInputCols("sentence","ner_chunk","embeddings") + .setInputCols(Array("sentence","ner_chunk","embeddings")) .setOutputCol("assertion") val pipeline = new Pipeline().setStages(Array(document_assembler, @@ -113,7 +114,7 @@ val pipeline = new Pipeline().setStages(Array(document_assembler, ner_converter, assertion)) -val data = Seq("Her family history is positive for breast cancer in her maternal aunt.").toDF("text") +val data = Seq("""Her family history is positive for 
breast cancer in her maternal aunt.""").toDF("text") val result = pipeline.fit(data).transform(data) ``` @@ -152,6 +153,6 @@ In-house annotated oncology case reports. label precision recall f1-score support Family_History 0.88 0.96 0.92 24.0 Other 0.96 0.90 0.93 29.0 - macro avg 0.92 0.93 0.92 53.0 - weighted avg 0.93 0.92 0.92 53.0 -``` \ No newline at end of file + macro-avg 0.92 0.93 0.92 53.0 + weighted-avg 0.93 0.92 0.92 53.0 +``` diff --git a/docs/_posts/mauro-nievoff/2022-10-11-assertion_oncology_problem_wip_en.md b/docs/_posts/mauro-nievoff/2022-10-11-assertion_oncology_problem_wip_en.md index 0873fbdf7d417..6cd535c9f30c5 100644 --- a/docs/_posts/mauro-nievoff/2022-10-11-assertion_oncology_problem_wip_en.md +++ b/docs/_posts/mauro-nievoff/2022-10-11-assertion_oncology_problem_wip_en.md @@ -57,7 +57,7 @@ ner = MedicalNerModel.pretrained("ner_oncology_wip", "en", "clinical/models") \ ner_converter = NerConverter() \ .setInputCols(["sentence", "token", "ner"]) \ - .setOutputCol("ner_chunk") .setWhiteList(["Cancer_Dx""]) + .setOutputCol("ner_chunk") .setWhiteList(["Cancer_Dx"]) assertion = AssertionDLModel.pretrained("assertion_oncology_problem_wip", "en", "clinical/models") \ .setInputCols(["sentence", "ner_chunk", "embeddings"]) \ @@ -81,11 +81,11 @@ val document_assembler = new DocumentAssembler() .setOutputCol("document") val sentence_detector = SentenceDetectorDLModel.pretrained("sentence_detector_dl_healthcare","en","clinical/models") - .setInputCols("document") + .setInputCols(Array("document")) .setOutputCol("sentence") val tokenizer = new Tokenizer() - .setInputCols("sentence") + .setInputCols(Array("sentence")) .setOutputCol("token") val word_embeddings = WordEmbeddingsModel().pretrained("embeddings_clinical", "en", "clinical/models") @@ -99,10 +99,10 @@ val ner = MedicalNerModel.pretrained("ner_oncology_wip", "en", "clinical/models" val ner_converter = new NerConverter() .setInputCols(Array("sentence", "token", "ner")) .setOutputCol("ner_chunk") - 
.setWhiteList(Array("Cancer_Dx"")) + .setWhiteList(Array("Cancer_Dx")) val clinical_assertion = AssertionDLModel.pretrained("assertion_oncology_problem_wip","en","clinical/models") - .setInputCols("sentence","ner_chunk","embeddings") + .setInputCols(Array("sentence","ner_chunk","embeddings")) .setOutputCol("assertion") val pipeline = new Pipeline().setStages(Array(document_assembler, @@ -113,7 +113,7 @@ val pipeline = new Pipeline().setStages(Array(document_assembler, ner_converter, assertion)) -val data = Seq("The patient was diagnosed with breast cancer. Her family history is positive for other cancers.").toDF("text") +val data = Seq("""The patient was diagnosed with breast cancer. Her family history is positive for other cancers.""").toDF("text") val result = pipeline.fit(data).transform(data) ``` @@ -154,6 +154,6 @@ In-house annotated oncology case reports. Hypothetical_Or_Absent 0.87 0.81 0.84 310.0 Medical_History 0.76 0.86 0.81 304.0 Possible 0.71 0.61 0.65 92.0 - macro avg 0.77 0.76 0.76 718.0 - weighted avg 0.80 0.80 0.80 718.0 -``` \ No newline at end of file + macro-avg 0.77 0.76 0.76 718.0 + weighted-avg 0.80 0.80 0.80 718.0 +``` diff --git a/docs/_posts/mauro-nievoff/2022-10-11-assertion_oncology_response_to_treatment_wip_en.md b/docs/_posts/mauro-nievoff/2022-10-11-assertion_oncology_response_to_treatment_wip_en.md index 38cb61cd793b9..882081d34f44b 100644 --- a/docs/_posts/mauro-nievoff/2022-10-11-assertion_oncology_response_to_treatment_wip_en.md +++ b/docs/_posts/mauro-nievoff/2022-10-11-assertion_oncology_response_to_treatment_wip_en.md @@ -34,6 +34,7 @@ This model detects the assertion status of entities related to response to treat
{% include programmingLanguageSelectScalaPythonNLU.html %} + ```python document_assembler = DocumentAssembler()\ .setInputCol("text")\ @@ -57,7 +58,7 @@ ner = MedicalNerModel.pretrained("ner_oncology_wip", "en", "clinical/models") \ ner_converter = NerConverter() \ .setInputCols(["sentence", "token", "ner"]) \ - .setOutputCol("ner_chunk") .setWhiteList(["Response_To_Treatment""]) + .setOutputCol("ner_chunk") .setWhiteList(["Response_To_Treatment"]) assertion = AssertionDLModel.pretrained("assertion_oncology_response_to_treatment_wip", "en", "clinical/models") \ .setInputCols(["sentence", "ner_chunk", "embeddings"]) \ @@ -81,11 +82,11 @@ val document_assembler = new DocumentAssembler() .setOutputCol("document") val sentence_detector = SentenceDetectorDLModel.pretrained("sentence_detector_dl_healthcare","en","clinical/models") - .setInputCols("document") + .setInputCols(Array("document")) .setOutputCol("sentence") val tokenizer = new Tokenizer() - .setInputCols("sentence") + .setInputCols(Array("sentence")) .setOutputCol("token") val word_embeddings = WordEmbeddingsModel().pretrained("embeddings_clinical", "en", "clinical/models") @@ -99,10 +100,10 @@ val ner = MedicalNerModel.pretrained("ner_oncology_wip", "en", "clinical/models" val ner_converter = new NerConverter() .setInputCols(Array("sentence", "token", "ner")) .setOutputCol("ner_chunk") - .setWhiteList(Array("Response_To_Treatment"")) + .setWhiteList(Array("Response_To_Treatment")) val clinical_assertion = AssertionDLModel.pretrained("assertion_oncology_response_to_treatment_wip","en","clinical/models") - .setInputCols("sentence","ner_chunk","embeddings") + .setInputCols(Array("sentence","ner_chunk","embeddings")) .setOutputCol("assertion") val pipeline = new Pipeline().setStages(Array(document_assembler, @@ -113,7 +114,7 @@ val pipeline = new Pipeline().setStages(Array(document_assembler, ner_converter, assertion)) -val data = Seq("The patient presented no evidence of recurrence.").toDF("text") +val data = 
Seq("""The patient presented no evidence of recurrence.""").toDF("text") val result = pipeline.fit(data).transform(data) @@ -152,6 +153,6 @@ In-house annotated oncology case reports. label precision recall f1-score support Hypothetical_Or_Absent 0.82 0.90 0.86 61.0 Present_Or_Past 0.89 0.80 0.84 61.0 - macro avg 0.86 0.85 0.85 122.0 - weighted avg 0.86 0.85 0.85 122.0 -``` \ No newline at end of file + macro-avg 0.86 0.85 0.85 122.0 + weighted-avg 0.86 0.85 0.85 122.0 +``` diff --git a/docs/_posts/mauro-nievoff/2022-10-11-assertion_oncology_smoking_status_wip_en.md b/docs/_posts/mauro-nievoff/2022-10-11-assertion_oncology_smoking_status_wip_en.md index 195974c254294..ac96d8e3aa090 100644 --- a/docs/_posts/mauro-nievoff/2022-10-11-assertion_oncology_smoking_status_wip_en.md +++ b/docs/_posts/mauro-nievoff/2022-10-11-assertion_oncology_smoking_status_wip_en.md @@ -57,7 +57,7 @@ ner = MedicalNerModel.pretrained("ner_oncology_wip", "en", "clinical/models") \ ner_converter = NerConverter() \ .setInputCols(["sentence", "token", "ner"]) \ - .setOutputCol("ner_chunk") .setWhiteList(["Smoking_Status""]) + .setOutputCol("ner_chunk") .setWhiteList(["Smoking_Status"]) assertion = AssertionDLModel.pretrained("assertion_oncology_smoking_status_wip", "en", "clinical/models") \ .setInputCols(["sentence", "ner_chunk", "embeddings"]) \ @@ -81,7 +81,7 @@ val document_assembler = new DocumentAssembler() .setOutputCol("document") val sentence_detector = SentenceDetectorDLModel.pretrained("sentence_detector_dl_healthcare","en","clinical/models") - .setInputCols("document") + .setInputCols(Array("document")) .setOutputCol("sentence") val tokenizer = new Tokenizer() @@ -99,10 +99,10 @@ val ner = MedicalNerModel.pretrained("ner_oncology_wip", "en", "clinical/models" val ner_converter = new NerConverter() .setInputCols(Array("sentence", "token", "ner")) .setOutputCol("ner_chunk") - .setWhiteList(Array("Smoking_Status"")) + .setWhiteList(Array("Smoking_Status")) val clinical_assertion = 
AssertionDLModel.pretrained("assertion_oncology_smoking_status_wip","en","clinical/models") - .setInputCols("sentence","ner_chunk","embeddings") + .setInputCols(Array("sentence","ner_chunk","embeddings")) .setOutputCol("assertion") val pipeline = new Pipeline().setStages(Array(document_assembler, @@ -113,7 +113,7 @@ val pipeline = new Pipeline().setStages(Array(document_assembler, ner_converter, assertion)) -val data = Seq("The patient quit smoking three years ago.").toDF("text") +val data = Seq("""The patient quit smoking three years ago.""").toDF("text") val result = pipeline.fit(data).transform(data) ``` @@ -152,6 +152,6 @@ In-house annotated oncology case reports. Absent 0.58 0.94 0.71 16.0 Past 0.88 0.65 0.75 23.0 Present 0.80 0.57 0.67 14.0 - macro avg 0.75 0.72 0.71 53.0 -weighted avg 0.77 0.72 0.72 53.0 -``` \ No newline at end of file + macro-avg 0.75 0.72 0.71 53.0 +weighted-avg 0.77 0.72 0.72 53.0 +``` diff --git a/docs/_posts/mauro-nievoff/2022-10-11-assertion_oncology_treatment_binary_wip_en.md b/docs/_posts/mauro-nievoff/2022-10-11-assertion_oncology_treatment_binary_wip_en.md index 534611df56ca7..54826da922b31 100644 --- a/docs/_posts/mauro-nievoff/2022-10-11-assertion_oncology_treatment_binary_wip_en.md +++ b/docs/_posts/mauro-nievoff/2022-10-11-assertion_oncology_treatment_binary_wip_en.md @@ -34,6 +34,7 @@ This model detects the assertion status of oncology treatment entities. The mode
{% include programmingLanguageSelectScalaPythonNLU.html %} + ```python document_assembler = DocumentAssembler()\ .setInputCol("text")\ @@ -57,7 +58,7 @@ ner = MedicalNerModel.pretrained("ner_oncology_wip", "en", "clinical/models") \ ner_converter = NerConverter() \ .setInputCols(["sentence", "token", "ner"]) \ - .setOutputCol("ner_chunk") .setWhiteList(["Cancer_Surgery"", "Chemotherapy""]) + .setOutputCol("ner_chunk") .setWhiteList(["Cancer_Surgery", "Chemotherapy"]) assertion = AssertionDLModel.pretrained("assertion_oncology_treatment_binary_wip", "en", "clinical/models") \ .setInputCols(["sentence", "ner_chunk", "embeddings"]) \ @@ -81,7 +82,7 @@ val document_assembler = new DocumentAssembler() .setOutputCol("document") val sentence_detector = SentenceDetectorDLModel.pretrained("sentence_detector_dl_healthcare","en","clinical/models") - .setInputCols("document") + .setInputCols(Array("document")) .setOutputCol("sentence") val tokenizer = new Tokenizer() @@ -99,10 +100,10 @@ val ner = MedicalNerModel.pretrained("ner_oncology_wip", "en", "clinical/models" val ner_converter = new NerConverter() .setInputCols(Array("sentence", "token", "ner")) .setOutputCol("ner_chunk") - .setWhiteList(Array("Cancer_Surgery"", "Chemotherapy"")) + .setWhiteList(Array("Cancer_Surgery", "Chemotherapy")) val clinical_assertion = AssertionDLModel.pretrained("assertion_oncology_treatment_binary_wip","en","clinical/models") - .setInputCols("sentence","ner_chunk","embeddings") + .setInputCols(Array("sentence","ner_chunk","embeddings")) .setOutputCol("assertion") val pipeline = new Pipeline().setStages(Array(document_assembler, @@ -113,7 +114,7 @@ val pipeline = new Pipeline().setStages(Array(document_assembler, ner_converter, assertion)) -val data = Seq("The patient underwent a mastectomy two years ago. We recommend to start chemotherapy.").toDF("text") +val data = Seq("""The patient underwent a mastectomy two years ago. 
We recommend to start chemotherapy.""").toDF("text") val result = pipeline.fit(data).transform(data) @@ -153,6 +154,6 @@ In-house annotated oncology case reports. label precision recall f1-score support Hypothetical_Or_Absent 0.78 0.83 0.81 170.0 Present_Or_Past 0.81 0.76 0.78 160.0 - macro avg 0.80 0.79 0.79 330.0 - weighted avg 0.79 0.79 0.79 330.0 -``` \ No newline at end of file + macro-avg 0.80 0.79 0.79 330.0 + weighted-avg 0.79 0.79 0.79 330.0 +``` diff --git a/docs/_posts/mauro-nievoff/2022-10-11-assertion_oncology_wip_en.md b/docs/_posts/mauro-nievoff/2022-10-11-assertion_oncology_wip_en.md index 399ea6dea1f8f..90daa6b31f162 100644 --- a/docs/_posts/mauro-nievoff/2022-10-11-assertion_oncology_wip_en.md +++ b/docs/_posts/mauro-nievoff/2022-10-11-assertion_oncology_wip_en.md @@ -34,6 +34,7 @@ This model detects the assertion status of entities related to oncology (includi
{% include programmingLanguageSelectScalaPythonNLU.html %} + ```python document_assembler = DocumentAssembler()\ .setInputCol("text")\ @@ -57,7 +58,7 @@ ner = MedicalNerModel.pretrained("ner_oncology_wip", "en", "clinical/models") \ ner_converter = NerConverter() \ .setInputCols(["sentence", "token", "ner"]) \ - .setOutputCol("ner_chunk") .setWhiteList(["Cancer_Dx"", "Tumor_Finding"", "Cancer_Surgery"", "Chemotherapy"", "Pathology_Test"", "Imaging_Test""]) + .setOutputCol("ner_chunk") .setWhiteList(["Cancer_Dx", "Tumor_Finding", "Cancer_Surgery", "Chemotherapy", "Pathology_Test", "Imaging_Test"]) assertion = AssertionDLModel.pretrained("assertion_oncology_wip", "en", "clinical/models") \ .setInputCols(["sentence", "ner_chunk", "embeddings"]) \ @@ -81,11 +82,11 @@ val document_assembler = new DocumentAssembler() .setOutputCol("document") val sentence_detector = SentenceDetectorDLModel.pretrained("sentence_detector_dl_healthcare","en","clinical/models") - .setInputCols("document") + .setInputCols(Array("document")) .setOutputCol("sentence") val tokenizer = new Tokenizer() - .setInputCols("sentence") + .setInputCols(Array("sentence")) .setOutputCol("token") val word_embeddings = WordEmbeddingsModel().pretrained("embeddings_clinical", "en", "clinical/models") @@ -99,10 +100,10 @@ val ner = MedicalNerModel.pretrained("ner_oncology_wip", "en", "clinical/models" val ner_converter = new NerConverter() .setInputCols(Array("sentence", "token", "ner")) .setOutputCol("ner_chunk") - .setWhiteList(Array("Cancer_Dx"", "Tumor_Finding"", "Cancer_Surgery"", "Chemotherapy"", "Pathology_Test"", "Imaging_Test"")) + .setWhiteList(Array("Cancer_Dx", "Tumor_Finding", "Cancer_Surgery", "Chemotherapy", "Pathology_Test", "Imaging_Test")) val clinical_assertion = AssertionDLModel.pretrained("assertion_oncology_wip","en","clinical/models") - .setInputCols("sentence","ner_chunk","embeddings") + .setInputCols(Array("sentence","ner_chunk","embeddings")) .setOutputCol("assertion") val pipeline = 
new Pipeline().setStages(Array(document_assembler, @@ -113,7 +114,7 @@ val pipeline = new Pipeline().setStages(Array(document_assembler, ner_converter, assertion)) -val data = Seq("The patient is suspected to have lung cancer. Family history is positive for other cancers. The result of the biopsy was positive.").toDF("text") +val data = Seq("""The patient is suspected to have lung cancer. Family history is positive for other cancers. The result of the biopsy was positive.""").toDF("text") val result = pipeline.fit(data).transform(data) @@ -159,6 +160,6 @@ Hypothetical 0.65 0.57 0.61 229.0 Past 0.90 0.91 0.91 2124.0 Possible 0.64 0.61 0.63 85.0 Present 0.87 0.88 0.88 2121.0 - macro avg 0.78 0.76 0.77 4967.0 -weighted avg 0.87 0.87 0.87 4967.0 -``` \ No newline at end of file + macro-avg 0.78 0.76 0.77 4967.0 +weighted-avg 0.87 0.87 0.87 4967.0 +``` diff --git a/docs/_posts/xusliebana/2021-01-29-deidentify_enriched_clinical_en.md b/docs/_posts/xusliebana/2021-01-29-deidentify_enriched_clinical_en.md index b39d2bbc0d786..4f8c882314d21 100644 --- a/docs/_posts/xusliebana/2021-01-29-deidentify_enriched_clinical_en.md +++ b/docs/_posts/xusliebana/2021-01-29-deidentify_enriched_clinical_en.md @@ -1,124 +1,168 @@ ---- -layout: model -title: Deidentify (Enriched) -author: John Snow Labs -name: deidentify_enriched_clinical -date: 2021-01-29 -task: De-identification -language: en -edition: Healthcare NLP 2.7.2 -spark_version: 2.4 -tags: [deidentify, en, obfuscation, licensed] -supported: true -article_header: -type: cover -use_language_switcher: "Python-Scala-Java" ---- - -## Description - -Deidentify (Large) is a deidentification model. It identifies instances of protected health information in text documents, and it can either obfuscate them (e.g., replacing names with different, fake names) or mask them (e.g., replacing “2020-06-04” with Some faker data). 
This model is useful for maintaining HIPAA compliance when dealing with text documents that contain protected health information. - -## Predicted Entities - -- PHONE -- PATIENT -- COUNTRY -- USERNAME -- LOCATION-OTHER -- DATE -- ID -- DOCTOR -- HOSPITAL -- IDNUM -- AGE -- MEDICALRECORD -- CITY -- FAX -- ZIP -- HEALTHPLAN -- PROFESSION -- BIOID -- URL -- EMAIL -- STATE -- ORGANIZATION -- STREET -- DEVICE - -{:.btn-box} - - -[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/clinical/models/deidentify_enriched_clinical_en_2.7.2_2.4_1611917177874.zip){:.button.button-orange.button-orange-trans.arr.button-icon} - -## How to use - - - -
-{% include programmingLanguageSelectScalaPythonNLU.html %} -```python -nlpPipeline = Pipeline(stages=[documentAssembler, sentenceDetector, tokenizer, word_embeddings, clinical_ner, ner_converter]) - -text =''' -A . Record date : 2093-01-13 , David Hale , M.D . , Name : Hendrickson , Ora MR . # 7194334 Date : 01/13/93 PCP : Oliveira , 25 years-old , Record date : 2079-11-09 . Cocke County Baptist Hospital . 0295 Keats Street -''' -result = model.transform(spark.createDataFrame([[text]]).toDF("text")) - -obfuscation = DeIdentificationModel.pretrained("deidentify_enriched_clinical", "en", "clinical/models") \ -.setInputCols(["sentence", "token", "ner_chunk"]) \ -.setOutputCol("obfuscated") \ -.setMode("obfuscate") - -obfusated_text = obfuscation.transform(result) - -``` -```scala -val nlpPipeline = new Pipeline().setStages(Array(documentAssembler, sentenceDetector, tokenizer, embeddings, clinical_sensitive_entities, nerConverter, de_identification)) - -val data = Seq("A . Record date : 2093-01-13 , David Hale , M.D . , Name : Hendrickson , Ora MR . # 7194334 Date : 01/13/93 PCP : Oliveira , 25 years-old , Record date : 2079-11-09 . Cocke County Baptist Hospital . 0295 Keats Street").toDF("text") -val result = pipeline.fit(data).transform(data) - -val obfuscation = DeIdentificationModel.pretrained("deidentify_enriched_clinical", "en", "clinical/models") -.setInputCols(Array("sentence", "token", "ner_chunk")) -.setOutputCol("obfuscated") -.setMode("obfuscate") - -val obfusatedText = obfuscation.transform(result) -``` - - -{:.nlu-block} -```python -import nlu -nlu.load("en.de_identify.clinical").predict(""" -A . Record date : 2093-01-13 , David Hale , M.D . , Name : Hendrickson , Ora MR . # 7194334 Date : 01/13/93 PCP : Oliveira , 25 years-old , Record date : 2079-11-09 . Cocke County Baptist Hospital . 0295 Keats Street -""") -``` - -
- -## Results - -```bash - sentence deidentified -0 A . A . -1 Record date : 2093-01-13 , David Hale , M.D . Record date : 2093-01-18 , DR. Gregory Kaiser , M.D . -2 , Name : Hendrickson , Ora MR . , Name : Joel Vasquez MR . -3 # 7194334 Date : 01/13/93 PCP : Oliveira , 25 years-old , Record date : 2079-11-09 . # 67696 Date : 01/18/93 PCP : DR. Jennifer Eaton , 25 years-old , Record date : 2079-11-14 . -4 Cocke County Baptist Hospital . San Leandro Hospital – San Leandro . -5 0295 Keats Street 3744 Retreat Avenue -``` - -{:.model-param} -## Model Information - -{:.table-model} -|---|---| -|Model Name:|deidentify_enriched_clinical| -|Compatibility:|Spark NLP 2.7.2+| -|License:|Licensed| -|Edition:|Official| -|Input Labels:|[sentence, token, ner_chunk]| -|Output Labels:|[deidentified]| -|Language:|en| \ No newline at end of file +--- +layout: model +title: Deidentify (Enriched) +author: John Snow Labs +name: deidentify_enriched_clinical +date: 2021-01-29 +task: De-identification +language: en +edition: Spark NLP for Healthcare 2.7.2 +spark_version: 2.4 +tags: [deidentify, en, obfuscation, licensed] +supported: true +article_header: +type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +Deidentify (Enriched) is a deidentification model. It identifies instances of protected health information in text documents, and it can either obfuscate them (e.g., replacing names with different, fake names) or mask them (e.g., replacing “2020-06-04” with `<DATE>`). This model is useful for maintaining HIPAA compliance when dealing with text documents that contain protected health information. 
+ +## Predicted Entities + +- PHONE +- PATIENT +- COUNTRY +- USERNAME +- LOCATION-OTHER +- DATE +- ID +- DOCTOR +- HOSPITAL +- IDNUM +- AGE +- MEDICALRECORD +- CITY +- FAX +- ZIP +- HEALTHPLAN +- PROFESSION +- BIOID +- URL +- EMAIL +- STATE +- ORGANIZATION +- STREET +- DEVICE + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/clinical/models/deidentify_enriched_clinical_en_2.7.2_2.4_1611917177874.zip){:.button.button-orange.button-orange-trans.arr.button-icon} + +## How to use + + + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} + +```python +documentAssembler = DocumentAssembler()\ + .setInputCol('text')\ + .setOutputCol('document') + +sentenceDetector = SentenceDetector() \ + .setInputCols(["document"]) \ + .setOutputCol("sentence") + +tokenizer = Tokenizer() \ + .setInputCols(["sentence"]) \ + .setOutputCol("token") + +word_embeddings = WordEmbeddingsModel.pretrained("embeddings_clinical", "en", "clinical/models")\ + .setInputCols(["sentence", "token"])\ + .setOutputCol("embeddings") + +clinical_ner = MedicalNerModel.pretrained("ner_clinical", "en", "clinical/models") \ + .setInputCols(["sentence", "token", "embeddings"]) \ + .setOutputCol("ner") + +ner_converter = NerConverter() \ + .setInputCols(["sentence", "token", "ner"]) \ + .setOutputCol("ner_chunk") + +obfuscation = DeIdentificationModel.pretrained("deidentify_enriched_clinical", "en", "clinical/models") \ + .setInputCols(["sentence", "token", "ner_chunk"]) \ + .setOutputCol("obfuscated") \ + .setMode("obfuscate") + +nlp_pipeline = Pipeline(stages=[documentAssembler, sentenceDetector, tokenizer, word_embeddings, clinical_ner, ner_converter, obfuscation]) + +data = spark.createDataFrame([["""A . Record date : 2093-01-13 , David Hale , M.D . , Name : Hendrickson , Ora MR . # 7194334 Date : 01/13/93 PCP : Oliveira , 25 years-old , Record date : 2079-11-09 . Cocke County Baptist Hospital . 
0295 Keats Street"""]]).toDF("text") + +result = nlp_pipeline.fit(data).transform(data) + +``` +```scala +val documentAssembler = new DocumentAssembler() + .setInputCol("text") + .setOutputCol("document") + +val sentenceDetector = new SentenceDetector() + .setInputCols("document") + .setOutputCol("sentence") + +val tokenizer = new Tokenizer() + .setInputCols("sentence") + .setOutputCol("token") + +val word_embeddings = WordEmbeddingsModel.pretrained("embeddings_clinical", "en", "clinical/models") + .setInputCols(Array("sentence", "token")) + .setOutputCol("embeddings") + +val clinical_ner = MedicalNerModel.pretrained("ner_clinical", "en", "clinical/models") + .setInputCols(Array("sentence", "token", "embeddings")) + .setOutputCol("ner") + +val ner_converter = new NerConverter() + .setInputCols(Array("sentence", "token", "ner")) + .setOutputCol("ner_chunk") + +val obfuscation = DeIdentificationModel.pretrained("deidentify_enriched_clinical", "en", "clinical/models") + .setInputCols(Array("sentence", "token", "ner_chunk")) + .setOutputCol("obfuscated") + .setMode("obfuscate") + +val nlpPipeline = new Pipeline().setStages(Array(documentAssembler, sentenceDetector, tokenizer, word_embeddings, clinical_ner, ner_converter, obfuscation)) + +val data = Seq("""A . Record date : 2093-01-13 , David Hale , M.D . , Name : Hendrickson , Ora MR . # 7194334 Date : 01/13/93 PCP : Oliveira , 25 years-old , Record date : 2079-11-09 . Cocke County Baptist Hospital . 0295 Keats Street""").toDS.toDF("text") + +val result = nlpPipeline.fit(data).transform(data) + +``` + + +{:.nlu-block} +```python +import nlu +nlu.load("en.de_identify.clinical").predict("""A . Record date : 2093-01-13 , David Hale , M.D . , Name : Hendrickson , Ora MR . # 7194334 Date : 01/13/93 PCP : Oliveira , 25 years-old , Record date : 2079-11-09 . Cocke County Baptist Hospital . 0295 Keats Street""") +``` + +
+ +## Results + +```bash + sentence deidentified +0 A . A . +1 Record date : 2093-01-13 , David Hale , M.D . Record date : 2093-01-18 , DR. Gregory Kaiser , M.D . +2 , Name : Hendrickson , Ora MR . , Name : Joel Vasquez MR . +3 # 7194334 Date : 01/13/93 PCP : Oliveira , 25 years-old , Record date : 2079-11-09 . # 67696 Date : 01/18/93 PCP : DR. Jennifer Eaton , 25 years-old , Record date : 2079-11-14 . +4 Cocke County Baptist Hospital . San Leandro Hospital – San Leandro . +5 0295 Keats Street 3744 Retreat Avenue +``` + +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|deidentify_enriched_clinical| +|Compatibility:|Spark NLP 2.7.2+| +|License:|Licensed| +|Edition:|Official| +|Input Labels:|[sentence, token, ner_chunk]| +|Output Labels:|[deidentified]| +|Language:|en|