Merge pull request #247 from JohnSnowLabs/release/514

Release/514
JohnSnowLabs · Feb 8, 2024 · 6f6691a · 6f6691a
2 parents 6523f6c + 2be6aae
commit 6f6691a
Show file tree

Hide file tree

Showing 14 changed files with 99 additions and 13 deletions.
diff --git a/nlu/__init__.py b/nlu/__init__.py
@@ -1,5 +1,4 @@
-__version__ = '5.1.3'
-
+__version__ = '5.1.4'
 
 
 import nlu.utils.environment.env_utils as env_utils

diff --git a/nlu/components/classifiers/span_medical/__init__.py b/nlu/components/classifiers/span_medical/__init__.py
diff --git a/nlu/components/classifiers/span_medical/span_medical.py b/nlu/components/classifiers/span_medical/span_medical.py
@@ -0,0 +1,18 @@
+# Factory for the healthcare `MedicalQuestionAnswering` annotator. Both
+# methods configure the annotator to read the "document_question" and
+# "context" input columns and to write its result to the "answer" column.
+class SpanMedical:
+    # Return the annotator loaded via `pretrained()` with no arguments.
+    @staticmethod
+    def get_default_model():
+        # Imported inside the method, so importing this module does not by
+        # itself import `sparknlp_jsl`.
+        from sparknlp_jsl.annotator import MedicalQuestionAnswering
+
+        return MedicalQuestionAnswering.pretrained() \
+            .setInputCols(["document_question", "context"]) \
+            .setOutputCol("answer")
+
+
+
+    # Return the annotator loaded via `pretrained(name, language, bucket)`,
+    # with the same input/output column wiring as `get_default_model`.
+    # `bucket` defaults to None and is forwarded unchanged.
+    @staticmethod
+    def get_pretrained_model(name, language, bucket=None):
+        from sparknlp_jsl.annotator import MedicalQuestionAnswering
+
+        return MedicalQuestionAnswering.pretrained(name, language, bucket) \
+            .setInputCols(["document_question", "context"]) \
+            .setOutputCol("answer")
diff --git a/nlu/pipe/col_substitution/col_substitution_HC.py b/nlu/pipe/col_substitution/col_substitution_HC.py
@@ -395,3 +395,28 @@ def substitute_generic_classifier_parser_cols(c, cols, is_unique=True, nlu_ident
             logger.info(f'Dropping unmatched metadata_col={col} for c={c}')
         # new_cols[col]= f"{new_base_name}_confidence"
     return new_cols
+# Build a rename map for the output columns of the healthcare QA span
+# classifier. Each name in `cols` that matches one of the substring checks
+# below is mapped to a new name derived from the fixed base 'answer'; names
+# that match none of the checks are left out of the returned dict.
+# `c` and `nlu_identifier` are accepted but not used by this function.
+def substitute_hc_span_classifier_cols(c, cols, nlu_identifier=True):
+    """
+    QA classifier
+    """
+    new_cols = {}
+    # Disabled variant that would prefix the base name with `nlu_identifier`;
+    # the base name is currently fixed to 'answer'.
+    #new_base_name = 'answer' if nlu_identifier == 'UNIQUE' else f'{nlu_identifier}_answer'
+    new_base_name = 'answer'
+    for col in cols:
+        if 'answer_results' in col:
+            new_cols[col] = f'{new_base_name}'
+        # NOTE(review): this is a second `if`, not an `elif`. A column that
+        # contains 'answer_results_score' also matched the check above, so it
+        # is first mapped to 'answer' and then overwritten with
+        # 'answer_confidence' here. The `elif` branches below belong to this
+        # `if`, so they are also evaluated for a column that matched only
+        # 'answer_results' and would overwrite its mapping if one of them hit.
+        if 'answer_results_score' in col:
+            new_cols[col] = f'{new_base_name}_confidence'
+
+        elif 'span_start_score' in col:
+            new_cols[col] = f'{new_base_name}_start_confidence'
+        elif 'span_end_score' in col:
+            new_cols[col] = f'{new_base_name}_end_confidence'
+        # The two position checks skip any column name that contains 'score'.
+        elif 'start' in col and not 'score' in col:
+            new_cols[col] = f'{new_base_name}_start'
+        elif 'end' in col and not 'score' in col:
+            new_cols[col] = f'{new_base_name}_end'
+        elif 'sentence' in col:
+            new_cols[col] = f'{new_base_name}_sentence'
+
+    # Maps original column name -> substituted column name.
+    return new_cols
diff --git a/nlu/pipe/nlu_component.py b/nlu/pipe/nlu_component.py
@@ -76,6 +76,7 @@ def __init__(self,
                  requires_image_format: bool = False,  # Set to true for OCR annotators that require image format
                  is_visual_annotator: bool = False,  # Set to true for OCR annotators that require image format
                  is_light_pipe_incompatible: bool = False,  # Set to true for OCR annotators that require image format
+                 prefer_light_pipe: bool = False,  # Set True for annos that should run in light pipe
                  ):
         self.name = name
         self.type = type
@@ -118,6 +119,7 @@ def __init__(self,
         self.requires_image_format = requires_image_format
         self.is_visual_annotator = is_visual_annotator
         self.is_light_pipe_incompatible = is_light_pipe_incompatible
+        self.prefer_light_pipe = prefer_light_pipe
 
     def set_metadata(self, jsl_anno_object: Union[AnnotatorApproach, AnnotatorModel],
                      nlu_ref: str,

diff --git a/nlu/pipe/utils/data_conversion_utils.py b/nlu/pipe/utils/data_conversion_utils.py
@@ -13,18 +13,25 @@
 from pyspark.sql.types import StringType, StructType, StructField
 
 
+# Exception type for failures while parsing input data with NLU. It derives
+# directly from `Exception`, so handlers written for `ValueError` will not
+# catch it.
+class NluDataParseException(Exception):
+    """Custom exception class"""
+
+    # `message` defaults to a generic description. It is stored on the
+    # instance as `self.message` and also passed to `Exception.__init__`.
+    def __init__(self, message="An error occurred parsing data with NLU"):
+        self.message = message
+        super().__init__(self.message)
+
 class DataConversionUtils:
     # Modin as well, but it is optional, so we don't import the type yet
     supported_types = [pyspark.sql.DataFrame, pd.DataFrame, pd.Series, np.ndarray]
 
     @staticmethod
     def except_text_col_not_found(cols):
-        raise ValueError(
+        raise NluDataParseException(
             f'Could not find column named "text" in input Pandas Dataframe. Please ensure one column named such exists. Columns in DF are : {cols} ')
 
     @staticmethod
     def except_invalid_question_data_format(cols):
-        raise ValueError(
+        raise NluDataParseException(
             f'You input data format is invalid for question answering with span classification.'
             f'Make sure you have at least 2 columns in you dataset, named context/question  for pandas Dataframes'
             f'For Strings/Iterables/Tuples make sure to use the format `question|||context` or (question,context) ')
@@ -301,7 +308,6 @@ def to_spark_df(data, spark_sess, raw_text_column='text', is_span_data=False, is
                 # TODO invalid Table Data Format Exception
                 pass
             if isinstance(data[0], str):
-
                 return DataConversionUtils.table_question_str_to_sdf(data, spark_sess)
             if isinstance(data[0], pd.DataFrame):
                 return DataConversionUtils.table_question_pdf_to_sdf(data, spark_sess)
@@ -321,6 +327,8 @@ def to_spark_df(data, spark_sess, raw_text_column='text', is_span_data=False, is
                         return DataConversionUtils.question_tuple_iterable_to_sdf(data, spark_sess)
                     elif isinstance(data[0], str):
                         return DataConversionUtils.question_str_iterable_to_sdf(data, spark_sess)
+            except NluDataParseException as err :
+                raise err
             except:
                 ValueError("Data could not be converted to Spark Dataframe for internal conversion.")
         else:

diff --git a/nlu/pipe/utils/pipe_utils.py b/nlu/pipe/utils/pipe_utils.py
@@ -668,6 +668,9 @@ def add_metadata_to_pipe(pipe: NLUPipeline):
 
         for c in pipe.components:
             # Check for OCR components
+            if c.prefer_light_pipe:
+                pipe.prefer_light = True
+
             if c.jsl_anno_py_class in py_class_to_anno_id.keys() or c.is_visual_annotator:
                 pipe.contains_ocr_components = True
                 if c.requires_image_format:

diff --git a/nlu/pipe/utils/predict_helper.py b/nlu/pipe/utils/predict_helper.py
@@ -8,7 +8,7 @@
 from sparknlp.common import AnnotatorType
 
 from nlu.pipe.utils.audio_data_conversion_utils import AudioDataConversionUtils
-from nlu.pipe.utils.data_conversion_utils import DataConversionUtils
+from nlu.pipe.utils.data_conversion_utils import DataConversionUtils, NluDataParseException
 from nlu.pipe.utils.ocr_data_conversion_utils import OcrDataConversionUtils
 
 logger = logging.getLogger('nlu')
@@ -211,8 +211,6 @@ def __db_endpoint_predict__(pipe, data):
     1) parse pred params from first row maybe
     2) serialize/deserialize img
     """
-    print("CUSOTM NLU MODE!")
-    print(data.columns)
     params = PredictParams.maybe_from_pandas_df(data)
     if params:
         params = params.dict()
@@ -366,12 +364,14 @@ def __predict__(pipe, data, output_level, positions, keep_stranger_features, met
         try:
             return __predict_standard_spark(pipe, data, output_level, positions, keep_stranger_features, metadata,
                                             drop_irrelevant_cols, return_spark_df, get_embeddings)
+        except NluDataParseException as err:
+            logger.warning(f"Predictions Failed={err}")
+            raise err
         except Exception as err:
             logger.warning(f"Predictions Failed={err}")
             pipe.print_exception_err(err)
             raise Exception("Failure to process data with NLU")
 
-
 def debug_print_pipe_cols(pipe):
     for c in pipe.components:
         print(f'{c.spark_input_column_names}->{c.name}->{c.spark_output_column_names}')
diff --git a/nlu/spellbook.py b/nlu/spellbook.py
@@ -10598,7 +10598,7 @@ class Spellbook:
                'de.deid.pipeline': 'german_deid_pipeline_spark24',
                'de.med_ner.deid_generic.pipeline': 'ner_deid_generic_pipeline'},
         'en': {
-
+            'en.answer_question.clinical_notes_onnx.pipeline': 'clinical_notes_qa_base_onnx_pipeline',
             'en.classify.bert_sequence.binary_rct_biobert.pipeline': 'bert_sequence_classifier_binary_rct_biobert_pipeline',
             'en.classify.bert_sequence.vop_hcp_consult.pipeline': 'bert_sequence_classifier_vop_hcp_consult_pipeline',
             'en.classify.bert_sequence.vop_drug_side_effect.pipeline': 'bert_sequence_classifier_vop_drug_side_effect_pipeline',
@@ -10634,6 +10634,7 @@ class Spellbook:
             'en.explain_doc.clinical_ade': 'explain_clinical_doc_ade',
             'en.explain_doc.clinical_radiology.pipeline': 'explain_clinical_doc_radiology',
             'en.explain_doc.era': 'explain_clinical_doc_era',
+            'en.explain_doc.clinical_granular': 'explain_clinical_doc_granular',
             'en.icd10_icd9.mapping': 'icd10_icd9_mapping',
             'en.icd10cm.umls.mapping': 'icd10cm_umls_mapping',
             'en.icd10cm_resolver.pipeline': 'icd10cm_resolver_pipeline',
@@ -10761,10 +10762,12 @@ class Spellbook:
             'en.resolve.medication': 'medication_resolver_pipeline',
             'en.resolve.medication_transform.pipeline': 'medication_resolver_transform_pipeline',
             'en.rxnorm.umls.mapping': 'rxnorm_umls_mapping',
+            'en.rxnorm.mes.mapping': 'rxnorm_mesh_mapping',
             'en.snomed.umls.mapping': 'snomed_umls_mapping',
             'en.spell.clinical.pipeline': 'spellcheck_clinical_pipeline',
             'en.summarize.biomedical_pubmed.pipeline':'summarizer_biomedical_pubmed_pipeline',
             'en.summarize.clinical_guidelines_large.pipeline': 'summarizer_clinical_guidelines_large_pipeline',
+            'en.summarize.clinical_laymen_onnx.pipeline': 'summarizer_clinical_laymen_onnx_pipeline',
             'en.summarize.clinical_jsl_augmented.pipeline': 'summarizer_clinical_jsl_augmented_pipeline',
             'en.summarize.clinical_questions.pipeline': 'summarizer_clinical_questions_pipeline',
             'en.summarize.generic_jsl.pipeline': 'summarizer_generic_jsl_pipeline',
@@ -11105,6 +11108,7 @@ class Spellbook:
                                                   'en.med_ner.tumour': 'nerdl_tumour_demo',
                                                   'en.med_ner.vop': 'ner_vop',
                                                   'en.med_ner.vop_emb_clinical_large': 'ner_vop_emb_clinical_large',
+                                                  'en.med_ner.vop_langtest': 'ner_vop_langtest',
                                                   'en.mesh_to_umls': 'mesh_umls_mapper',
                                                   'en.ner.clinical_trials_abstracts': 'ner_clinical_trials_abstracts',
                                                   'en.ner.drug_development_trials': 'bert_token_classifier_drug_development_trials',
@@ -16830,6 +16834,7 @@ class Spellbook:
                              'nerdl_tumour_demo': 'MedicalNerModel',
                              'ner_vop': 'MedicalNerModel',
                              'ner_vop_emb_clinical_large': 'MedicalNerModel',
+                             'ner_vop_langtest': 'MedicalNerModel',
                              'ngram': 'NGramGenerator',
                              'nl': 'RoBertaEmbeddings',
                              'nli_mpnet_base_v2': 'MPNetEmbeddings',

diff --git a/nlu/universe/annotator_class_universe.py b/nlu/universe/annotator_class_universe.py
@@ -15,7 +15,7 @@ class AnnoClassRef:
     JSL_anno2_py_class: Dict[JslAnnoId, JslAnnoPyClass] = {
 
         A_N.E5_SENTENCE_EMBEDDINGS: 'E5Embeddings',
-        A_N.INSTRUCTOR_SENTENCE_EMBEDDINGS:'InstructorEmbeddings',
+        A_N.INSTRUCTOR_SENTENCE_EMBEDDINGS: 'InstructorEmbeddings',
 
         A_N.WHISPER_FOR_CTC: 'WhisperForCTC',
         A_N.HUBERT_FOR_CTC: 'HubertForCTC',
@@ -240,6 +240,7 @@ class AnnoClassRef:
 
     }
     JSL_anno_HC_ref_2_py_class: Dict[JslAnnoId, JslAnnoPyClass] = {
+        HC_A_N.MEDICAL_QUESTION_ANSWERING: 'MedicalQuestionAnswering',
         HC_A_N.MEDICAL_TEXT_GENERATOR: 'MedicalTextGenerator',
         HC_A_N.MEDICAL_SUMMARIZER:'MedicalSummarizer',
         HC_A_N.ZERO_SHOT_NER: 'ZeroShotNerModel',

diff --git a/nlu/universe/component_universes.py b/nlu/universe/component_universes.py
@@ -45,6 +45,7 @@
 from nlu.components.classifiers.span_longformer.span_longformer import SpanLongFormerClassifier
 from nlu.components.classifiers.span_roberta.span_roberta import SpanRobertaClassifier
 from nlu.components.classifiers.span_xlm_roberta.span_xlm_roberta import SpanXlmRobertaClassifier
+from nlu.components.classifiers.span_medical.span_medical import SpanMedical
 from nlu.components.classifiers.token_albert.token_albert import TokenAlbert
 from nlu.components.classifiers.token_bert.token_bert import TokenBert
 from nlu.components.classifiers.token_bert_healthcare.token_bert_healthcare import TokenBertHealthcare
@@ -3278,6 +3279,27 @@ class ComponentUniverse:
                                                       computation_context=ComputeContexts.spark,
                                                       output_context=ComputeContexts.spark,
                                                       ),
+        H_A.MEDICAL_QUESTION_ANSWERING: partial(NluComponent,
+                                                      name=H_A.MEDICAL_QUESTION_ANSWERING,
+                                                      jsl_anno_class_id= H_A.MEDICAL_QUESTION_ANSWERING,
+                                                      jsl_anno_py_class= ACR.JSL_anno_HC_ref_2_py_class[
+                                                          H_A.MEDICAL_QUESTION_ANSWERING],
+                                                      node= NLP_HC_FEATURE_NODES.nodes[
+                                                          H_A.MEDICAL_QUESTION_ANSWERING],
+                                                      get_default_model= SpanMedical.get_default_model,
+                                                      get_pretrained_model= SpanMedical.get_pretrained_model,
+                                                      type= T.QUESTION_SPAN_CLASSIFIER,
+                                                      pdf_extractor_methods={
+                                                          'default': default_span_classifier_config,
+                                                          'default_full': default_full_span_classifier_config, },
+                                                      pdf_col_name_substitutor=substitute_hc_span_classifier_cols,
+                                                      output_level=L.INPUT_DEPENDENT_DOCUMENT_CLASSIFIER,
+                                                      description='TODO',
+                                                      provider=ComponentBackends.hc,
+                                                      license=Licenses.hc,
+                                                      computation_context=ComputeContexts.spark,
+                                                      output_context=ComputeContexts.spark,
+                                                      ),
 
         A.MULTI_DOCUMENT_ASSEMBLER: partial(NluComponent,
                                             name=A.MULTI_DOCUMENT_ASSEMBLER,
@@ -3880,6 +3902,7 @@ class ComponentUniverse:
                                             ),
 
         H_A.CHUNK_MAPPER_MODEL: partial(NluComponent,
+                                        prefer_light_pipe=True,
                                         name=H_A.CHUNK_MAPPER_MODEL,
                                         type=T.CHUNK_MAPPER,
                                         get_default_model=ChunkMapper.get_default_model,

diff --git a/nlu/universe/feature_node_ids.py b/nlu/universe/feature_node_ids.py
@@ -303,6 +303,7 @@ class NLP_HC_NODE_IDS:  # or Mode Node?
     ENTITY_CHUNK_EMBEDDING = JslAnnoId('entity_chunk_embedding')
     MEDICAL_SUMMARIZER = JslAnnoId('med_summarizer')
     MEDICAL_TEXT_GENERATOR = JslAnnoId('med_text_generator')
+    MEDICAL_QUESTION_ANSWERING = JslAnnoId('med_question_answering')
 
 class OCR_NODE_IDS:
     """All available Feature nodes in OCR

diff --git a/nlu/universe/feature_node_universes.py b/nlu/universe/feature_node_universes.py
@@ -392,6 +392,8 @@ class NLP_HC_FEATURE_NODES():
     H_F = NLP_HC_FEATURES
     # HC Feature Nodes
     nodes = {
+        A.MEDICAL_QUESTION_ANSWERING: NlpFeatureNode(A.MEDICAL_QUESTION_ANSWERING, [F.DOCUMENT_QUESTION, F.DOCUMENT_QUESTION_CONTEXT], [F.CLASSIFIED_SPAN]),
+
         A.MEDICAL_TEXT_GENERATOR: NlpFeatureNode(A.MEDICAL_TEXT_GENERATOR, [F.DOCUMENT], [F.DOCUMENT_GENERATED]),
 
         A.MEDICAL_SUMMARIZER: NlpFeatureNode(A.MEDICAL_SUMMARIZER, [F.DOCUMENT], [F.DOCUMENT_GENERATED]),

diff --git a/setup.py b/setup.py
@@ -27,8 +27,7 @@
 
     name='nlu',
 
-    version='5.1.3',
-
+    version='5.1.4',
 
     description='John Snow Labs NLU provides state of the art algorithms for NLP&NLU with 20000+ of pretrained models in 200+ languages. It enables swift and simple development and research with its powerful Pythonic and Keras inspired API. It is powerd by John Snow Labs powerful Spark NLP library.',