Merge pull request #197 from JohnSnowLabs/release/501
Release 501
C-K-Loan committed Sep 11, 2023
2 parents c7d1508 + 726041d commit eb94532
Showing 8 changed files with 288 additions and 138 deletions.
28 changes: 20 additions & 8 deletions nlu/__init__.py
@@ -1,4 +1,4 @@
- __version__ = '4.0.0'
+ __version__ = '5.0.1'

import nlu.utils.environment.env_utils as env_utils

@@ -205,7 +205,6 @@ def load(request: str = 'from_disk', path: Optional[str] = None, verbose: bool =
if path is not None:
logger.info(f'Trying to load nlu pipeline from local hard drive, located at {path}')
pipe = load_nlu_pipe_from_hdd(path, request)
- pipe.nlu_ref = request
return pipe
except Exception as err:
if verbose:
@@ -271,6 +270,10 @@ def auth(HEALTHCARE_LICENSE_OR_JSON_PATH='/content/spark_nlp_for_healthcare.json
return nlu


+ def is_nlu_uid(uid: str):
+     return 'is_nlu_pipe' in uid


def load_nlu_pipe_from_hdd_in_databricks(pipe_path, request) -> NLUPipeline:
"""Either there is a pipeline of models in the path or just one singular model_anno_obj.
If it is a component_list, load the component_list and return it.
@@ -295,7 +298,7 @@ def load_nlu_pipe_from_hdd_in_databricks(pipe_path, request) -> NLUPipeline:
# if dbfs_path_exist(pipe_path):
# Resource in path is a pipeline
if is_pipe(pipe_path):
- pipe_components = get_trained_component_list_for_nlp_pipe_ref('en', nlu_ref, nlu_ref, pipe_path, False)
+ pipe_components, uid = get_trained_component_list_for_nlp_pipe_ref('en', nlu_ref, nlu_ref, pipe_path, False)
# Resource in path is a single model_anno_obj
elif is_model(pipe_path):
c = offline_utils.verify_and_create_model(pipe_path)
@@ -304,13 +307,12 @@ def load_nlu_pipe_from_hdd_in_databricks(pipe_path, request) -> NLUPipeline:
return PipelineCompleter.check_and_fix_nlu_pipeline(pipe)

else:
- #fallback pipe
- pipe_components = get_trained_component_list_for_nlp_pipe_ref('en', nlu_ref, nlu_ref, pipe_path, False)
+ # fallback pipe
+ pipe_components, uid = get_trained_component_list_for_nlp_pipe_ref('en', nlu_ref, nlu_ref, pipe_path, False)
for c in pipe_components: pipe.add(c, nlu_ref, pretrained_pipe_component=True)
return pipe



def load_nlu_pipe_from_hdd(pipe_path, request) -> NLUPipeline:
"""Either there is a pipeline of models in the path or just one singular model_anno_obj.
If it is a component_list, load the component_list and return it.
@@ -319,14 +321,15 @@ def load_nlu_pipe_from_hdd(pipe_path, request) -> NLUPipeline:
if is_running_in_databricks():
return load_nlu_pipe_from_hdd_in_databricks(pipe_path, request)
pipe = NLUPipeline()
+ pipe.nlu_ref = request
nlu_ref = request # pipe_path
if os.path.exists(pipe_path):

# Resource in path is a pipeline
if offline_utils.is_pipe(pipe_path):
# language, nlp_ref, nlu_ref,path=None, is_licensed=False
# todo deduct lang and if Licensed or not
- pipe_components = get_trained_component_list_for_nlp_pipe_ref('en', nlu_ref, nlu_ref, pipe_path, False)
+ pipe_components, uid = get_trained_component_list_for_nlp_pipe_ref('en', nlu_ref, nlu_ref, pipe_path, False)
# Resource in path is a single model_anno_obj
elif offline_utils.is_model(pipe_path):
c = offline_utils.verify_and_create_model(pipe_path)
@@ -337,7 +340,16 @@ def load_nlu_pipe_from_hdd(pipe_path, request) -> NLUPipeline:
print(
f"Could not load model_anno_obj in path {pipe_path}. Make sure the jsl_folder contains either a stages subfolder or a metadata subfolder.")
raise ValueError
- for c in pipe_components: pipe.add(c, nlu_ref, pretrained_pipe_component=True)
+ for c in pipe_components:
+     pipe.add(c, nlu_ref, pretrained_pipe_component=True)
+ if is_nlu_uid(uid):
+     data = json.loads(uid)
+     print(data)
+     pipe.nlu_ref = data['0']['nlu_ref']
+     for i, c in enumerate(pipe.components):
+         c.nlu_ref = data[str(i + 1)]['nlu_ref']
+         c.nlp_ref = data[str(i + 1)]['nlp_ref']
+         c.loaded_from_pretrained_pipe = data[str(i + 1)]['loaded_from_pretrained_pipe']
return pipe

else:
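Taken together, the changes to this file let load() recover pipeline metadata that the new save() in nlu/pipe/pipeline.py (further down in this diff) hides inside the Spark model UID as a JSON string; is_nlu_uid() detects such UIDs with a simple substring test. A minimal sketch of the decode step, assuming a UID produced by that save() path (the helper name decode_nlu_uid is illustrative, not part of the codebase):

    import json

    def decode_nlu_uid(uid: str):
        # After a json.dumps/json.loads round-trip all dict keys are strings,
        # hence data['0'] rather than data[0].
        if 'is_nlu_pipe' not in uid:  # same substring test as is_nlu_uid()
            return None, []
        data = json.loads(uid)
        pipe_ref = data['0']['nlu_ref']  # pipeline-level nlu_ref
        # keys '1'..'n' hold per-component refs, in pipeline order
        component_refs = [data[str(i)] for i in range(1, len(data) - 1)]
        return pipe_ref, component_refs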
16 changes: 11 additions & 5 deletions nlu/pipe/component_resolution.py
@@ -2,7 +2,7 @@
Contains methods used to resolve a NLU reference to a NLU component_to_resolve.
Handler for getting default components, etc.
'''
- from typing import Dict, List, Union, Optional, Callable
+ from typing import Dict, List, Union, Optional, Callable, Tuple

from pyspark.ml import PipelineModel, Pipeline
from sparknlp.pretrained import PretrainedPipeline, LightPipeline
@@ -155,7 +155,7 @@ def nlu_ref_to_component(nlu_ref, detect_lang=False, authenticated=False) -> Uni
lang, nlu_ref, nlp_ref, license_type, is_pipe, model_params = nlu_ref_to_nlp_metadata(nlu_ref)

if is_pipe:
- resolved_component = get_trained_component_list_for_nlp_pipe_ref(lang, nlp_ref, nlu_ref,
+ resolved_component, _ = get_trained_component_list_for_nlp_pipe_ref(lang, nlp_ref, nlu_ref,
license_type=license_type)
else:
resolved_component = get_trained_component_for_nlp_model_ref(lang, nlu_ref, nlp_ref, license_type, model_params)
@@ -179,7 +179,7 @@ def get_trainable_component_for_nlu_ref(nlu_ref) -> NluComponent:

def get_trained_component_list_for_nlp_pipe_ref(language, nlp_ref, nlu_ref, path=None,
license_type: LicenseType = Licenses.open_source,
- ) -> List[NluComponent]:
+ ) -> Tuple[List[NluComponent], str]:
"""
creates a list of components from a Spark NLP Pipeline reference
1. download pipeline
@@ -190,7 +190,9 @@ def get_trained_component_list_for_nlp_pipe_ref(language, nlp_ref, nlu_ref, path
:param language: language of the pipeline
:param nlp_ref: Reference to a spark nlp pretrained pipeline
:param path: Load component_list from HDD
- :return: Each element of the Spark NLP pipeline wrapped as a NLU component_to_resolve inside a list
+ :return: Tuple:
+     first element: each stage of the Spark NLP pipeline wrapped as a NLU component_to_resolve, in a list
+     second element: UID of the pipeline
"""
logger.info(f'Building pretrained pipe for nlu_ref={nlu_ref} nlp_ref={nlp_ref}')
if 'language' in nlp_ref:
@@ -199,16 +201,20 @@ def get_trained_component_list_for_nlp_pipe_ref(language, nlp_ref, nlu_ref, path
if path is None:
if license_type != Licenses.open_source:
pipe = PretrainedPipeline(nlp_ref, lang=language, remote_loc='clinical/models')
+ uid = pipe.model.uid

else:
pipe = PretrainedPipeline(nlp_ref, lang=language)
+ uid = pipe.model.uid
iterable_stages = pipe.light_model.pipeline_model.stages
else:
pipe = LightPipeline(PipelineModel.load(path=path))
+ uid = pipe.pipeline_model.uid
iterable_stages = pipe.pipeline_model.stages
constructed_components = get_component_list_for_iterable_stages(iterable_stages, language, nlp_ref, nlu_ref,
license_type)
return ComponentUtils.set_storage_ref_attribute_of_embedding_converters(
- PipeUtils.set_column_values_on_components_from_pretrained_pipe(constructed_components))
+ PipeUtils.set_column_values_on_components_from_pretrained_pipe(constructed_components)), uid



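Since get_trained_component_list_for_nlp_pipe_ref() now returns a tuple, every call site unpacks two values, as the hunks above and in nlu/__init__.py show. A hedged usage sketch (explain_document_dl / en.explain.dl are merely examples of a valid Spark NLP and NLU reference pair):

    # components: each pipeline stage wrapped as a NluComponent
    # uid: the Spark PipelineModel UID, which save() may later overwrite
    #      with JSON metadata that is_nlu_uid() can detect
    components, uid = get_trained_component_list_for_nlp_pipe_ref(
        'en', 'explain_document_dl', 'en.explain.dl')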
@@ -206,7 +206,6 @@ def extract_base_sparknlp_features(row: pd.Series, configs: SparkNLPExtractorCon

return {**beginnings, **endings, **results, **annotator_types, **embeddings, **origins} # Merge dicts NLP output


def extract_sparknlp_metadata(row: pd.Series, configs: SparkNLPExtractorConfig) -> dict:
"""
Extract base features common to all Spark NLP annotators
6 changes: 6 additions & 0 deletions nlu/pipe/nlu_component.py
@@ -72,6 +72,9 @@ def __init__(self,
trained_mirror_anno: Optional[JslAnnoId] = None,
applicable_file_types: List[str] = None, # Used for OCR annotators to deduct applicable file types
is_trained: bool = True, # Set to true for trainable annotators
+ requires_binary_format: bool = False,  # Set to True for OCR annotators that require raw binary input
+ requires_image_format: bool = False,  # Set to True for OCR annotators that require image input
+ is_visual_annotator: bool = False,  # Set to True for visual/OCR annotators
):
self.name = name
self.type = type
@@ -110,6 +113,9 @@ def __init__(self,
self.trained_mirror_anno = trained_mirror_anno
self.applicable_file_types = applicable_file_types
self.is_trained = is_trained
+ self.requires_binary_format = requires_binary_format
+ self.requires_image_format = requires_image_format
+ self.is_visual_annotator = is_visual_annotator

def set_metadata(self, jsl_anno_object: Union[AnnotatorApproach, AnnotatorModel],
nlu_ref: str,
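The three new constructor flags let visual/OCR annotators declare which input representation they need. A minimal stand-in showing only the new fields (the real NluComponent constructor takes many more arguments):

    from dataclasses import dataclass

    @dataclass
    class VisualAnnotatorFlags:
        requires_binary_format: bool = False  # reads raw binary file content
        requires_image_format: bool = False   # reads Spark image rows
        is_visual_annotator: bool = False     # marks an OCR/visual stage

    # e.g. an image-to-text annotator would declare:
    img2text_flags = VisualAnnotatorFlags(requires_image_format=True,
                                          is_visual_annotator=True)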
23 changes: 20 additions & 3 deletions nlu/pipe/pipeline.py
@@ -1,3 +1,4 @@
+ import json
import logging
from typing import Union

@@ -57,7 +58,8 @@ def __init__(self):
self.has_span_classifiers = False
self.prefer_light = False
self.has_table_qa_models = False

+ self.requires_image_format = False
+ self.requires_binary_format = False
def add(self, component: NluComponent, nlu_reference=None, pretrained_pipe_component=False,
name_to_add='', idx=None):
'''
@@ -418,11 +420,26 @@ def drop_irrelevant_cols(self, cols, keep_origin_index=False):
if keep_origin_index == False and 'origin_index' in cols: cols.remove('origin_index')
return cols

- def save(self, path, component='entire_pipeline', overwrite=False):
+ def save(self, path, component='entire_pipeline', overwrite=True):
+     # serialize data
+     data = {}
+     data[0] = {'nlu_ref': self.nlu_ref}
+     data['is_nlu_pipe'] = True
+     for i, c in enumerate(self.components):
+         data[i + 1] = {'nlu_ref': c.nlu_ref, 'nlp_ref': c.nlp_ref,
+                        'loaded_from_pretrained_pipe': c.loaded_from_pretrained_pipe}
+
+     data = json.dumps(data)
if not self.is_fitted or not hasattr(self, 'vanilla_transformer_pipe'):
self.fit()
self.is_fitted = True

# self.vanilla_transformer_pipe.extractParamMap()
+ if hasattr(self, 'nlu_ref'):
+     """ ATTRS TO SAVE FOR EACH COMPONENT / PIPELINE:
+     - nlp ref / nlu ref
+     - is loaded_from_pipe
+     """
+     self.vanilla_transformer_pipe._resetUid(data)
if component == 'entire_pipeline':
if overwrite:
self.vanilla_transformer_pipe.write().overwrite().save(path)
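This is the save-side counterpart of the load logic in nlu/__init__.py: pipe- and component-level refs are serialized to JSON and written into the PipelineModel's UID via PySpark's _resetUid. A condensed sketch of the flow (embed_nlu_metadata is an illustrative name; note that _resetUid is a private PySpark API, so this persistence trick depends on Spark's current UID handling):

    import json

    def embed_nlu_metadata(spark_pipe, nlu_ref, components):
        # json.dumps stringifies the integer keys, so the loader reads data['0']
        data = {0: {'nlu_ref': nlu_ref}, 'is_nlu_pipe': True}
        for i, c in enumerate(components):
            data[i + 1] = {'nlu_ref': c.nlu_ref, 'nlp_ref': c.nlp_ref,
                           'loaded_from_pretrained_pipe': c.loaded_from_pretrained_pipe}
        spark_pipe._resetUid(json.dumps(data))  # the UID now doubles as a metadata store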
7 changes: 6 additions & 1 deletion nlu/pipe/utils/pipe_utils.py
@@ -667,8 +667,13 @@ def add_metadata_to_pipe(pipe: NLUPipeline):

for c in pipe.components:
# Check for OCR components
- if c.jsl_anno_py_class in py_class_to_anno_id.keys():
+ if c.jsl_anno_py_class in py_class_to_anno_id.keys() or c.is_visual_annotator:
      pipe.contains_ocr_components = True
+ if c.requires_image_format:
+     pipe.requires_image_format = True
+ if c.requires_binary_format:
+     pipe.requires_binary_format = True

# Check for licensed components
if c.license in [Licenses.ocr, Licenses.hc]:
pipe.has_licensed_components = True
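The flags lifted onto the pipeline here are presumably consumed when NLU decides how to feed input files to a fitted pipe. A hypothetical downstream check (the function and reader names are assumptions for illustration, not part of this diff):

    def choose_reader(pipe):
        # Prefer image rows when any annotator needs them, else raw bytes
        if pipe.requires_image_format:
            return 'spark_image_reader'
        if pipe.requires_binary_format:
            return 'binary_file_reader'
        return 'text_reader'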