Merge pull request #197 from JohnSnowLabs/release/501
Release 501
C-K-Loan committed Sep 11, 2023
2 parents c7d1508 + 726041d commit eb94532
Showing 8 changed files with 288 additions and 138 deletions.
28 changes: 20 additions & 8 deletions nlu/__init__.py
@@ -1,4 +1,4 @@
- __version__ = '4.0.0'
+ __version__ = '5.0.1'

import nlu.utils.environment.env_utils as env_utils

@@ -205,7 +205,6 @@ def load(request: str = 'from_disk', path: Optional[str] = None, verbose: bool =
if path is not None:
logger.info(f'Trying to load nlu pipeline from local hard drive, located at {path}')
pipe = load_nlu_pipe_from_hdd(path, request)
- pipe.nlu_ref = request
return pipe
except Exception as err:
if verbose:
@@ -271,6 +270,10 @@ def auth(HEALTHCARE_LICENSE_OR_JSON_PATH='/content/spark_nlp_for_healthcare.json
return nlu


+ def is_nlu_uid(uid: str):
+     return 'is_nlu_pipe' in uid


def load_nlu_pipe_from_hdd_in_databricks(pipe_path, request) -> NLUPipeline:
"""Either there is a pipeline of models in the path or just one singular model_anno_obj.
If it is a component_list, load the component_list and return it.
@@ -295,7 +298,7 @@ def load_nlu_pipe_from_hdd_in_databricks(pipe_path, request) -> NLUPipeline:
# if dbfs_path_exist(pipe_path):
# Resource in path is a pipeline
if is_pipe(pipe_path):
- pipe_components = get_trained_component_list_for_nlp_pipe_ref('en', nlu_ref, nlu_ref, pipe_path, False)
+ pipe_components, uid = get_trained_component_list_for_nlp_pipe_ref('en', nlu_ref, nlu_ref, pipe_path, False)
# Resource in path is a single model_anno_obj
elif is_model(pipe_path):
c = offline_utils.verify_and_create_model(pipe_path)
@@ -304,13 +307,12 @@ def load_nlu_pipe_from_hdd_in_databricks(pipe_path, request) -> NLUPipeline:
return PipelineCompleter.check_and_fix_nlu_pipeline(pipe)

else:
- #fallback pipe
- pipe_components = get_trained_component_list_for_nlp_pipe_ref('en', nlu_ref, nlu_ref, pipe_path, False)
+ # fallback pipe
+ pipe_components, uid = get_trained_component_list_for_nlp_pipe_ref('en', nlu_ref, nlu_ref, pipe_path, False)
for c in pipe_components: pipe.add(c, nlu_ref, pretrained_pipe_component=True)
return pipe



def load_nlu_pipe_from_hdd(pipe_path, request) -> NLUPipeline:
"""Either there is a pipeline of models in the path or just one singular model_anno_obj.
If it is a component_list, load the component_list and return it.
@@ -319,14 +321,15 @@ def load_nlu_pipe_from_hdd(pipe_path, request) -> NLUPipeline:
if is_running_in_databricks():
return load_nlu_pipe_from_hdd_in_databricks(pipe_path, request)
pipe = NLUPipeline()
+ pipe.nlu_ref = request
nlu_ref = request # pipe_path
if os.path.exists(pipe_path):

# Resource in path is a pipeline
if offline_utils.is_pipe(pipe_path):
# language, nlp_ref, nlu_ref,path=None, is_licensed=False
# todo deduct lang and if Licensed or not
- pipe_components = get_trained_component_list_for_nlp_pipe_ref('en', nlu_ref, nlu_ref, pipe_path, False)
+ pipe_components, uid = get_trained_component_list_for_nlp_pipe_ref('en', nlu_ref, nlu_ref, pipe_path, False)
# Resource in path is a single model_anno_obj
elif offline_utils.is_model(pipe_path):
c = offline_utils.verify_and_create_model(pipe_path)
@@ -337,7 +340,16 @@ def load_nlu_pipe_from_hdd(pipe_path, request) -> NLUPipeline:
print(
f"Could not load model_anno_obj in path {pipe_path}. Make sure the jsl_folder contains either a stages subfolder or a metadata subfolder.")
raise ValueError
- for c in pipe_components: pipe.add(c, nlu_ref, pretrained_pipe_component=True)
+ for c in pipe_components:
+     pipe.add(c, nlu_ref, pretrained_pipe_component=True)
+ if is_nlu_uid(uid):
+     data = json.loads(uid)
+     print(data)
+     pipe.nlu_ref = data['0']['nlu_ref']
+     for i, c in enumerate(pipe.components):
+         c.nlu_ref = data[str(i + 1)]['nlu_ref']
+         c.nlp_ref = data[str(i + 1)]['nlp_ref']
+         c.loaded_from_pretrained_pipe = data[str(i + 1)]['loaded_from_pretrained_pipe']
return pipe

else:
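Taken together, the changes to this file let load() recover pipeline metadata that the new save() in nlu/pipe/pipeline.py (further down in this diff) hides inside the Spark model UID as a JSON string; is_nlu_uid() detects such UIDs with a simple substring test. A minimal sketch of the decode step, assuming a UID produced by that save() path (the helper name decode_nlu_uid is illustrative, not part of the codebase):

    import json

    def decode_nlu_uid(uid: str):
        # After a json.dumps/json.loads round-trip all dict keys are strings,
        # hence data['0'] rather than data[0].
        if 'is_nlu_pipe' not in uid:  # same substring test as is_nlu_uid()
            return None, []
        data = json.loads(uid)
        pipe_ref = data['0']['nlu_ref']  # pipeline-level nlu_ref
        # keys '1'..'n' hold per-component refs, in pipeline order
        component_refs = [data[str(i)] for i in range(1, len(data) - 1)]
        return pipe_ref, component_refs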
16 changes: 11 additions & 5 deletions nlu/pipe/component_resolution.py
@@ -2,7 +2,7 @@
Contains methods used to resolve a NLU reference to a NLU component_to_resolve.
Handler for getting default components, etc.
'''
- from typing import Dict, List, Union, Optional, Callable
+ from typing import Dict, List, Union, Optional, Callable, Tuple

from pyspark.ml import PipelineModel, Pipeline
from sparknlp.pretrained import PretrainedPipeline, LightPipeline
@@ -155,7 +155,7 @@ def nlu_ref_to_component(nlu_ref, detect_lang=False, authenticated=False) -> Uni
lang, nlu_ref, nlp_ref, license_type, is_pipe, model_params = nlu_ref_to_nlp_metadata(nlu_ref)

if is_pipe:
- resolved_component = get_trained_component_list_for_nlp_pipe_ref(lang, nlp_ref, nlu_ref,
+ resolved_component, _ = get_trained_component_list_for_nlp_pipe_ref(lang, nlp_ref, nlu_ref,
license_type=license_type)
else:
resolved_component = get_trained_component_for_nlp_model_ref(lang, nlu_ref, nlp_ref, license_type, model_params)
@@ -179,7 +179,7 @@ def get_trainable_component_for_nlu_ref(nlu_ref) -> NluComponent:

def get_trained_component_list_for_nlp_pipe_ref(language, nlp_ref, nlu_ref, path=None,
license_type: LicenseType = Licenses.open_source,
- ) -> List[NluComponent]:
+ ) -> Tuple[List[NluComponent], str]:
"""
creates a list of components from a Spark NLP Pipeline reference
1. download pipeline
@@ -190,7 +190,9 @@ def get_trained_component_list_for_nlp_pipe_ref(language, nlp_ref, nlu_ref, path
:param language: language of the pipeline
:param nlp_ref: Reference to a spark nlp pretrained pipeline
:param path: Load component_list from HDD
- :return: Each element of the Spark NLP pipeline wrapped as a NLU component_to_resolve inside a list
+ :return: Tuple:
+     first element: each stage of the Spark NLP pipeline wrapped as a NLU component_to_resolve, in a list
+     second element: UID of the pipeline
"""
logger.info(f'Building pretrained pipe for nlu_ref={nlu_ref} nlp_ref={nlp_ref}')
if 'language' in nlp_ref:
@@ -199,16 +201,20 @@ def get_trained_component_list_for_nlp_pipe_ref(language, nlp_ref, nlu_ref, path
if path is None:
if license_type != Licenses.open_source:
pipe = PretrainedPipeline(nlp_ref, lang=language, remote_loc='clinical/models')
+ uid = pipe.model.uid

else:
pipe = PretrainedPipeline(nlp_ref, lang=language)
+ uid = pipe.model.uid
iterable_stages = pipe.light_model.pipeline_model.stages
else:
pipe = LightPipeline(PipelineModel.load(path=path))
+ uid = pipe.pipeline_model.uid
iterable_stages = pipe.pipeline_model.stages
constructed_components = get_component_list_for_iterable_stages(iterable_stages, language, nlp_ref, nlu_ref,
license_type)
return ComponentUtils.set_storage_ref_attribute_of_embedding_converters(
- PipeUtils.set_column_values_on_components_from_pretrained_pipe(constructed_components))
+ PipeUtils.set_column_values_on_components_from_pretrained_pipe(constructed_components)), uid



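Since get_trained_component_list_for_nlp_pipe_ref() now returns a tuple, every call site unpacks two values, as the hunks above and in nlu/__init__.py show. A hedged usage sketch (explain_document_dl / en.explain.dl are merely examples of a valid Spark NLP and NLU reference pair):

    # components: each pipeline stage wrapped as a NluComponent
    # uid: the Spark PipelineModel UID, which save() may later overwrite
    #      with JSON metadata that is_nlu_uid() can detect
    components, uid = get_trained_component_list_for_nlp_pipe_ref(
        'en', 'explain_document_dl', 'en.explain.dl')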
@@ -206,7 +206,6 @@ def extract_base_sparknlp_features(row: pd.Series, configs: SparkNLPExtractorCon

return {**beginnings, **endings, **results, **annotator_types, **embeddings, **origins} # Merge dicts NLP output


def extract_sparknlp_metadata(row: pd.Series, configs: SparkNLPExtractorConfig) -> dict:
"""
Extract base features common to all Spark NLP annotators
6 changes: 6 additions & 0 deletions nlu/pipe/nlu_component.py
@@ -72,6 +72,9 @@ def __init__(self,
trained_mirror_anno: Optional[JslAnnoId] = None,
applicable_file_types: List[str] = None, # Used for OCR annotators to deduct applicable file types
is_trained: bool = True, # Set to true for trainable annotators
+ requires_binary_format: bool = False,  # Set to True for OCR annotators that require raw binary input
+ requires_image_format: bool = False,  # Set to True for OCR annotators that require image input
+ is_visual_annotator: bool = False,  # Set to True for visual/OCR annotators
):
self.name = name
self.type = type
@@ -110,6 +113,9 @@ def __init__(self,
self.trained_mirror_anno = trained_mirror_anno
self.applicable_file_types = applicable_file_types
self.is_trained = is_trained
+ self.requires_binary_format = requires_binary_format
+ self.requires_image_format = requires_image_format
+ self.is_visual_annotator = is_visual_annotator

def set_metadata(self, jsl_anno_object: Union[AnnotatorApproach, AnnotatorModel],
nlu_ref: str,
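The three new constructor flags let visual/OCR annotators declare which input representation they need. A minimal stand-in showing only the new fields (the real NluComponent constructor takes many more arguments):

    from dataclasses import dataclass

    @dataclass
    class VisualAnnotatorFlags:
        requires_binary_format: bool = False  # reads raw binary file content
        requires_image_format: bool = False   # reads Spark image rows
        is_visual_annotator: bool = False     # marks an OCR/visual stage

    # e.g. an image-to-text annotator would declare:
    img2text_flags = VisualAnnotatorFlags(requires_image_format=True,
                                          is_visual_annotator=True)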
23 changes: 20 additions & 3 deletions nlu/pipe/pipeline.py
@@ -1,3 +1,4 @@
+ import json
import logging
from typing import Union

@@ -57,7 +58,8 @@ def __init__(self):
self.has_span_classifiers = False
self.prefer_light = False
self.has_table_qa_models = False

+ self.requires_image_format = False
+ self.requires_binary_format = False
def add(self, component: NluComponent, nlu_reference=None, pretrained_pipe_component=False,
name_to_add='', idx=None):
'''
@@ -418,11 +420,26 @@ def drop_irrelevant_cols(self, cols, keep_origin_index=False):
if keep_origin_index == False and 'origin_index' in cols: cols.remove('origin_index')
return cols

- def save(self, path, component='entire_pipeline', overwrite=False):
+ def save(self, path, component='entire_pipeline', overwrite=True):
+     # serialize data
+     data = {}
+     data[0] = {'nlu_ref': self.nlu_ref}
+     data['is_nlu_pipe'] = True
+     for i, c in enumerate(self.components):
+         data[i + 1] = {'nlu_ref': c.nlu_ref, 'nlp_ref': c.nlp_ref,
+                        'loaded_from_pretrained_pipe': c.loaded_from_pretrained_pipe}
+
+     data = json.dumps(data)
if not self.is_fitted or not hasattr(self, 'vanilla_transformer_pipe'):
self.fit()
self.is_fitted = True

# self.vanilla_transformer_pipe.extractParamMap()
+ if hasattr(self, 'nlu_ref'):
+     """ ATTRS TO SAVE FOR EACH COMPONENT / PIPELINE:
+     - nlp ref / nlu ref
+     - is loaded_from_pipe
+     """
+     self.vanilla_transformer_pipe._resetUid(data)
if component == 'entire_pipeline':
if overwrite:
self.vanilla_transformer_pipe.write().overwrite().save(path)
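This is the save-side counterpart of the load logic in nlu/__init__.py: pipe- and component-level refs are serialized to JSON and written into the PipelineModel's UID via PySpark's _resetUid. A condensed sketch of the flow (embed_nlu_metadata is an illustrative name; note that _resetUid is a private PySpark API, so this persistence trick depends on Spark's current UID handling):

    import json

    def embed_nlu_metadata(spark_pipe, nlu_ref, components):
        # json.dumps stringifies the integer keys, so the loader reads data['0']
        data = {0: {'nlu_ref': nlu_ref}, 'is_nlu_pipe': True}
        for i, c in enumerate(components):
            data[i + 1] = {'nlu_ref': c.nlu_ref, 'nlp_ref': c.nlp_ref,
                           'loaded_from_pretrained_pipe': c.loaded_from_pretrained_pipe}
        spark_pipe._resetUid(json.dumps(data))  # the UID now doubles as a metadata store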
7 changes: 6 additions & 1 deletion nlu/pipe/utils/pipe_utils.py
@@ -667,8 +667,13 @@ def add_metadata_to_pipe(pipe: NLUPipeline):

for c in pipe.components:
# Check for OCR components
- if c.jsl_anno_py_class in py_class_to_anno_id.keys():
+ if c.jsl_anno_py_class in py_class_to_anno_id.keys() or c.is_visual_annotator:
      pipe.contains_ocr_components = True
+ if c.requires_image_format:
+     pipe.requires_image_format = True
+ if c.requires_binary_format:
+     pipe.requires_binary_format = True

# Check for licensed components
if c.license in [Licenses.ocr, Licenses.hc]:
pipe.has_licensed_components = True
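The flags lifted onto the pipeline here are presumably consumed when NLU decides how to feed input files to a fitted pipe. A hypothetical downstream check (the function and reader names are assumptions for illustration, not part of this diff):

    def choose_reader(pipe):
        # Prefer image rows when any annotator needs them, else raw bytes
        if pipe.requires_image_format:
            return 'spark_image_reader'
        if pipe.requires_binary_format:
            return 'binary_file_reader'
        return 'text_reader'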