Merge pull request #265 from JohnSnowLabs/release/532
Release/532
C-K-Loan committed May 21, 2024
2 parents 76161f0 + 8564e7a commit 506860e
Showing 6 changed files with 24 additions and 27 deletions.
4 changes: 2 additions & 2 deletions nlu/__init__.py
@@ -1,4 +1,4 @@
-__version__ = '5.3.1'
+__version__ = '5.3.2'


import nlu.utils.environment.env_utils as env_utils
@@ -325,7 +325,7 @@ def load_nlu_pipe_from_hdd(pipe_path, request) -> NLUPipeline:
If it is a component_list, load the component_list and return it.
If it is a singular model_anno_obj, load it to the correct AnnotatorClass and NLU component_to_resolve and then generate pipeline for it
"""
-if is_running_in_databricks():
+if is_running_in_databricks_runtime():
return load_nlu_pipe_from_hdd_in_databricks(pipe_path, request)
pipe = NLUPipeline()
pipe.nlu_ref = request
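For readers following along locally, a one-liner (purely illustrative, not part of the diff) confirms which of the two versions above is installed:

import nlu
print(nlu.__version__)   # expected to print '5.3.2' once this release is installed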
4 changes: 2 additions & 2 deletions nlu/pipe/pipeline.py
@@ -18,7 +18,7 @@
from nlu.pipe.utils.resolution.storage_ref_utils import StorageRefUtils
from nlu.universe.feature_node_ids import NLP_NODE_IDS
from nlu.universe.universes import Licenses
-from nlu.utils.environment.env_utils import is_running_in_databricks, try_import_streamlit
+from nlu.utils.environment.env_utils import is_running_in_databricks_runtime, try_import_streamlit

logger = logging.getLogger('nlu')

@@ -608,7 +608,7 @@ def viz(self, text_to_viz: str, viz_type='', labels_to_viz=None, viz_colors={},
from nlu.utils.environment.env_utils import install_and_import_package
install_and_import_package('spark-nlp-display', import_name='sparknlp_display')
if self.vanilla_transformer_pipe is None: self.fit()
-is_databricks_env = is_running_in_databricks()
+is_databricks_env = is_running_in_databricks_runtime()
if return_html: is_databricks_env = True
# self.configure_light_pipe_usage(1, force=True)
from nlu.pipe.viz.vis_utils import VizUtils
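The viz() change above only swaps the runtime check. For context, here is a minimal usage sketch, assuming a standard NLU install where spark-nlp-display can be installed on demand; the parameter names are taken from the signature visible in this hunk and may differ in other NLU versions:

import nlu

pipe = nlu.load('ner')
# Outside a notebook, return_html=True forces the same branch that the Databricks
# runtime check enables and returns the rendered visualization as a raw HTML string.
html = pipe.viz('John Snow Labs is based in Delaware.', viz_type='ner', return_html=True)
with open('ner_viz.html', 'w') as f:
    f.write(html)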
12 changes: 6 additions & 6 deletions nlu/pipe/utils/audio_data_conversion_utils.py
@@ -6,7 +6,7 @@
import numpy as np
import pandas as pd
import pyspark
-from johnsnowlabs.utils.env_utils import is_running_in_databricks
+from johnsnowlabs.utils.env_utils import is_running_in_databricks_runtime
from pyspark.sql.types import *

logger = logging.getLogger('nlu')
@@ -34,7 +34,7 @@ def validate_paths(data):
@staticmethod
def check_iterable_paths_are_valid(iterable_paths):
"""Validate for iterable data input if all elements point to file or jsl_folder"""
-if is_running_in_databricks():
+if is_running_in_databricks_runtime():
iterable_paths = [f'/dbfs{p}' for p in iterable_paths]
paths_validness = []
for p in iterable_paths:
@@ -86,18 +86,18 @@ def glob_files_of_accepted_type(paths, file_types):
1. paths point to a file which is suffixed with one of the accepted file_types, i.e. path/to/file.type
2. path points to a jsl_folder, in this case jsl_folder is recursively searched for valid files and accepted paths will be in return result
"""
-if is_running_in_databricks():
+if is_running_in_databricks_runtime():
paths = [f'/dbfs{p}' for p in paths]
accepted_file_paths = []
for p in paths:
for t in file_types:
t = t.lower()
-if os.path.isfile(p) or is_running_in_databricks() and os.path.isfile(f'/dbfs{p}'):
+if os.path.isfile(p) or is_running_in_databricks_runtime() and os.path.isfile(f'/dbfs{p}'):
if p.lower().split('.')[-1] == t:
-if is_running_in_databricks():
+if is_running_in_databricks_runtime():
p = p.replace('/dbfs', '', 1)
accepted_file_paths.append(p)
-elif os.path.isdir(p) or is_running_in_databricks() and os.path.isdir(f'/dbfs{p}'):
+elif os.path.isdir(p) or is_running_in_databricks_runtime() and os.path.isdir(f'/dbfs{p}'):
accepted_file_paths += glob.glob(p + f'/**/*.{t}', recursive=True)
else:
print(f"Invalid path = {p} pointing neither to file or jsl_folder on this machine")
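The audio utils above repeatedly pair a plain path with its /dbfs-prefixed twin because, on a Databricks cluster, DBFS paths are also exposed on the driver's local filesystem under /dbfs. The stripped-down sketch below illustrates that probing pattern; resolve_local_path is a hypothetical helper, not NLU code, and only the /dbfs handling mirrors the diff:

import os

def is_running_in_databricks_runtime():
    return "DATABRICKS_RUNTIME_VERSION" in os.environ

def resolve_local_path(p: str):
    """Return p if it (or its /dbfs twin on Databricks) exists locally, else None."""
    if os.path.exists(p):
        return p
    if is_running_in_databricks_runtime() and os.path.exists(f'/dbfs{p}'):
        # The file is reachable via the FUSE mount; return the original spelling,
        # mirroring the p.replace('/dbfs', '', 1) step in the utils above.
        return p
    return None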
12 changes: 6 additions & 6 deletions nlu/pipe/utils/ocr_data_conversion_utils.py
@@ -7,7 +7,7 @@
import numpy as np
import pandas as pd
import pyspark
-from johnsnowlabs.utils.env_utils import is_running_in_databricks
+from johnsnowlabs.utils.env_utils import is_running_in_databricks_runtime

logger = logging.getLogger('nlu')

@@ -32,7 +32,7 @@ def validate_OCR_compatible_inputs(data):
@staticmethod
def check_iterable_paths_are_valid(iterable_paths):
"""Validate for iterable data input if all elements point to file or jsl_folder"""
-if is_running_in_databricks():
+if is_running_in_databricks_runtime():
iterable_paths = [f'/dbfs{p}' for p in iterable_paths]
paths_validness = []
for p in iterable_paths:
@@ -58,18 +58,18 @@ def glob_files_of_accepted_type(paths, file_types):
2. path points to a jsl_folder, in this case jsl_folder is recurisvely searched for valid files and accepted paths will be in return result
"""

-if is_running_in_databricks():
+if is_running_in_databricks_runtime():
paths = [f'/dbfs{p}' for p in paths]
accepted_file_paths = []
for p in paths:
for t in file_types:
t = t.lower()
-if os.path.isfile(p) or is_running_in_databricks() and os.path.isfile(f'/dbfs{p}'):
+if os.path.isfile(p) or is_running_in_databricks_runtime() and os.path.isfile(f'/dbfs{p}'):
if p.lower().split('.')[-1] == t:
-if is_running_in_databricks():
+if is_running_in_databricks_runtime():
p = p.replace('/dbfs', '', 1)
accepted_file_paths.append(p)
-elif os.path.isdir(p) or is_running_in_databricks() and os.path.isdir(f'/dbfs{p}'):
+elif os.path.isdir(p) or is_running_in_databricks_runtime() and os.path.isdir(f'/dbfs{p}'):
accepted_file_paths += glob.glob(p + f'/*.{t.upper()}', recursive=True) + glob.glob(p + f'/*.{t}',
recursive=True)
else:
8 changes: 4 additions & 4 deletions nlu/pipe/utils/pipe_utils.py
@@ -18,7 +18,7 @@
from nlu.pipe.utils.component_utils import ComponentUtils
from typing import List, Union, Dict
from nlu.universe.annotator_class_universe import AnnoClassRef
-from nlu.utils.environment.env_utils import is_running_in_databricks
+from nlu.utils.environment.env_utils import is_running_in_databricks_runtime
import os
import glob
import json
@@ -140,12 +140,12 @@ def set_column_values_on_components_from_pretrained_pipe_from_disk_data(componen
pipe_path = glob.glob(f'{pipe_path}*')
if len(pipe_path) == 0:
# try databricks env path
-if is_running_in_databricks():
+if is_running_in_databricks_runtime():
pipe_path = [f'dbfs:/root/cache_pretrained/{nlp_ref}_{lang}']
else:
raise FileNotFoundError(f"Could not find downloaded Pipeline at path={pipe_path}")
pipe_path = pipe_path[0]
-if not os.path.exists(pipe_path) and not is_running_in_databricks():
+if not os.path.exists(pipe_path) and not is_running_in_databricks_runtime():
raise FileNotFoundError(f"Could not find downloaded Pipeline at path={pipe_path}")

# Find HDD location of component_list and read out input/output cols
@@ -155,7 +155,7 @@ def set_column_values_on_components_from_pretrained_pipe_from_disk_data(componen

for c in component_list:
model_name = c.model.uid.split('_')[0]
-if is_running_in_databricks():
+if is_running_in_databricks_runtime():
data = PipeUtils.get_json_data_for_pipe_model_at_stage_number_on_databricks(nlp_ref, lang, digit_str)
else:
data = PipeUtils.get_json_data_for_pipe_model_at_stage_number(pipe_path, digit_str)
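For orientation, the hunk above resolves the on-disk location of a pretrained pipeline and, when nothing matches locally, falls back to the DBFS cache path. A hedged sketch of that lookup follows; the function name and cache_dir argument are illustrative, and only the dbfs:/root/cache_pretrained layout and the {nlp_ref}_{lang} naming come from the diff:

import glob
import os

def find_pretrained_pipe_path(cache_dir: str, nlp_ref: str, lang: str) -> str:
    # Look for a locally downloaded pipeline first.
    matches = glob.glob(f'{cache_dir}/{nlp_ref}_{lang}*')
    if not matches:
        if "DATABRICKS_RUNTIME_VERSION" in os.environ:
            # On a Databricks cluster the download is cached on DBFS, not the driver disk.
            return f'dbfs:/root/cache_pretrained/{nlp_ref}_{lang}'
        raise FileNotFoundError(f"Could not find downloaded Pipeline at path={cache_dir}/{nlp_ref}_{lang}")
    return matches[0]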
11 changes: 4 additions & 7 deletions nlu/utils/environment/env_utils.py
@@ -84,13 +84,10 @@ def try_import_streamlit():
print("You need to install Streamlit to run this functionality.")


-def is_running_in_databricks():
-    """ Check if the currently running Python Process is running in Databricks or not
-    If any Environment Variable name contains 'DATABRICKS' this will return True, otherwise False"""
-    for k in os.environ.keys():
-        if 'DATABRICKS' in k:
-            return True
-    return False
+def is_running_in_databricks_runtime():
+    """ Check if the currently running Python Process is running in Databricks runtime or not
+    """
+    return "DATABRICKS_RUNTIME_VERSION" in os.environ


def install_and_import_package(pkg_name, version='', import_name=''):
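The net effect of the env_utils change is a stricter detection rule: the old helper treated any environment variable whose name contains 'DATABRICKS' as proof of a Databricks environment, while the new one keys on DATABRICKS_RUNTIME_VERSION, which the Databricks runtime sets on cluster nodes. The self-contained comparison below is illustrative only; the simulated DATABRICKS_TOKEN stands in for locally configured Databricks CLI credentials that would have tripped the old check:

import os

def old_check():
    # Pre-5.3.2 behaviour: any env var name containing 'DATABRICKS' counts.
    return any('DATABRICKS' in k for k in os.environ)

def new_check():
    # 5.3.2 behaviour: only the variable set by the Databricks runtime itself counts.
    return "DATABRICKS_RUNTIME_VERSION" in os.environ

if __name__ == '__main__':
    os.environ.setdefault('DATABRICKS_TOKEN', 'dapi-example')  # simulate local CLI credentials
    print(old_check())   # True  -> false positive on a laptop with CLI credentials exported
    print(new_check())   # False -> unless actually running inside a Databricks runtime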
