
Commit db3252e

Monkeypatch bigquery and gcs at module load time to avoid always… (#590)

* Monkeypatch bigquery and gcs at module load time to avoid always loading bigquery/gcs and all its dependencies.
* Nit fixes
* Change path checking method.

1 parent 6a11b97 commit db3252e

File tree

4 files changed: +141, -88 lines changed

patches/kaggle_gcp.py
patches/sitecustomize.py
tests/test_bigquery.py
tests/test_gcs.py
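Why this matters, per the commit message: google.cloud.bigquery and google.cloud.storage (and their large dependency trees) are no longer imported eagerly when the interpreter starts; they are loaded and patched only when user code actually imports them. A minimal sketch of how that deferral could be observed from a kernel, assuming the new sitecustomize.py hook below is installed and nothing else has imported the library yet:

    import sys

    # Nothing under google.cloud has been pulled in just by starting the interpreter.
    assert "google.cloud.bigquery" not in sys.modules

    # The first real import is intercepted by GcpModuleFinder (patches/sitecustomize.py),
    # which calls kaggle_gcp.init_bigquery() and hands back the patched module.
    from google.cloud import bigquery
    assert "google.cloud.bigquery" in sys.modules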

patches/kaggle_gcp.py

Lines changed: 87 additions & 0 deletions

@@ -110,3 +110,90 @@ def __init__(self, *args, **kwargs):
         )
         # TODO: Remove this once https://github.com/googleapis/google-cloud-python/issues/7122 is implemented.
         self._connection = _DataProxyConnection(self)
+
+
+def init_bigquery():
+    from google.auth import environment_vars
+    from google.cloud import bigquery
+
+    is_proxy_token_set = "KAGGLE_DATA_PROXY_TOKEN" in os.environ
+    is_user_secrets_token_set = "KAGGLE_USER_SECRETS_TOKEN" in os.environ
+    if not (is_proxy_token_set or is_user_secrets_token_set):
+        return bigquery
+
+    # If this Kernel has bigquery integration on startup, preload the Kaggle Credentials
+    # object for magics to work.
+    if get_integrations().has_bigquery():
+        from google.cloud.bigquery import magics
+        magics.context.credentials = KaggleKernelCredentials()
+
+    def monkeypatch_bq(bq_client, *args, **kwargs):
+        from kaggle_gcp import get_integrations, PublicBigqueryClient, KaggleKernelCredentials
+        specified_credentials = kwargs.get('credentials')
+        has_bigquery = get_integrations().has_bigquery()
+        # Prioritize passed in project id, but if it is missing look for env var.
+        arg_project = kwargs.get('project')
+        explicit_project_id = arg_project or os.environ.get(environment_vars.PROJECT)
+        # This is a hack to get around the bug in google-cloud library.
+        # Remove these two lines once this is resolved:
+        # https://github.com/googleapis/google-cloud-python/issues/8108
+        if explicit_project_id:
+            Log.info(f"Explicit project set to {explicit_project_id}")
+            kwargs['project'] = explicit_project_id
+        if explicit_project_id is None and specified_credentials is None and not has_bigquery:
+            msg = "Using Kaggle's public dataset BigQuery integration."
+            Log.info(msg)
+            print(msg)
+            return PublicBigqueryClient(*args, **kwargs)
+
+        else:
+            if specified_credentials is None:
+                Log.info("No credentials specified, using KaggleKernelCredentials.")
+                kwargs['credentials'] = KaggleKernelCredentials()
+            if (not has_bigquery):
+                Log.info("No bigquery integration found, creating client anyways.")
+                print('Please ensure you have selected a BigQuery '
+                      'account in the Kernels Settings sidebar.')
+            return bq_client(*args, **kwargs)
+
+    # Monkey patches BigQuery client creation to use proxy or user-connected GCP account.
+    # Deprecated in favor of Kaggle.DataProxyClient().
+    # TODO: Remove this once uses have migrated to that new interface.
+    bq_client = bigquery.Client
+    bigquery.Client = lambda *args, **kwargs: monkeypatch_bq(
+        bq_client, *args, **kwargs)
+    return bigquery
+
+def init_gcs():
+    is_user_secrets_token_set = "KAGGLE_USER_SECRETS_TOKEN" in os.environ
+    from google.cloud import storage
+    if not is_user_secrets_token_set:
+        return storage
+
+    from kaggle_gcp import get_integrations
+    if not get_integrations().has_gcs():
+        return storage
+
+    from kaggle_secrets import GcpTarget
+    from kaggle_gcp import KaggleKernelCredentials
+    def monkeypatch_gcs(gcs_client, *args, **kwargs):
+        specified_credentials = kwargs.get('credentials')
+        if specified_credentials is None:
+            Log.info("No credentials specified, using KaggleKernelCredentials.")
+            kwargs['credentials'] = KaggleKernelCredentials(target=GcpTarget.GCS)
+        return gcs_client(*args, **kwargs)
+
+    gcs_client = storage.Client
+    storage.Client = lambda *args, **kwargs: monkeypatch_gcs(gcs_client, *args, **kwargs)
+    return storage
+
+def init():
+    init_bigquery()
+    init_gcs()
+
+# We need to initialize the monkeypatching of the client libraries
+# here since there is a circular dependency between our import hook version
+# google.cloud.* and kaggle_gcp. By calling init here, we guarantee
+# that regardless of the original import that caused google.cloud.* to be
+# loaded, the monkeypatching will be done.
+init()
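For reference, a hedged usage sketch of the new kaggle_gcp.init_bigquery() entry point. It assumes the code runs inside a Kaggle kernel with KAGGLE_USER_SECRETS_TOKEN or KAGGLE_DATA_PROXY_TOKEN set; the project id is hypothetical:

    import kaggle_gcp

    # init_bigquery() returns the google.cloud.bigquery module with bigquery.Client
    # replaced by the monkeypatch_bq wrapper defined above.
    bigquery = kaggle_gcp.init_bigquery()

    # A plain constructor call now picks up KaggleKernelCredentials (or falls back
    # to the public-dataset proxy client) without the caller passing credentials.
    client = bigquery.Client(project="my-example-project")  # hypothetical project id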

patches/sitecustomize.py

Lines changed: 48 additions & 78 deletions

@@ -2,81 +2,51 @@
 
 from log import Log
 
-kaggle_proxy_token = os.getenv("KAGGLE_DATA_PROXY_TOKEN")
-kernel_integrations_var = os.getenv("KAGGLE_KERNEL_INTEGRATIONS")
-
-def init():
-    is_jwe_set = "KAGGLE_USER_SECRETS_TOKEN" in os.environ
-    if kaggle_proxy_token or is_jwe_set:
-        init_bigquery()
-    if is_jwe_set:
-        from kaggle_gcp import get_integrations
-        if get_integrations().has_gcs():
-            init_gcs()
-
-
-def init_bigquery():
-    from google.auth import environment_vars
-    from google.cloud import bigquery
-    # TODO: Update this to the correct kaggle.gcp path once we no longer inject modules
-    # from the worker.
-    from kaggle_gcp import get_integrations, PublicBigqueryClient, KaggleKernelCredentials
-
-    # If this Kernel has bigquery integration on startup, preload the Kaggle Credentials
-    # object for magics to work.
-    if get_integrations().has_bigquery():
-        from google.cloud.bigquery import magics
-        magics.context.credentials = KaggleKernelCredentials()
-
-    def monkeypatch_bq(bq_client, *args, **kwargs):
-        specified_credentials = kwargs.get('credentials')
-        has_bigquery = get_integrations().has_bigquery()
-        # Prioritize passed in project id, but if it is missing look for env var.
-        arg_project = kwargs.get('project')
-        explicit_project_id = arg_project or os.environ.get(environment_vars.PROJECT)
-        # This is a hack to get around the bug in google-cloud library.
-        # Remove these two lines once this is resolved:
-        # https://github.com/googleapis/google-cloud-python/issues/8108
-        if explicit_project_id:
-            Log.info(f"Explicit project set to {explicit_project_id}")
-            kwargs['project'] = explicit_project_id
-        if explicit_project_id is None and specified_credentials is None and not has_bigquery:
-            msg = "Using Kaggle's public dataset BigQuery integration."
-            Log.info(msg)
-            print(msg)
-            return PublicBigqueryClient(*args, **kwargs)
-
-        else:
-            if specified_credentials is None:
-                Log.info("No credentials specified, using KaggleKernelCredentials.")
-                kwargs['credentials'] = KaggleKernelCredentials()
-            if (not has_bigquery):
-                Log.info("No bigquery integration found, creating client anyways.")
-                print('Please ensure you have selected a BigQuery '
-                      'account in the Kernels Settings sidebar.')
-            return bq_client(*args, **kwargs)
-
-    # Monkey patches BigQuery client creation to use proxy or user-connected GCP account.
-    # Deprecated in favor of Kaggle.DataProxyClient().
-    # TODO: Remove this once uses have migrated to that new interface.
-    bq_client = bigquery.Client
-    bigquery.Client = lambda *args, **kwargs: monkeypatch_bq(
-        bq_client, *args, **kwargs)
-
-
-def init_gcs():
-    from kaggle_secrets import GcpTarget
-    from kaggle_gcp import KaggleKernelCredentials
-    from google.cloud import storage
-    def monkeypatch_gcs(gcs_client, *args, **kwargs):
-        specified_credentials = kwargs.get('credentials')
-        if specified_credentials is None:
-            Log.info("No credentials specified, using KaggleKernelCredentials.")
-            kwargs['credentials'] = KaggleKernelCredentials(target=GcpTarget.GCS)
-        return gcs_client(*args, **kwargs)
-
-    gcs_client = storage.Client
-    storage.Client = lambda *args, **kwargs: monkeypatch_gcs(gcs_client, *args, **kwargs)
-
-
-init()
+import sys
+import importlib
+import importlib.machinery
+
+class GcpModuleFinder(importlib.abc.MetaPathFinder):
+    _MODULES = ['google.cloud.bigquery', 'google.cloud.storage']
+    _KAGGLE_GCP_PATH = 'kaggle_gcp.py'
+    def __init__(self):
+        pass
+
+    def _is_called_from_kaggle_gcp(self):
+        import inspect
+        for frame in inspect.stack():
+            if os.path.basename(frame.filename) == self._KAGGLE_GCP_PATH:
+                return True
+        return False
+
+    def find_spec(self, fullname, path, target=None):
+        if fullname in self._MODULES:
+            # If being called from kaggle_gcp, don't return our
+            # monkeypatched module to avoid circular dependency,
+            # since we call kaggle_gcp to load the module.
+            if self._is_called_from_kaggle_gcp():
+                return None
+            return importlib.machinery.ModuleSpec(fullname, GcpModuleLoader())
+
+
+class GcpModuleLoader(importlib.abc.Loader):
+    def __init__(self):
+        pass
+
+    def create_module(self, spec):
+        """Create the gcp module from the spec.
+        """
+        import kaggle_gcp
+        _LOADERS = {
+            'google.cloud.bigquery': kaggle_gcp.init_bigquery,
+            'google.cloud.storage': kaggle_gcp.init_gcs
+        }
+        monkeypatch_gcp_module = _LOADERS[spec.name]()
+        return monkeypatch_gcp_module
+
+    def exec_module(self, module):
+        pass
+
+
+if not hasattr(sys, 'frozen'):
+    sys.meta_path.insert(0, GcpModuleFinder())
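The finder/loader pair added above follows the standard importlib meta-path protocol (PEP 451). For readers unfamiliar with that API, here is a self-contained sketch of the same pattern using a hypothetical module name instead of google.cloud.*:

    import importlib.abc
    import importlib.machinery
    import sys
    import types


    class LazyDemoFinder(importlib.abc.MetaPathFinder):
        # Intercept only our hypothetical module; returning None defers
        # everything else to the normal import machinery.
        def find_spec(self, fullname, path, target=None):
            if fullname == "lazy_demo":
                return importlib.machinery.ModuleSpec(fullname, LazyDemoLoader())
            return None


    class LazyDemoLoader(importlib.abc.Loader):
        def create_module(self, spec):
            # Build (or patch) the module only when it is first imported.
            module = types.ModuleType(spec.name)
            module.answer = 42
            return module

        def exec_module(self, module):
            pass  # create_module already did the work


    sys.meta_path.insert(0, LazyDemoFinder())

    import lazy_demo          # resolved by LazyDemoFinder, not the filesystem
    print(lazy_demo.answer)   # 42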

tests/test_bigquery.py

Lines changed: 3 additions & 5 deletions

@@ -11,7 +11,7 @@
 from google.cloud import bigquery
 from google.auth.exceptions import DefaultCredentialsError
 from google.cloud.bigquery._http import Connection
-from kaggle_gcp import KaggleKernelCredentials, PublicBigqueryClient, _DataProxyConnection
+from kaggle_gcp import KaggleKernelCredentials, PublicBigqueryClient, _DataProxyConnection, init_bigquery
 import kaggle_secrets
 
 
@@ -149,8 +149,7 @@ def test_magics_with_connected_account_default_credentials(self):
         env.set('KAGGLE_USER_SECRETS_TOKEN', 'foobar')
         env.set('KAGGLE_KERNEL_INTEGRATIONS', 'BIGQUERY')
         with env:
-            import sitecustomize
-            sitecustomize.init()
+            init_bigquery()
             from google.cloud.bigquery import magics
             self.assertEqual(type(magics.context._credentials), KaggleKernelCredentials)
             magics.context.credentials = None
@@ -159,7 +158,6 @@ def test_magics_without_connected_account(self):
         env = EnvironmentVarGuard()
         env.set('KAGGLE_USER_SECRETS_TOKEN', 'foobar')
         with env:
-            import sitecustomize
-            sitecustomize.init()
+            init_bigquery()
             from google.cloud.bigquery import magics
             self.assertIsNone(magics.context._credentials)

tests/test_gcs.py

Lines changed: 3 additions & 5 deletions

@@ -2,7 +2,7 @@
 
 from unittest.mock import Mock
 
-from kaggle_gcp import KaggleKernelCredentials
+from kaggle_gcp import KaggleKernelCredentials, init_gcs
 from test.support import EnvironmentVarGuard
 from google.cloud import storage
 
@@ -21,8 +21,7 @@ def test_ctr(self):
         env.set('KAGGLE_USER_SECRETS_TOKEN', 'foobar')
         env.set('KAGGLE_KERNEL_INTEGRATIONS', 'GCS')
         with env:
-            from sitecustomize import init
-            init()
+            init_gcs()
             client = storage.Client(project="xyz", credentials=credentials)
             self.assertEqual(client.project, "xyz")
             self.assertNotIsInstance(client._credentials, KaggleKernelCredentials)
@@ -38,7 +37,6 @@ def test_default_credentials_gcs_enabled(self):
         env.set('KAGGLE_USER_SECRETS_TOKEN', 'foobar')
         env.set('KAGGLE_KERNEL_INTEGRATIONS', 'GCS')
         with env:
-            from sitecustomize import init
-            init()
+            init_gcs()
             client = storage.Client(project="xyz")
             self.assertIsInstance(client._credentials, KaggleKernelCredentials)
