
Commit db3252e

Monkeypatch bigquery and gcs at module load time to avoid always… (#590)

* Monkeypatch bigquery and gcs at module load time to avoid always loading bigquery/gcs and all its dependencies.
* Nit fixes
* Change path checking method.

1 parent 6a11b97 commit db3252e

File tree

4 files changed: +141, -88 lines changed

patches/kaggle_gcp.py
patches/sitecustomize.py
tests/test_bigquery.py
tests/test_gcs.py
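Why this matters, per the commit message: google.cloud.bigquery and google.cloud.storage (and their large dependency trees) are no longer imported eagerly when the interpreter starts; they are loaded and patched only when user code actually imports them. A minimal sketch of how that deferral could be observed from a kernel, assuming the new sitecustomize.py hook below is installed and nothing else has imported the library yet:

    import sys

    # Nothing under google.cloud has been pulled in just by starting the interpreter.
    assert "google.cloud.bigquery" not in sys.modules

    # The first real import is intercepted by GcpModuleFinder (patches/sitecustomize.py),
    # which calls kaggle_gcp.init_bigquery() and hands back the patched module.
    from google.cloud import bigquery
    assert "google.cloud.bigquery" in sys.modules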

patches/kaggle_gcp.py

Lines changed: 87 additions & 0 deletions

@@ -110,3 +110,90 @@ def __init__(self, *args, **kwargs):
         )
         # TODO: Remove this once https://github.com/googleapis/google-cloud-python/issues/7122 is implemented.
         self._connection = _DataProxyConnection(self)
+
+
+def init_bigquery():
+    from google.auth import environment_vars
+    from google.cloud import bigquery
+
+    is_proxy_token_set = "KAGGLE_DATA_PROXY_TOKEN" in os.environ
+    is_user_secrets_token_set = "KAGGLE_USER_SECRETS_TOKEN" in os.environ
+    if not (is_proxy_token_set or is_user_secrets_token_set):
+        return bigquery
+
+    # If this Kernel has bigquery integration on startup, preload the Kaggle Credentials
+    # object for magics to work.
+    if get_integrations().has_bigquery():
+        from google.cloud.bigquery import magics
+        magics.context.credentials = KaggleKernelCredentials()
+
+    def monkeypatch_bq(bq_client, *args, **kwargs):
+        from kaggle_gcp import get_integrations, PublicBigqueryClient, KaggleKernelCredentials
+        specified_credentials = kwargs.get('credentials')
+        has_bigquery = get_integrations().has_bigquery()
+        # Prioritize passed in project id, but if it is missing look for env var.
+        arg_project = kwargs.get('project')
+        explicit_project_id = arg_project or os.environ.get(environment_vars.PROJECT)
+        # This is a hack to get around the bug in google-cloud library.
+        # Remove these two lines once this is resolved:
+        # https://github.com/googleapis/google-cloud-python/issues/8108
+        if explicit_project_id:
+            Log.info(f"Explicit project set to {explicit_project_id}")
+            kwargs['project'] = explicit_project_id
+        if explicit_project_id is None and specified_credentials is None and not has_bigquery:
+            msg = "Using Kaggle's public dataset BigQuery integration."
+            Log.info(msg)
+            print(msg)
+            return PublicBigqueryClient(*args, **kwargs)
+
+        else:
+            if specified_credentials is None:
+                Log.info("No credentials specified, using KaggleKernelCredentials.")
+                kwargs['credentials'] = KaggleKernelCredentials()
+            if (not has_bigquery):
+                Log.info("No bigquery integration found, creating client anyways.")
+                print('Please ensure you have selected a BigQuery '
+                      'account in the Kernels Settings sidebar.')
+            return bq_client(*args, **kwargs)
+
+    # Monkey patches BigQuery client creation to use proxy or user-connected GCP account.
+    # Deprecated in favor of Kaggle.DataProxyClient().
+    # TODO: Remove this once uses have migrated to that new interface.
+    bq_client = bigquery.Client
+    bigquery.Client = lambda *args, **kwargs: monkeypatch_bq(
+        bq_client, *args, **kwargs)
+    return bigquery
+
+def init_gcs():
+    is_user_secrets_token_set = "KAGGLE_USER_SECRETS_TOKEN" in os.environ
+    from google.cloud import storage
+    if not is_user_secrets_token_set:
+        return storage
+
+    from kaggle_gcp import get_integrations
+    if not get_integrations().has_gcs():
+        return storage
+
+    from kaggle_secrets import GcpTarget
+    from kaggle_gcp import KaggleKernelCredentials
+    def monkeypatch_gcs(gcs_client, *args, **kwargs):
+        specified_credentials = kwargs.get('credentials')
+        if specified_credentials is None:
+            Log.info("No credentials specified, using KaggleKernelCredentials.")
+            kwargs['credentials'] = KaggleKernelCredentials(target=GcpTarget.GCS)
+        return gcs_client(*args, **kwargs)
+
+    gcs_client = storage.Client
+    storage.Client = lambda *args, **kwargs: monkeypatch_gcs(gcs_client, *args, **kwargs)
+    return storage
+
+def init():
+    init_bigquery()
+    init_gcs()
+
+# We need to initialize the monkeypatching of the client libraries
+# here since there is a circular dependency between our import hook version
+# google.cloud.* and kaggle_gcp. By calling init here, we guarantee
+# that regardless of the original import that caused google.cloud.* to be
+# loaded, the monkeypatching will be done.
+init()
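For reference, a hedged usage sketch of the new kaggle_gcp.init_bigquery() entry point. It assumes the code runs inside a Kaggle kernel with KAGGLE_USER_SECRETS_TOKEN or KAGGLE_DATA_PROXY_TOKEN set; the project id is hypothetical:

    import kaggle_gcp

    # init_bigquery() returns the google.cloud.bigquery module with bigquery.Client
    # replaced by the monkeypatch_bq wrapper defined above.
    bigquery = kaggle_gcp.init_bigquery()

    # A plain constructor call now picks up KaggleKernelCredentials (or falls back
    # to the public-dataset proxy client) without the caller passing credentials.
    client = bigquery.Client(project="my-example-project")  # hypothetical project id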

patches/sitecustomize.py

Lines changed: 48 additions & 78 deletions

@@ -2,81 +2,51 @@
 
 from log import Log
 
-kaggle_proxy_token = os.getenv("KAGGLE_DATA_PROXY_TOKEN")
-kernel_integrations_var = os.getenv("KAGGLE_KERNEL_INTEGRATIONS")
-
-def init():
-    is_jwe_set = "KAGGLE_USER_SECRETS_TOKEN" in os.environ
-    if kaggle_proxy_token or is_jwe_set:
-        init_bigquery()
-    if is_jwe_set:
-        from kaggle_gcp import get_integrations
-        if get_integrations().has_gcs():
-            init_gcs()
-
-
-def init_bigquery():
-    from google.auth import environment_vars
-    from google.cloud import bigquery
-    # TODO: Update this to the correct kaggle.gcp path once we no longer inject modules
-    # from the worker.
-    from kaggle_gcp import get_integrations, PublicBigqueryClient, KaggleKernelCredentials
-
-    # If this Kernel has bigquery integration on startup, preload the Kaggle Credentials
-    # object for magics to work.
-    if get_integrations().has_bigquery():
-        from google.cloud.bigquery import magics
-        magics.context.credentials = KaggleKernelCredentials()
-
-    def monkeypatch_bq(bq_client, *args, **kwargs):
-        specified_credentials = kwargs.get('credentials')
-        has_bigquery = get_integrations().has_bigquery()
-        # Prioritize passed in project id, but if it is missing look for env var.
-        arg_project = kwargs.get('project')
-        explicit_project_id = arg_project or os.environ.get(environment_vars.PROJECT)
-        # This is a hack to get around the bug in google-cloud library.
-        # Remove these two lines once this is resolved:
-        # https://github.com/googleapis/google-cloud-python/issues/8108
-        if explicit_project_id:
-            Log.info(f"Explicit project set to {explicit_project_id}")
-            kwargs['project'] = explicit_project_id
-        if explicit_project_id is None and specified_credentials is None and not has_bigquery:
-            msg = "Using Kaggle's public dataset BigQuery integration."
-            Log.info(msg)
-            print(msg)
-            return PublicBigqueryClient(*args, **kwargs)
-
-        else:
-            if specified_credentials is None:
-                Log.info("No credentials specified, using KaggleKernelCredentials.")
-                kwargs['credentials'] = KaggleKernelCredentials()
-            if (not has_bigquery):
-                Log.info("No bigquery integration found, creating client anyways.")
-                print('Please ensure you have selected a BigQuery '
-                      'account in the Kernels Settings sidebar.')
-            return bq_client(*args, **kwargs)
-
-    # Monkey patches BigQuery client creation to use proxy or user-connected GCP account.
-    # Deprecated in favor of Kaggle.DataProxyClient().
-    # TODO: Remove this once uses have migrated to that new interface.
-    bq_client = bigquery.Client
-    bigquery.Client = lambda *args, **kwargs: monkeypatch_bq(
-        bq_client, *args, **kwargs)
-
-
-def init_gcs():
-    from kaggle_secrets import GcpTarget
-    from kaggle_gcp import KaggleKernelCredentials
-    from google.cloud import storage
-    def monkeypatch_gcs(gcs_client, *args, **kwargs):
-        specified_credentials = kwargs.get('credentials')
-        if specified_credentials is None:
-            Log.info("No credentials specified, using KaggleKernelCredentials.")
-            kwargs['credentials'] = KaggleKernelCredentials(target=GcpTarget.GCS)
-        return gcs_client(*args, **kwargs)
-
-    gcs_client = storage.Client
-    storage.Client = lambda *args, **kwargs: monkeypatch_gcs(gcs_client, *args, **kwargs)
-
-
-init()
+import sys
+import importlib
+import importlib.machinery
+
+class GcpModuleFinder(importlib.abc.MetaPathFinder):
+    _MODULES = ['google.cloud.bigquery', 'google.cloud.storage']
+    _KAGGLE_GCP_PATH = 'kaggle_gcp.py'
+    def __init__(self):
+        pass
+
+    def _is_called_from_kaggle_gcp(self):
+        import inspect
+        for frame in inspect.stack():
+            if os.path.basename(frame.filename) == self._KAGGLE_GCP_PATH:
+                return True
+        return False
+
+    def find_spec(self, fullname, path, target=None):
+        if fullname in self._MODULES:
+            # If being called from kaggle_gcp, don't return our
+            # monkeypatched module to avoid circular dependency,
+            # since we call kaggle_gcp to load the module.
+            if self._is_called_from_kaggle_gcp():
+                return None
+            return importlib.machinery.ModuleSpec(fullname, GcpModuleLoader())
+
+
+class GcpModuleLoader(importlib.abc.Loader):
+    def __init__(self):
+        pass
+
+    def create_module(self, spec):
+        """Create the gcp module from the spec.
+        """
+        import kaggle_gcp
+        _LOADERS = {
+            'google.cloud.bigquery': kaggle_gcp.init_bigquery,
+            'google.cloud.storage': kaggle_gcp.init_gcs
+        }
+        monkeypatch_gcp_module = _LOADERS[spec.name]()
+        return monkeypatch_gcp_module
+
+    def exec_module(self, module):
+        pass
+
+
+if not hasattr(sys, 'frozen'):
+    sys.meta_path.insert(0, GcpModuleFinder())
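The finder/loader pair added above follows the standard importlib meta-path protocol (PEP 451). For readers unfamiliar with that API, here is a self-contained sketch of the same pattern using a hypothetical module name instead of google.cloud.*:

    import importlib.abc
    import importlib.machinery
    import sys
    import types


    class LazyDemoFinder(importlib.abc.MetaPathFinder):
        # Intercept only our hypothetical module; returning None defers
        # everything else to the normal import machinery.
        def find_spec(self, fullname, path, target=None):
            if fullname == "lazy_demo":
                return importlib.machinery.ModuleSpec(fullname, LazyDemoLoader())
            return None


    class LazyDemoLoader(importlib.abc.Loader):
        def create_module(self, spec):
            # Build (or patch) the module only when it is first imported.
            module = types.ModuleType(spec.name)
            module.answer = 42
            return module

        def exec_module(self, module):
            pass  # create_module already did the work


    sys.meta_path.insert(0, LazyDemoFinder())

    import lazy_demo          # resolved by LazyDemoFinder, not the filesystem
    print(lazy_demo.answer)   # 42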

tests/test_bigquery.py

Lines changed: 3 additions & 5 deletions

@@ -11,7 +11,7 @@
 from google.cloud import bigquery
 from google.auth.exceptions import DefaultCredentialsError
 from google.cloud.bigquery._http import Connection
-from kaggle_gcp import KaggleKernelCredentials, PublicBigqueryClient, _DataProxyConnection
+from kaggle_gcp import KaggleKernelCredentials, PublicBigqueryClient, _DataProxyConnection, init_bigquery
 import kaggle_secrets
 
 
@@ -149,8 +149,7 @@ def test_magics_with_connected_account_default_credentials(self):
         env.set('KAGGLE_USER_SECRETS_TOKEN', 'foobar')
         env.set('KAGGLE_KERNEL_INTEGRATIONS', 'BIGQUERY')
         with env:
-            import sitecustomize
-            sitecustomize.init()
+            init_bigquery()
             from google.cloud.bigquery import magics
             self.assertEqual(type(magics.context._credentials), KaggleKernelCredentials)
             magics.context.credentials = None
@@ -159,7 +158,6 @@ def test_magics_without_connected_account(self):
         env = EnvironmentVarGuard()
         env.set('KAGGLE_USER_SECRETS_TOKEN', 'foobar')
         with env:
-            import sitecustomize
-            sitecustomize.init()
+            init_bigquery()
             from google.cloud.bigquery import magics
             self.assertIsNone(magics.context._credentials)

tests/test_gcs.py

Lines changed: 3 additions & 5 deletions

@@ -2,7 +2,7 @@
 
 from unittest.mock import Mock
 
-from kaggle_gcp import KaggleKernelCredentials
+from kaggle_gcp import KaggleKernelCredentials, init_gcs
 from test.support import EnvironmentVarGuard
 from google.cloud import storage
 
@@ -21,8 +21,7 @@ def test_ctr(self):
         env.set('KAGGLE_USER_SECRETS_TOKEN', 'foobar')
         env.set('KAGGLE_KERNEL_INTEGRATIONS', 'GCS')
         with env:
-            from sitecustomize import init
-            init()
+            init_gcs()
             client = storage.Client(project="xyz", credentials=credentials)
             self.assertEqual(client.project, "xyz")
             self.assertNotIsInstance(client._credentials, KaggleKernelCredentials)
@@ -38,7 +37,6 @@ def test_default_credentials_gcs_enabled(self):
         env.set('KAGGLE_USER_SECRETS_TOKEN', 'foobar')
         env.set('KAGGLE_KERNEL_INTEGRATIONS', 'GCS')
         with env:
-            from sitecustomize import init
-            init()
+            init_gcs()
             client = storage.Client(project="xyz")
             self.assertIsInstance(client._credentials, KaggleKernelCredentials)
