Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

InvalidArgumentError: device CUDA:0 not supported by XLA service while setting up XLA_GPU_JIT device number 0 #423

Closed
emtropyml opened this issue Nov 7, 2019 · 5 comments

Comments

@emtropyml
Copy link

Inference: 0%| | 0/2 [00:04<?, ?it/s]


InvalidArgumentError Traceback (most recent call last)
in
----> 1 calculate_absa(pos_review, neg_review)

in calculate_absa(pos_review, neg_review)
16
17 def calculate_absa(pos_review, neg_review):
---> 18 comp = model.predict([pos_review, neg_review], threshold=0.5)
19 pos_comp = comp[0]
20 neg_comp = comp[1]

~/.local/lib/python3.5/site-packages/finetune/target_models/multi_label_classifier.py in predict(self, X, threshold)
49 self.config._threshold = threshold or self.config.multi_label_threshold
50 all_labels = []
---> 51 for _, start_of_doc, end_of_doc, _, proba in self.process_long_sequence(X):
52 if start_of_doc:
53 # if this is the first chunk in a document, start accumulating from scratch

~/.local/lib/python3.5/site-packages/finetune/base.py in process_long_sequence(self, X)
810
811 labels, batch_probas = [], []
--> 812 for pred in self._inference(X, predict_keys=[PredictMode.PROBAS, PredictMode.NORMAL], n_examples=len(flat_array_encoded)):
813 normal_pred = pred[PredictMode.NORMAL]
814 if not hasattr(self, 'multi_label'):

~/.local/lib/python3.5/site-packages/finetune/base.py in _inference(self, Xs, predict_keys, n_examples)
425 if self._cached_predict:
426 return self._cached_inference(
--> 427 Xs=Xs, predict_keys=predict_keys, n_examples=n_examples
428 )
429 else:

~/.local/lib/python3.5/site-packages/finetune/base.py in _cached_inference(self, Xs, predict_keys, n_examples)
409 predictions = [None] * n
410 for i in tqdm.tqdm(range(n), total=n, desc="Inference"):
--> 411 y = next(self._predictions)
412 try:
413 y = y[predict_keys[0]] if len(predict_keys) == 1 else y

~/.local/lib/python3.5/site-packages/tensorflow_estimator/python/estimator/estimator.py in predict(self, input_fn, predict_keys, hooks, checkpoint_path, yield_single_examples)
633 scaffold=estimator_spec.scaffold,
634 config=self._session_config),
--> 635 hooks=all_hooks) as mon_sess:
636 while not mon_sess.should_stop():
637 preds_evaluated = mon_sess.run(predictions)

~/.local/lib/python3.5/site-packages/tensorflow/python/training/monitored_session.py in init(self, session_creator, hooks, stop_grace_period_secs)
1005 hooks,
1006 should_recover=True,
-> 1007 stop_grace_period_secs=stop_grace_period_secs)
1008
1009

~/.local/lib/python3.5/site-packages/tensorflow/python/training/monitored_session.py in init(self, session_creator, hooks, should_recover, stop_grace_period_secs)
723 stop_grace_period_secs=stop_grace_period_secs)
724 if should_recover:
--> 725 self._sess = _RecoverableSession(self._coordinated_creator)
726 else:
727 self._sess = self._coordinated_creator.create_session()

~/.local/lib/python3.5/site-packages/tensorflow/python/training/monitored_session.py in init(self, sess_creator)
1198 """
1199 self._sess_creator = sess_creator
-> 1200 _WrappedSession.init(self, self._create_session())
1201
1202 def _create_session(self):

~/.local/lib/python3.5/site-packages/tensorflow/python/training/monitored_session.py in _create_session(self)
1203 while True:
1204 try:
-> 1205 return self._sess_creator.create_session()
1206 except _PREEMPTION_ERRORS as e:
1207 logging.info(

~/.local/lib/python3.5/site-packages/tensorflow/python/training/monitored_session.py in create_session(self)
869 """Creates a coordinated session."""
870 # Keep the tf_sess for unit testing.
--> 871 self.tf_sess = self._session_creator.create_session()
872 # We don't want coordinator to suppress any exception.
873 self.coord = coordinator.Coordinator(clean_stop_exception_types=[])

~/.local/lib/python3.5/site-packages/tensorflow/python/training/monitored_session.py in create_session(self)
645 init_op=self._scaffold.init_op,
646 init_feed_dict=self._scaffold.init_feed_dict,
--> 647 init_fn=self._scaffold.init_fn)
648
649

~/.local/lib/python3.5/site-packages/tensorflow/python/training/session_manager.py in prepare_session(self, master, init_op, saver, checkpoint_dir, checkpoint_filename_with_path, wait_for_checkpoint, max_wait_secs, config, init_feed_dict, init_fn)
288 wait_for_checkpoint=wait_for_checkpoint,
289 max_wait_secs=max_wait_secs,
--> 290 config=config)
291 if not is_loaded_from_checkpoint:
292 if init_op is None and not init_fn and self._local_init_op is None:

~/.local/lib/python3.5/site-packages/tensorflow/python/training/session_manager.py in _restore_checkpoint(self, master, saver, checkpoint_dir, checkpoint_filename_with_path, wait_for_checkpoint, max_wait_secs, config)
192 strategy.extended._experimental_initialize_system() # pylint: disable=protected-access
193
--> 194 sess = session.Session(self._target, graph=self._graph, config=config)
195 if checkpoint_dir and checkpoint_filename_with_path:
196 raise ValueError("Can not provide both checkpoint_dir and "

~/.local/lib/python3.5/site-packages/tensorflow/python/client/session.py in init(self, target, graph, config)
1568
1569 """
-> 1570 super(Session, self).init(target, graph, config=config)
1571 # NOTE(mrry): Create these on first __enter__ to avoid a reference cycle.
1572 self._default_graph_context_manager = None

~/.local/lib/python3.5/site-packages/tensorflow/python/client/session.py in init(self, target, graph, config)
691 try:
692 # pylint: disable=protected-access
--> 693 self._session = tf_session.TF_NewSessionRef(self._graph._c_graph, opts)
694 # pylint: enable=protected-access
695 finally:

InvalidArgumentError: device CUDA:0 not supported by XLA service
while setting up XLA_GPU_JIT device number 0

@benleetownsend
Copy link
Contributor

benleetownsend commented Nov 7, 2019

Can you send over the output of pip freeze | grep "tensorflow\|finetune" and provide a minimal reproducible example if possible? We suspect this is a GPU memory error in disguise, but if you are able to reproduce this then there is probably more to it.

@emtropyml
Copy link
Author

emtropyml commented Nov 7, 2019

finetune==0.8.4
mesh-tensorflow==0.0.5
tensorflow==1.14.0
tensorflow-datasets==1.0.2
tensorflow-estimator==1.14.0
tensorflow-gpu==1.14.0
tensorflow-hub==0.5.0
tensorflow-metadata==0.13.0
tensorflow-probability==0.7.0rc0
tensorflow-serving-api-gpu==1.14.0

The code just loads a 'DistilBERT.model' file and then executes the model.predict() function. (It was working fine with the 'GPT2.model' file and Finetune 0.8.3 previously.)

@madisonmay
Copy link
Contributor

Does this happen consistently? And can you confirm your GPU isn't running something else when you kick off the script (by inspecting the output of nvidia-smi)?

@emtropyml
Copy link
Author

It was indeed a GPU memory error. Resolved now. Thanks!

@ShangWeiKuo
Copy link

Excuse me @emtropyml,
What did you do to resolve the InvalidArgumentError problem?

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Labels
None yet
Projects
None yet
Development

No branches or pull requests

4 participants