You signed in with another tab or window. Reload to refresh your session. You signed out in another tab or window. Reload to refresh your session. You switched accounts on another tab or window. Reload to refresh your session. Dismiss alert
When I use multi-GPU training (num_gpus >= 2), I get the error "RuntimeError: all tensors must be on devices[0]". Please help me fix it if you can!
Traceback (most recent call last):
File "tools/train.py", line 98, in <module>
Traceback (most recent call last):
File "tools/train.py", line 98, in <module>
main()
File "tools/train.py", line 94, in main
logger=logger)
File "/home/by/APP/anaconda2/envs/hrnet-det/lib/python3.6/site-packages/mmdet-0.6rc0+3493751-py3.6.egg/mmdet/apis/train.py", line 57, in train_detector
_dist_train(model, dataset, cfg, validate=validate)
File "/home/by/APP/anaconda2/envs/hrnet-det/lib/python3.6/site-packages/mmdet-0.6rc0+3493751-py3.6.egg/mmdet/apis/train.py", line 78, in _dist_train
main()
File "tools/train.py", line 94, in main
model = MMDistributedDataParallel(model.cuda())
File "/home/by/APP/anaconda2/envs/hrnet-det/lib/python3.6/site-packages/torch/nn/parallel/distributed.py", line 217, in __init__
logger=logger)
File "/home/by/APP/anaconda2/envs/hrnet-det/lib/python3.6/site-packages/mmdet-0.6rc0+3493751-py3.6.egg/mmdet/apis/train.py", line 57, in train_detector
_dist_train(model, dataset, cfg, validate=validate)
File "/home/by/APP/anaconda2/envs/hrnet-det/lib/python3.6/site-packages/mmdet-0.6rc0+3493751-py3.6.egg/mmdet/apis/train.py", line 78, in _dist_train
self._ddp_init_helper()
File "/home/by/APP/anaconda2/envs/hrnet-det/lib/python3.6/site-packages/torch/nn/parallel/distributed.py", line 232, in _ddp_init_helper
model = MMDistributedDataParallel(model.cuda())
File "/home/by/APP/anaconda2/envs/hrnet-det/lib/python3.6/site-packages/torch/nn/parallel/distributed.py", line 217, in __init__
self._module_copies = replicate(self.module, self.device_ids, detach=True)
File "/home/by/APP/anaconda2/envs/hrnet-det/lib/python3.6/site-packages/torch/nn/parallel/replicate.py", line 13, in replicate
param_copies = Broadcast.apply(devices, *params)
File "/home/by/APP/anaconda2/envs/hrnet-det/lib/python3.6/site-packages/torch/nn/parallel/_functions.py", line 21, in forward
self._ddp_init_helper()
File "/home/by/APP/anaconda2/envs/hrnet-det/lib/python3.6/site-packages/torch/nn/parallel/distributed.py", line 232, in _ddp_init_helper
outputs = comm.broadcast_coalesced(inputs, ctx.target_gpus)
File "/home/by/APP/anaconda2/envs/hrnet-det/lib/python3.6/site-packages/torch/cuda/comm.py", line 40, in broadcast_coalesced
self._module_copies = replicate(self.module, self.device_ids, detach=True)
File "/home/by/APP/anaconda2/envs/hrnet-det/lib/python3.6/site-packages/torch/nn/parallel/replicate.py", line 13, in replicate
return torch._C._broadcast_coalesced(tensors, devices, buffer_size)
RuntimeError: all tensors must be on devices[0]
param_copies = Broadcast.apply(devices, *params)
File "/home/by/APP/anaconda2/envs/hrnet-det/lib/python3.6/site-packages/torch/nn/parallel/_functions.py", line 21, in forward
outputs = comm.broadcast_coalesced(inputs, ctx.target_gpus)
File "/home/by/APP/anaconda2/envs/hrnet-det/lib/python3.6/site-packages/torch/cuda/comm.py", line 40, in broadcast_coalesced
return torch._C._broadcast_coalesced(tensors, devices, buffer_size)
RuntimeError: all tensors must be on devices[0]
The text was updated successfully, but these errors were encountered:
When I use multi-GPU training (num_gpus >= 2), I get the error "RuntimeError: all tensors must be on devices[0]". Please help me fix it if you can!
Traceback (most recent call last):
File "tools/train.py", line 98, in <module>
Traceback (most recent call last):
File "tools/train.py", line 98, in <module>
main()
File "tools/train.py", line 94, in main
logger=logger)
File "/home/by/APP/anaconda2/envs/hrnet-det/lib/python3.6/site-packages/mmdet-0.6rc0+3493751-py3.6.egg/mmdet/apis/train.py", line 57, in train_detector
_dist_train(model, dataset, cfg, validate=validate)
File "/home/by/APP/anaconda2/envs/hrnet-det/lib/python3.6/site-packages/mmdet-0.6rc0+3493751-py3.6.egg/mmdet/apis/train.py", line 78, in _dist_train
main()
File "tools/train.py", line 94, in main
model = MMDistributedDataParallel(model.cuda())
File "/home/by/APP/anaconda2/envs/hrnet-det/lib/python3.6/site-packages/torch/nn/parallel/distributed.py", line 217, in __init__
logger=logger)
File "/home/by/APP/anaconda2/envs/hrnet-det/lib/python3.6/site-packages/mmdet-0.6rc0+3493751-py3.6.egg/mmdet/apis/train.py", line 57, in train_detector
_dist_train(model, dataset, cfg, validate=validate)
File "/home/by/APP/anaconda2/envs/hrnet-det/lib/python3.6/site-packages/mmdet-0.6rc0+3493751-py3.6.egg/mmdet/apis/train.py", line 78, in _dist_train
self._ddp_init_helper()
File "/home/by/APP/anaconda2/envs/hrnet-det/lib/python3.6/site-packages/torch/nn/parallel/distributed.py", line 232, in _ddp_init_helper
model = MMDistributedDataParallel(model.cuda())
File "/home/by/APP/anaconda2/envs/hrnet-det/lib/python3.6/site-packages/torch/nn/parallel/distributed.py", line 217, in __init__
self._module_copies = replicate(self.module, self.device_ids, detach=True)
File "/home/by/APP/anaconda2/envs/hrnet-det/lib/python3.6/site-packages/torch/nn/parallel/replicate.py", line 13, in replicate
param_copies = Broadcast.apply(devices, *params)
File "/home/by/APP/anaconda2/envs/hrnet-det/lib/python3.6/site-packages/torch/nn/parallel/_functions.py", line 21, in forward
self._ddp_init_helper()
File "/home/by/APP/anaconda2/envs/hrnet-det/lib/python3.6/site-packages/torch/nn/parallel/distributed.py", line 232, in _ddp_init_helper
outputs = comm.broadcast_coalesced(inputs, ctx.target_gpus)
File "/home/by/APP/anaconda2/envs/hrnet-det/lib/python3.6/site-packages/torch/cuda/comm.py", line 40, in broadcast_coalesced
self._module_copies = replicate(self.module, self.device_ids, detach=True)
File "/home/by/APP/anaconda2/envs/hrnet-det/lib/python3.6/site-packages/torch/nn/parallel/replicate.py", line 13, in replicate
return torch._C._broadcast_coalesced(tensors, devices, buffer_size)
RuntimeError: all tensors must be on devices[0]
param_copies = Broadcast.apply(devices, *params)
File "/home/by/APP/anaconda2/envs/hrnet-det/lib/python3.6/site-packages/torch/nn/parallel/_functions.py", line 21, in forward
outputs = comm.broadcast_coalesced(inputs, ctx.target_gpus)
File "/home/by/APP/anaconda2/envs/hrnet-det/lib/python3.6/site-packages/torch/cuda/comm.py", line 40, in broadcast_coalesced
return torch._C._broadcast_coalesced(tensors, devices, buffer_size)
RuntimeError: all tensors must be on devices[0]
The text was updated successfully, but these errors were encountered: