You signed in with another tab or window. Reload to refresh your session.You signed out in another tab or window. Reload to refresh your session.You switched accounts on another tab or window. Reload to refresh your session.Dismiss alert
I was training yolox-s on 8 2080 gpus with batch size 64. Each time at 10/300 epoch the issue happens.
The error log is as follows:
Exception in thread Thread-1:
Traceback (most recent call last):
File "/home/hdzhang/miniconda3/envs/centernet/lib/python3.8/threading.py", line 932, in _bootstrap_inner
self.run()
File "/home/hdzhang/miniconda3/envs/centernet/lib/python3.8/threading.py", line 870, in run
self._target(*self._args, **self._kwargs)
File "/home/hdzhang/miniconda3/envs/centernet/lib/python3.8/site-packages/torch/utils/data/_utils/pin_memory.py", line 28, in _pin_memory_loop
r = in_queue.get(timeout=MP_STATUS_CHECK_INTERVAL)
File "/home/hdzhang/miniconda3/envs/centernet/lib/python3.8/multiprocessing/queues.py", line 116, in get
return _ForkingPickler.loads(res)
File "/home/hdzhang/miniconda3/envs/centernet/lib/python3.8/site-packages/torch/multiprocessing/reductions.py", line 289, in rebuild_storage_fd
fd = df.detach()
File "/home/hdzhang/miniconda3/envs/centernet/lib/python3.8/multiprocessing/resource_sharer.py", line 57, in detach
with _resource_sharer.get_connection(self._id) as conn:
File "/home/hdzhang/miniconda3/envs/centernet/lib/python3.8/multiprocessing/resource_sharer.py", line 87, in get_connection
c = Client(address, authkey=process.current_process().authkey)
File "/home/hdzhang/miniconda3/envs/centernet/lib/python3.8/multiprocessing/connection.py", line 508, in Client
answer_challenge(c, authkey)
File "/home/hdzhang/miniconda3/envs/centernet/lib/python3.8/multiprocessing/connection.py", line 752, in answer_challenge
message = connection.recv_bytes(256) # reject large message
File "/home/hdzhang/miniconda3/envs/centernet/lib/python3.8/multiprocessing/connection.py", line 216, in recv_bytes
buf = self._recv_bytes(maxlength)
File "/home/hdzhang/miniconda3/envs/centernet/lib/python3.8/multiprocessing/connection.py", line 414, in _recv_bytes
buf = self._recv(4)
File "/home/hdzhang/miniconda3/envs/centernet/lib/python3.8/multiprocessing/connection.py", line 383, in _recv
raise EOFError
EOFError
Exception in thread Thread-1:
Traceback (most recent call last):
File "/home/hdzhang/miniconda3/envs/centernet/lib/python3.8/threading.py", line 932, in _bootstrap_inner
self.run()
File "/home/hdzhang/miniconda3/envs/centernet/lib/python3.8/threading.py", line 870, in run
self._target(*self._args, **self._kwargs)
File "/home/hdzhang/miniconda3/envs/centernet/lib/python3.8/site-packages/torch/utils/data/_utils/pin_memory.py", line 28, in _pin_memory_loop
r = in_queue.get(timeout=MP_STATUS_CHECK_INTERVAL)
File "/home/hdzhang/miniconda3/envs/centernet/lib/python3.8/multiprocessing/queues.py", line 116, in get
return _ForkingPickler.loads(res)
File "/home/hdzhang/miniconda3/envs/centernet/lib/python3.8/site-packages/torch/multiprocessing/reductions.py", line 289, in rebuild_storage_fd
fd = df.detach()
File "/home/hdzhang/miniconda3/envs/centernet/lib/python3.8/multiprocessing/resource_sharer.py", line 57, in detach
with _resource_sharer.get_connection(self._id) as conn:
File "/home/hdzhang/miniconda3/envs/centernet/lib/python3.8/multiprocessing/resource_sharer.py", line 87, in get_connection
c = Client(address, authkey=process.current_process().authkey)
File "/home/hdzhang/miniconda3/envs/centernet/lib/python3.8/multiprocessing/connection.py", line 509, in Client
deliver_challenge(c, authkey)
File "/home/hdzhang/miniconda3/envs/centernet/lib/python3.8/multiprocessing/connection.py", line 740, in deliver_challenge
response = connection.recv_bytes(256) # reject large message
File "/home/hdzhang/miniconda3/envs/centernet/lib/python3.8/multiprocessing/connection.py", line 216, in recv_bytes
buf = self._recv_bytes(maxlength)
File "/home/hdzhang/miniconda3/envs/centernet/lib/python3.8/multiprocessing/connection.py", line 414, in _recv_bytes
buf = self._recv(4)
File "/home/hdzhang/miniconda3/envs/centernet/lib/python3.8/multiprocessing/connection.py", line 379, in _recv
chunk = read(handle, remaining)
ConnectionResetError: [Errno 104] Connection reset by peer
Exception in thread Thread-1:
Traceback (most recent call last):
File "/home/hdzhang/miniconda3/envs/centernet/lib/python3.8/threading.py", line 932, in _bootstrap_inner
self.run()
File "/home/hdzhang/miniconda3/envs/centernet/lib/python3.8/threading.py", line 870, in run
self._target(*self._args, **self._kwargs)
File "/home/hdzhang/miniconda3/envs/centernet/lib/python3.8/site-packages/torch/utils/data/_utils/pin_memory.py", line 28, in _pin_memory_loop
r = in_queue.get(timeout=MP_STATUS_CHECK_INTERVAL)
File "/home/hdzhang/miniconda3/envs/centernet/lib/python3.8/multiprocessing/queues.py", line 116, in get
return _ForkingPickler.loads(res)
File "/home/hdzhang/miniconda3/envs/centernet/lib/python3.8/site-packages/torch/multiprocessing/reductions.py", line 289, in rebuild_storage_fd
fd = df.detach()
File "/home/hdzhang/miniconda3/envs/centernet/lib/python3.8/multiprocessing/resource_sharer.py", line 57, in detach
with _resource_sharer.get_connection(self._id) as conn:
2021-07-21 09:11:10 | ERROR | yolox.core.launch:104 - An error has been caught in function '_distributed_worker', process 'SpawnProcess-1' (28776), thread 'MainThread' (140335833473600):
Traceback (most recent call last):
File "<string>", line 1, in <module>
File "/home/hdzhang/miniconda3/envs/centernet/lib/python3.8/multiprocessing/spawn.py", line 116, in spawn_main
exitcode = _main(fd, parent_sentinel)
│ │ └ 3
│ └ 37
└ <function _main at 0x7fa27a2465e0>
File "/home/hdzhang/miniconda3/envs/centernet/lib/python3.8/multiprocessing/spawn.py", line 129, in _main
return self._bootstrap(parent_sentinel)
│ │ └ 3
│ └ <function BaseProcess._bootstrap at 0x7fa27a369820>
└ <SpawnProcess name='SpawnProcess-1' parent=28716 started>
File "/home/hdzhang/miniconda3/envs/centernet/lib/python3.8/multiprocessing/process.py", line 315, in _bootstrap
self.run()
│ └ <function BaseProcess.run at 0x7fa27a37fe50>
└ <SpawnProcess name='SpawnProcess-1' parent=28716 started>
File "/home/hdzhang/miniconda3/envs/centernet/lib/python3.8/multiprocessing/process.py", line 108, in run
self._target(*self._args, **self._kwargs)
│ │ │ │ │ └ {}
│ │ │ │ └ <SpawnProcess name='SpawnProcess-1' parent=28716 started>
│ │ │ └ (<function _distributed_worker at 0x7fa0130f5550>, 0, (<function main at 0x7fa00a620af0>, 8, 8, 0, 'nccl', 'tcp://127.0.0.1:4...
│ │ └ <SpawnProcess name='SpawnProcess-1' parent=28716 started>
│ └ <function _wrap at 0x7fa01731edc0>
└ <SpawnProcess name='SpawnProcess-1' parent=28716 started>
File "/home/hdzhang/miniconda3/envs/centernet/lib/python3.8/site-packages/torch/multiprocessing/spawn.py", line 59, in _wrap
fn(i, *args)
│ │ └ (<function main at 0x7fa00a620af0>, 8, 8, 0, 'nccl', 'tcp://127.0.0.1:48781', (╒══════════════════╤══════════════════════════...
│ └ 0
└ <function _distributed_worker at 0x7fa0130f5550>
> File "/home/hdzhang/YOLOX/yolox/core/launch.py", line 104, in _distributed_worker
main_func(*args)
│ └ (╒══════════════════╤════════════════════════════════════════════════════════════════════════════════════════════════════════...
└ <function main at 0x7fa00a620af0>
File "/home/hdzhang/YOLOX/tools/train.py", line 101, in main
trainer.train()
│ └ <function Trainer.train at 0x7fa01767d5e0>
└ <yolox.core.trainer.Trainer object at 0x7fa00a635d30>
File "/home/hdzhang/YOLOX/yolox/core/trainer.py", line 70, in train
self.train_in_epoch()
│ └ <function Trainer.train_in_epoch at 0x7fa00a8e8310>
└ <yolox.core.trainer.Trainer object at 0x7fa00a635d30>
File "/home/hdzhang/YOLOX/yolox/core/trainer.py", line 80, in train_in_epoch
self.after_epoch()
│ └ <function Trainer.after_epoch at 0x7fa00a6005e0>
└ <yolox.core.trainer.Trainer object at 0x7fa00a635d30>
File "/home/hdzhang/YOLOX/yolox/core/trainer.py", line 209, in after_epoch
all_reduce_norm(self.model)
│ │ └ DistributedDataParallel(
│ │ (module): YOLOX(
│ │ (backbone): YOLOPAFPN(
│ │ (backbone): CSPDarknet(
│ │ (stem): Focus(
│ │ ...
│ └ <yolox.core.trainer.Trainer object at 0x7fa00a635d30>
└ <function all_reduce_norm at 0x7fa016ef95e0>
File "/home/hdzhang/YOLOX/yolox/utils/allreduce_norm.py", line 99, in all_reduce_norm
states = all_reduce(states, op="mean")
│ └ OrderedDict([('module.backbone.backbone.stem.conv.bn.weight', tensor([1.4156, 2.5198, 2.6882, 1.5280, 3.4103, 2.3906, 2.5711,...
└ <function all_reduce at 0x7fa016ef9550>
File "/home/hdzhang/YOLOX/yolox/utils/allreduce_norm.py", line 68, in all_reduce
group = _get_global_gloo_group()
└ <functools._lru_cache_wrapper object at 0x7fa016ef9040>
File "/home/hdzhang/YOLOX/yolox/utils/dist.py", line 103, in _get_global_gloo_group
return dist.new_group(backend="gloo")
│ └ <function new_group at 0x7fa017800820>
└ <module 'torch.distributed' from '/home/hdzhang/miniconda3/envs/centernet/lib/python3.8/site-packages/torch/distributed/__ini...
File "/home/hdzhang/miniconda3/envs/centernet/lib/python3.8/site-packages/torch/distributed/distributed_c10d.py", line 2694, in new_group
pg = _new_process_group_helper(group_world_size,
│ └ 8
└ <function _new_process_group_helper at 0x7fa0177ff1f0>
File "/home/hdzhang/miniconda3/envs/centernet/lib/python3.8/site-packages/torch/distributed/distributed_c10d.py", line 616, in _new_process_group_helper
pg = ProcessGroupGloo(
└ <class 'torch._C._distributed_c10d.ProcessGroupGloo'>
The text was updated successfully, but these errors were encountered:
Hi, thanks for your report and we also noticed this case. We are currently diagnosing the problem. You can temporarily change the number of workers to 0 for the eval dataset to skip this error, but the testing speed may be slower.
@zhanghanduo
This error can be solved by resetting the NCCL and Gloo network-interface environment variables to 'lo' in setup_env.py as follows:
The num_workers parameter does not need to be set to 0. According to my trial, this method will also speed up your training process!
@qbTrible's solution didn't work for me. I noticed that setting configure_omp(x) to the same number of threads as data_num_workers did the trick. Specifically, in this line:
I was training yolox-s on 8 2080 gpus with batch size 64. Each time at 10/300 epoch the issue happens.
The error log is as follows:
Exception in thread Thread-1:
Traceback (most recent call last):
File "/home/hdzhang/miniconda3/envs/centernet/lib/python3.8/threading.py", line 932, in _bootstrap_inner
self.run()
File "/home/hdzhang/miniconda3/envs/centernet/lib/python3.8/threading.py", line 870, in run
self._target(*self._args, **self._kwargs)
File "/home/hdzhang/miniconda3/envs/centernet/lib/python3.8/site-packages/torch/utils/data/_utils/pin_memory.py", line 28, in _pin_memory_loop
r = in_queue.get(timeout=MP_STATUS_CHECK_INTERVAL)
File "/home/hdzhang/miniconda3/envs/centernet/lib/python3.8/multiprocessing/queues.py", line 116, in get
return _ForkingPickler.loads(res)
File "/home/hdzhang/miniconda3/envs/centernet/lib/python3.8/site-packages/torch/multiprocessing/reductions.py", line 289, in rebuild_storage_fd
fd = df.detach()
File "/home/hdzhang/miniconda3/envs/centernet/lib/python3.8/multiprocessing/resource_sharer.py", line 57, in detach
with _resource_sharer.get_connection(self._id) as conn:
File "/home/hdzhang/miniconda3/envs/centernet/lib/python3.8/multiprocessing/resource_sharer.py", line 87, in get_connection
c = Client(address, authkey=process.current_process().authkey)
File "/home/hdzhang/miniconda3/envs/centernet/lib/python3.8/multiprocessing/connection.py", line 508, in Client
answer_challenge(c, authkey)
File "/home/hdzhang/miniconda3/envs/centernet/lib/python3.8/multiprocessing/connection.py", line 752, in answer_challenge
message = connection.recv_bytes(256) # reject large message
File "/home/hdzhang/miniconda3/envs/centernet/lib/python3.8/multiprocessing/connection.py", line 216, in recv_bytes
buf = self._recv_bytes(maxlength)
File "/home/hdzhang/miniconda3/envs/centernet/lib/python3.8/multiprocessing/connection.py", line 414, in _recv_bytes
buf = self._recv(4)
File "/home/hdzhang/miniconda3/envs/centernet/lib/python3.8/multiprocessing/connection.py", line 383, in _recv
raise EOFError
EOFError
Exception in thread Thread-1:
Traceback (most recent call last):
File "/home/hdzhang/miniconda3/envs/centernet/lib/python3.8/threading.py", line 932, in _bootstrap_inner
self.run()
File "/home/hdzhang/miniconda3/envs/centernet/lib/python3.8/threading.py", line 870, in run
self._target(*self._args, **self._kwargs)
File "/home/hdzhang/miniconda3/envs/centernet/lib/python3.8/site-packages/torch/utils/data/_utils/pin_memory.py", line 28, in _pin_memory_loop
r = in_queue.get(timeout=MP_STATUS_CHECK_INTERVAL)
File "/home/hdzhang/miniconda3/envs/centernet/lib/python3.8/multiprocessing/queues.py", line 116, in get
return _ForkingPickler.loads(res)
File "/home/hdzhang/miniconda3/envs/centernet/lib/python3.8/site-packages/torch/multiprocessing/reductions.py", line 289, in rebuild_storage_fd
fd = df.detach()
File "/home/hdzhang/miniconda3/envs/centernet/lib/python3.8/multiprocessing/resource_sharer.py", line 57, in detach
with _resource_sharer.get_connection(self._id) as conn:
File "/home/hdzhang/miniconda3/envs/centernet/lib/python3.8/multiprocessing/resource_sharer.py", line 87, in get_connection
c = Client(address, authkey=process.current_process().authkey)
File "/home/hdzhang/miniconda3/envs/centernet/lib/python3.8/multiprocessing/connection.py", line 509, in Client
deliver_challenge(c, authkey)
File "/home/hdzhang/miniconda3/envs/centernet/lib/python3.8/multiprocessing/connection.py", line 740, in deliver_challenge
response = connection.recv_bytes(256) # reject large message
File "/home/hdzhang/miniconda3/envs/centernet/lib/python3.8/multiprocessing/connection.py", line 216, in recv_bytes
buf = self._recv_bytes(maxlength)
File "/home/hdzhang/miniconda3/envs/centernet/lib/python3.8/multiprocessing/connection.py", line 414, in _recv_bytes
buf = self._recv(4)
File "/home/hdzhang/miniconda3/envs/centernet/lib/python3.8/multiprocessing/connection.py", line 379, in _recv
chunk = read(handle, remaining)
ConnectionResetError: [Errno 104] Connection reset by peer
Exception in thread Thread-1:
Traceback (most recent call last):
File "/home/hdzhang/miniconda3/envs/centernet/lib/python3.8/threading.py", line 932, in _bootstrap_inner
self.run()
File "/home/hdzhang/miniconda3/envs/centernet/lib/python3.8/threading.py", line 870, in run
self._target(*self._args, **self._kwargs)
File "/home/hdzhang/miniconda3/envs/centernet/lib/python3.8/site-packages/torch/utils/data/_utils/pin_memory.py", line 28, in _pin_memory_loop
r = in_queue.get(timeout=MP_STATUS_CHECK_INTERVAL)
File "/home/hdzhang/miniconda3/envs/centernet/lib/python3.8/multiprocessing/queues.py", line 116, in get
return _ForkingPickler.loads(res)
File "/home/hdzhang/miniconda3/envs/centernet/lib/python3.8/site-packages/torch/multiprocessing/reductions.py", line 289, in rebuild_storage_fd
fd = df.detach()
File "/home/hdzhang/miniconda3/envs/centernet/lib/python3.8/multiprocessing/resource_sharer.py", line 57, in detach
with _resource_sharer.get_connection(self._id) as conn:
2021-07-21 09:11:10 | ERROR | yolox.core.launch:104 - An error has been caught in function '_distributed_worker', process 'SpawnProcess-1' (28776), thread 'MainThread' (140335833473600):
Traceback (most recent call last):
The text was updated successfully, but these errors were encountered: