
Runtime Error when val #13

Closed
Chic-J opened this issue Jul 25, 2021 · 10 comments

Chic-J commented Jul 25, 2021

Thanks for your work, but I found an error when I tried to test the code on VOC:
```
Checkpoint <E:\Context-Aware-Consistency-master\pretrained\voc_1over8_datalist0_deeplabv3+_resnet101.pth> (epoch 63) was loaded

EVALUATION

  0%|          | 0/724 [00:17<?, ?it/s]
Traceback (most recent call last):
  File "D:\ProgramData\Anaconda3\envs\cv\lib\runpy.py", line 193, in _run_module_as_main
    "__main__", mod_spec)
  File "D:\ProgramData\Anaconda3\envs\cv\lib\runpy.py", line 85, in _run_code
    exec(code, run_globals)
  File "c:\Users\Administrator\.vscode\extensions\ms-python.python-2020.7.96456\pythonFiles\lib\python\debugpy\__main__.py", line 45, in <module>
    cli.main()
  File "c:\Users\Administrator\.vscode\extensions\ms-python.python-2020.7.96456\pythonFiles\lib\python\debugpy/..\debugpy\server\cli.py", line 430, in main
    run()
  File "c:\Users\Administrator\.vscode\extensions\ms-python.python-2020.7.96456\pythonFiles\lib\python\debugpy/..\debugpy\server\cli.py", line 267, in run_file
    runpy.run_path(options.target, run_name=compat.force_str("__main__"))
  File "D:\ProgramData\Anaconda3\envs\cv\lib\runpy.py", line 263, in run_path
    pkg_name=pkg_name, script_name=fname)
  File "D:\ProgramData\Anaconda3\envs\cv\lib\runpy.py", line 96, in _run_module_code
    mod_name, mod_spec, pkg_name, script_name)
  File "D:\ProgramData\Anaconda3\envs\cv\lib\runpy.py", line 85, in _run_code
    exec(code, run_globals)
  File "e:\Context-Aware-Consistency-master\train.py", line 128, in <module>
    main(config['n_gpu'], config['n_gpu'], config, args.resume, args.test)
  File "e:\Context-Aware-Consistency-master\train.py", line 99, in main
    trainer.train()
  File "e:\Context-Aware-Consistency-master\base\base_trainer.py", line 105, in train
    results = self._valid_epoch(0)
  File "e:\Context-Aware-Consistency-master\trainer.py", line 145, in _valid_epoch
    for batch_idx, (data, target) in enumerate(tbar):
  File "D:\ProgramData\Anaconda3\envs\cv\lib\site-packages\tqdm\std.py", line 1185, in __iter__
    for obj in iterable:
  File "D:\ProgramData\Anaconda3\envs\cv\lib\site-packages\torch\utils\data\dataloader.py", line 435, in __next__
    data = self._next_data()
  File "D:\ProgramData\Anaconda3\envs\cv\lib\site-packages\torch\utils\data\dataloader.py", line 475, in _next_data
    data = self._dataset_fetcher.fetch(index)  # may raise StopIteration
  File "D:\ProgramData\Anaconda3\envs\cv\lib\site-packages\torch\utils\data\_utils\fetch.py", line 47, in fetch
    return self.collate_fn(data)
  File "D:\ProgramData\Anaconda3\envs\cv\lib\site-packages\torch\utils\data\_utils\collate.py", line 83, in default_collate
    return [default_collate(samples) for samples in transposed]
  File "D:\ProgramData\Anaconda3\envs\cv\lib\site-packages\torch\utils\data\_utils\collate.py", line 83, in <listcomp>
    return [default_collate(samples) for samples in transposed]
  File "D:\ProgramData\Anaconda3\envs\cv\lib\site-packages\torch\utils\data\_utils\collate.py", line 55, in default_collate
    return torch.stack(batch, 0, out=out)
RuntimeError: stack expects each tensor to be equal size, but got [3, 375, 500] at entry 0 and [3, 396, 500] at entry 1
```
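The error comes from `default_collate`: the VOC validation images have varying sizes (375×500 vs. 396×500 here), so they cannot be stacked into one batch when the validation loader uses a `batch_size` greater than 1. Besides setting `batch_size` to 1, one possible workaround is a custom `collate_fn` that pads every sample in a batch to a common size before stacking. The sketch below is illustrative only and not part of the repo; the name `pad_collate` and the assumption that each sample is an `(image, target)` tensor pair are mine.

```python
# Hedged sketch (not from the repo): pad VOC val images/labels to the largest
# H/W in the batch so torch.stack succeeds with batch_size > 1.
# Assumes each sample is (image, target): image [3, H, W], target [H, W],
# with 255 used as the ignore index for padded label pixels.
import torch
import torch.nn.functional as F

def pad_collate(batch, ignore_index=255):
    images, targets = zip(*batch)
    max_h = max(img.shape[1] for img in images)
    max_w = max(img.shape[2] for img in images)

    padded_imgs, padded_tgts = [], []
    for img, tgt in zip(images, targets):
        pad_h, pad_w = max_h - img.shape[1], max_w - img.shape[2]
        # F.pad pads the last two dims as (left, right, top, bottom)
        padded_imgs.append(F.pad(img, (0, pad_w, 0, pad_h), value=0))
        padded_tgts.append(F.pad(tgt, (0, pad_w, 0, pad_h), value=ignore_index))
    return torch.stack(padded_imgs, 0), torch.stack(padded_tgts, 0)

# Usage (hypothetical): pass collate_fn=pad_collate when building the val DataLoader.
```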


Chic-J commented Jul 25, 2021

I use a single GPU, and my config is as follows:

```
{
"name": "CAC",
"experim_name": "voc_cac_deeplabv3+_resnet50_1over8_datalist0",
"dataset": "voc",
"data_dir": "E:\dataset\VOCtrainval_11-May-2012",
"datalist": 0,
"n_gpu": 1,
"n_labeled_examples": 1323,
"diff_lrs": true,
"ramp_up": 0.1,
"unsupervised_w": 30,
"ignore_index": 255,
"lr_scheduler": "Poly",
"use_weak_lables":false,
"weakly_loss_w": 0.4,
"pretrained": true,
"random_seed": 42,

"model":{
    "supervised": false,
    "semi": true,
    "supervised_w": 1,

    "sup_loss": "CE",

    "layers": 50,
    "downsample": true,
    "proj_final_dim": 128,
    "out_dim": 256,
    "backbone": "deeplab_v3+",
    "pos_thresh_value": 0.75,
    "weight_unsup": 0.1,
    "epoch_start_unsup": 5,
    "selected_num": 6400,
    "temp": 0.1,
    "step_save": 1,
    "stride": 8
},


"optimizer": {
    "type": "SGD",
    "args":{
        "lr": 0.01,
        "weight_decay": 1e-4,
        "momentum": 0.9
    }
},

"train_supervised": {
    "batch_size": 8,
    "crop_size": 320,
    "shuffle": true,
    "base_size": 400,
    "scale": true,
    "augment": true,
    "flip": true,
    "rotate": false,
    "blur": false,
    "split": "train_supervised",
    "num_workers": 0
},

"train_unsupervised": {
    "batch_size": 8,
    "crop_size": 320,
    "shuffle": true,
    "base_size": 400,
    "scale": true,
    "augment": true,
    "flip": true,
    "rotate": false,
    "blur": false,
    "split": "train_unsupervised",
    "num_workers": 0,
    "iou_bound": [0.1, 1.0],
    "stride": 8
},

"val_loader": {
    "batch_size": 2,
    "val": true,
    "split": "val",
    "shuffle": false,
    "num_workers": 0
},

"trainer": {
    "epochs": 80,
    "save_dir": "saved/",
    "save_period": 1,

    "monitor": "max Mean_IoU",
    "early_stop": 100,
    
    "tensorboardX": true,
    "log_dir": "saved/",
    "log_per_iter": 20,

    "val": true,
    "val_per_epochs": 1
}

}
```

@Chic-J
Copy link
Author

Chic-J commented Jul 25, 2021

I worked around it with batch_size=1; is there another way?
Also, I tested on VOC with the pretrained model but only got results like:
```
Checkpoint <E:\Context-Aware-Consistency-master\pretrained\voc_1over8_datalist0_deeplabv3+_resnet101.pth> (epoch 63) was loaded

EVALUATION

EVAL (0) | Loss: 2.823, PixelAcc: 0.13, Mean IoU: 0.01 |: 100%|███████████████████████████████| 1449/1449 [03:20<00:00, 7.23it/s]

     val_loss       : 2.82271
     Pixel_Accuracy : 0.129
     Mean_IoU       : 0.014000000432133675
     Class_IoU      : {0: 0.126, 1: 0.0, 2: 0.0, 3: 0.002, 4: 0.0, 5: 0.003, 6: 0.005, 7: 0.016, 8: 0.026, 9: 0.009, 10: 0.004, 11: 0.009, 12: 0.0, 13: 0.005, 14: 0.0, 15: 0.059, 16: 0.0, 17: 0.0, 18: 0.0, 19: 0.026, 20: 0.0}
```

I commented out some of the code that deals with distributed training; does that affect the result? The commented-out lines are:
```python
# if distributed:
#     dist.all_reduce(pixel_labeled), dist.all_reduce(pixel_correct)

# if distributed:
#     dist.all_reduce(area_inter), dist.all_reduce(area_union)
```

They are all in metrics.py.
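For reference, here is a hedged sketch of keeping those reductions behind the `distributed` flag instead of deleting them; the function name `reduce_metrics` and the exact tensor names follow the snippet above and are placeholders, not the repo's actual metrics.py signature.

```python
# Sketch only: guard the all_reduce calls rather than commenting them out.
# Assumes the metric accumulators are CUDA tensors and that torch.distributed
# has been initialized whenever `distributed` is True.
import torch.distributed as dist

def reduce_metrics(pixel_labeled, pixel_correct, area_inter, area_union, distributed):
    if distributed and dist.is_available() and dist.is_initialized():
        dist.all_reduce(pixel_labeled)
        dist.all_reduce(pixel_correct)
        dist.all_reduce(area_inter)
        dist.all_reduce(area_union)
    return pixel_labeled, pixel_correct, area_inter, area_union
```

On a single GPU without an initialized process group these reductions would not run anyway, so commenting them out should not change the metrics; the very low Mean_IoU above may instead come from something like a mismatch between the loaded ResNet-101 checkpoint and a config that sets "layers": 50, though that is only a guess.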


Chic-J commented Jul 26, 2021

Another question: RAM runs out (OOM) while I train. Is there a memory leak, or why does RAM usage keep growing?
Thanks for your answer.
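One way to check whether host RAM actually grows without bound is to log the training process's resident set size every few iterations. This is a generic sketch, not code from the repo, and it assumes `psutil` is installed:

```python
# Sketch: log resident memory of the training process to see whether it grows
# without bound or plateaus. Requires: pip install psutil
import os
import psutil

_proc = psutil.Process(os.getpid())

def log_rss(step, every=100):
    if step % every == 0:
        rss_mb = _proc.memory_info().rss / 1024 ** 2
        print(f"step {step}: RSS = {rss_mb:.1f} MiB")

# Call log_rss(batch_idx) inside the training loop. A steadily rising RSS across
# epochs points at something being accumulated (cached tensors, growing lists),
# while a rise that plateaus is usually just allocator / dataloader warm-up.
```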


X-Lai commented Jul 28, 2021

Thanks for your interest in our work.

I think there may be something wrong on your side. It works well on my side. I just tested the model you mentioned. The log is shown as follows.

```
2021-07-28 03:39:28,212 - Trainer - INFO - config: {'name': 'CAC', 'experim_name': 'voc_cac_deeplabv3+_resnet101_1over8_datalist0', 'dataset': 'voc', 'data_dir': '/home/xinlai/VOCtrainval_11-May-2012', 'datalist': 0, 'n_gpu': 2, 'n_labeled_examples': 1323, 'diff_lrs': True, 'ramp_up': 0.1, 'unsupervised_w': 30, 'ignore_index': 255, 'lr_scheduler': 'Poly', 'use_weak_lables': False, 'weakly_loss_w': 0.4, 'pretrained': True, 'random_seed': 42, 'model': {'supervised': False, 'semi': True, 'supervised_w': 1, 'sup_loss': 'CE', 'layers': 101, 'downsample': True, 'proj_final_dim': 128, 'out_dim': 256, 'backbone': 'deeplab_v3+', 'pos_thresh_value': 0.75, 'weight_unsup': 0.1, 'epoch_start_unsup': 5, 'selected_num': 6400, 'temp': 0.1, 'step_save': 2, 'stride': 8}, 'optimizer': {'type': 'SGD', 'args': {'lr': 0.01, 'weight_decay': 0.0001, 'momentum': 0.9}}, 'train_supervised': {'crop_size': 320, 'base_size': 400, 'scale': True, 'augment': True, 'flip': True, 'rotate': False, 'blur': False, 'split': 'train_supervised', 'n_labeled_examples': 1323, 'data_dir': '/home/xinlai/VOCtrainval_11-May-2012', 'datalist': 0, 'mean': [0.485, 0.456, 0.406], 'std': [0.229, 0.224, 0.225], 'ignore_index': 255}, 'train_unsupervised': {'crop_size': 320, 'base_size': 400, 'scale': True, 'augment': True, 'flip': True, 'rotate': False, 'blur': False, 'split': 'train_unsupervised', 'iou_bound': [0.1, 1.0], 'stride': 8, 'n_labeled_examples': 1323, 'use_weak_lables': False, 'data_dir': '/home/xinlai/VOCtrainval_11-May-2012', 'datalist': 0, 'mean': [0.485, 0.456, 0.406], 'std': [0.229, 0.224, 0.225], 'ignore_index': 255}, 'val_loader': {'val': True, 'split': 'val', 'data_dir': '/home/xinlai/VOCtrainval_11-May-2012', 'datalist': 0, 'mean': [0.485, 0.456, 0.406], 'std': [0.229, 0.224, 0.225], 'ignore_index': 255}, 'trainer': {'epochs': 80, 'save_dir': 'saved/', 'save_period': 1, 'monitor': 'max Mean_IoU', 'early_stop': 100, 'tensorboardX': True, 'log_dir': 'saved/', 'log_per_iter': 20, 'val': True, 'val_per_epochs': 1}, 'dist_url': 'tcp://127.0.0.1:39798', 'n_node': 0, 'world_size': 2, 'rank': 0}
2021-07-28 03:39:33,007 - Trainer - INFO - Loading checkpoint : voc_1over8_datalist0_deeplabv3+_resnet101.pth
2021-07-28 03:39:33,570 - Trainer - INFO - Checkpoint <voc_1over8_datalist0_deeplabv3+_resnet101.pth> (epoch 63) was loaded
2021-07-28 03:39:33,571 - Trainer - INFO -
###### EVALUATION ######
2021-07-28 03:40:01,988 - Trainer - INFO -

2021-07-28 03:40:02,004 - Trainer - INFO -          val_loss       : 0.31989
2021-07-28 03:40:02,009 - Trainer - INFO -          Pixel_Accuracy : 0.939
2021-07-28 03:40:02,015 - Trainer - INFO -          Mean_IoU       : 0.7429999709129333
2021-07-28 03:40:02,025 - Trainer - INFO -          Class_IoU      : {0: 0.932, 1: 0.873, 2: 0.387, 3: 0.852, 4: 0.659, 5: 0.768, 6: 0.926, 7: 0.852, 8: 0.888, 9: 0.326, 10: 0.805, 11: 0.625, 12: 0.842, 13: 0.861, 14: 0.806, 15: 0.839, 16: 0.546, 17: 0.815, 18: 0.49, 19: 0.816, 20: 0.701}
```

I just downloaded the model from the link we provide Here, and used the config Here. You may try again.

As for the RAM, I suggest using a machine with more RAM.

X-Lai closed this as completed Jul 28, 2021

Chic-J commented Jul 29, 2021

I used a memory profiler to watch the code, and I found this:

[memory profiler screenshot]

Isn't that a memory leak?


X-Lai commented Jul 29, 2021

Sorry, I may not be getting your point. Python has its own GC mechanism, so a true memory leak is unlikely. Can you explain more about what you observed and why you think there is a 'memory leak'?


Chic-J commented Jul 29, 2021

As you can see, while the model is training the RAM usage looks like this:

[screenshot of RAM usage during training]

RAM usage keeps getting bigger and bigger. I think the RAM needed during training should stay roughly constant.
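One common cause of steadily growing memory in PyTorch training loops (offered only as a hedged guess; not confirmed for this repo's trainer) is accumulating the loss tensor itself across iterations, which keeps every batch's autograd graph alive. A minimal, self-contained illustration:

```python
# Hedged illustration (not the repo's trainer) of a typical RAM-growth pitfall:
# accumulating the loss *tensor* keeps each batch's autograd graph alive.
import torch
import torch.nn as nn

model = nn.Linear(10, 1)
optimizer = torch.optim.SGD(model.parameters(), lr=0.1)
criterion = nn.MSELoss()

total_loss = 0.0
for step in range(100):
    x, y = torch.randn(8, 10), torch.randn(8, 1)
    loss = criterion(model(x), y)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    # BAD:  total_loss += loss        # holds a graph-attached tensor every step
    # GOOD: accumulate a plain float so old graphs can be freed
    total_loss += loss.item()

print(total_loss / 100)
```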


gpgzy commented Aug 17, 2021

I use a single GPU, and found this problem too.

> I use a single GPU, and my config is as follows: (same config as posted above)

I ran the code on a single GPU and also hit this error. Have you solved it yet?


Chic-J commented Aug 17, 2021

> I use a single GPU, and found this problem too. […] Have you solved it yet?

Yes, try this:
https://blog.csdn.net/weixin_44156420/article/details/119335057

@2679622694

> Thanks for your work, but I found an error when I tried to test the code on VOC: […] RuntimeError: stack expects each tensor to be equal size, but got [3, 375, 500] at entry 0 and [3, 396, 500] at entry 1

I use a single GPU, and found this problem too.
