Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Tesla v100 for training error #59

Open
54wb opened this issue Sep 19, 2022 · 2 comments
Open

Tesla v100 for training error #59

54wb opened this issue Sep 19, 2022 · 2 comments

Comments

@54wb
Copy link

54wb commented Sep 19, 2022

你好,服务器上有两张卡。一张3090,一张v100,在测试同一个程序时,v100报错但是3090没报错,请问这个是什么问题呢?我尝试了调小batchsize不行,也试了使用其他框架在这张卡上训练是正常的

发生异常: RuntimeError (note: full exception trace is shown but execution is paused at: _run_module_as_main)
[f 0919 16:27:01.178698 16 executor.cc:665]
Execute fused operator(41/49) failed.
[JIT Source]: /home/lwb/.cache/jittor/jt1.3.5/g++7.5.0/py3.7.13/Linux-5.15.0-4xe4/IntelRXeonRGolxda/default/cu11.1.74_sm_70_86/jit/cutt_transpose__T_1__JIT_1__JIT_cuda_1__index_t_int32_hash_e6e42f8f6f3e9195_op.cc
[OP TYPE]: cutt_transpose
[Input]: float32[10,238950,],
[Output]: float32[238950,10,],
[Async Backtrace]: not found, please set env JT_SYNC=1, trace_py_var=3
[Reason]: cudaFuncSetSharedMemConfig(transposePacked<float, 1>, cudaSharedMemBankSizeFourByte ) in file /home/lwb/.cache/jittor/cutt/cutt-1.2/src/calls.h:2, function cuttKernelSetSharedMemConfig
Error message: invalid device function


Async error was detected. To locate the async backtrace and get better error report, please rerun your code with two enviroment variables set:

export JT_SYNC=1
export trace_py_var=3
File "/home/lwb/anaconda3/envs/jdet/lib/python3.7/site-packages/jittor/contrib.py", line 183, in getitem
return getitem(x, slices.where())
File "/home/lwb/work/code/jdet/python/jdet/models/boxes/assigner.py", line 148, in assign_wrt_overlaps
assigned_gt_inds[pos_inds] = argmax_overlaps[pos_inds] + 1
File "/home/lwb/work/code/jdet/python/jdet/models/boxes/assigner.py", line 108, in assign
assign_result = self.assign_wrt_overlaps(overlaps, gt_labels)
File "/home/lwb/work/code/jdet/python/jdet/models/roi_heads/oriented_rpn_head.py", line 314, in _get_targets_single
assign_result = self.assigner.assign(anchors, target_bboxes, target_bboxes_ignore, None if self.sampling else gt_labels)
File "/home/lwb/work/code/jdet/python/jdet/utils/general.py", line 53, in multi_apply
return tuple(map(list, zip(*map_results)))
File "/home/lwb/work/code/jdet/python/jdet/models/roi_heads/oriented_rpn_head.py", line 383, in get_targets
all_bbox_weights, pos_inds_list, neg_inds_list, sampling_results_list) = multi_apply(self._get_targets_single, anchor_list, valid_flag_list, targets)
File "/home/lwb/work/code/jdet/python/jdet/models/roi_heads/oriented_rpn_head.py", line 464, in loss
labels_list, label_weights_list, bbox_targets_list, bbox_weights_list,num_total_pos, num_total_neg = self.get_targets(anchor_list, valid_flag_list, targets)
File "/home/lwb/work/code/jdet/python/jdet/models/roi_heads/oriented_rpn_head.py", line 494, in execute
losses = self.loss(*outs,targets)
File "/home/lwb/anaconda3/envs/jdet/lib/python3.7/site-packages/jittor/__init__.py", line 951, in __call__
return self.execute(*args, **kw)
File "/home/lwb/work/code/jdet/python/jdet/models/networks/cascade_orcnn.py", line 47, in execute
proposals_list, rpn_losses = self.rpn(features,targets)
File "/home/lwb/anaconda3/envs/jdet/lib/python3.7/site-packages/jittor/__init__.py", line 951, in __call__
return self.execute(*args, **kw)
File "/home/lwb/work/code/jdet/python/jdet/runner/runner.py", line 126, in train
losses = self.model(images,targets)
File "/home/lwb/work/code/jdet/python/jdet/runner/runner.py", line 84, in run
self.train()
File "/home/lwb/work/code/jdet/tools/run_net.py", line 47, in main
runner.run()
File "/home/lwb/work/code/jdet/tools/run_net.py", line 56, in <module>
main()
File "/home/lwb/anaconda3/envs/jdet/lib/python3.7/runpy.py", line 85, in _run_code
exec(code, run_globals)
File "/home/lwb/anaconda3/envs/jdet/lib/python3.7/runpy.py", line 96, in _run_module_code
mod_name, mod_spec, pkg_name, script_name)
File "/home/lwb/anaconda3/envs/jdet/lib/python3.7/runpy.py", line 263, in run_path
pkg_name=pkg_name, script_name=fname)
File "/home/lwb/anaconda3/envs/jdet/lib/python3.7/runpy.py", line 85, in _run_code
exec(code, run_globals)
File "/home/lwb/anaconda3/envs/jdet/lib/python3.7/runpy.py", line 193, in _run_module_as_main (Current frame)
"__main__", mod_spec)

这个是config文件,应该不是代码的问题,因为3090是可以正常跑的

# model settings

# Detector definition: a 'CascadeORCNN' model assembled from a backbone, a
# neck, an oriented RPN head, a three-stage cascade RoI head, and per-stage
# training settings. All values are plain literals consumed by the framework.
model = dict(
    type='CascadeORCNN',
    backbone=dict(
        type='Resnet50',
        frozen_stages=1,
        return_stages=["layer1", "layer2", "layer3", "layer4"],
        pretrained=True,
    ),
    neck=dict(
        type='FPN',
        # One entry per backbone stage listed in `return_stages` above.
        in_channels=[256, 512, 1024, 2048],
        out_channels=256,
        num_outs=5,
    ),
    rpn=dict(
        type="OrientedRPNHead",
        in_channels=256,
        num_classes=1,
        min_bbox_size=0,
        nms_thresh=0.8,
        nms_pre=2000,
        nms_post=2000,
        feat_channels=256,
        bbox_type='obb',
        reg_dim=6,
        background_label=0,
        reg_decoded_bbox=False,
        pos_weight=-1,
        anchor_generator=dict(
            type='AnchorGenerator',
            scales=[8],
            ratios=[0.5, 1.0, 2.0],
            strides=[4, 8, 16, 32, 64],
        ),
        bbox_coder=dict(
            type='MidpointOffsetCoder',
            # Six means/stds, matching reg_dim=6 above.
            target_means=[.0, .0, .0, .0, .0, .0],
            target_stds=[1.0, 1.0, 1.0, 1.0, 0.5, 0.5],
        ),
        loss_cls=dict(
            type='CrossEntropyLossForRcnn',
            use_sigmoid=True,
            loss_weight=1.0,
        ),
        loss_bbox=dict(
            type='SmoothL1Loss',
            beta=1.0 / 9.0,
            loss_weight=1.0,
        ),
        assigner=dict(
            type='MaxIoUAssigner',
            pos_iou_thr=0.7,
            neg_iou_thr=0.3,
            min_pos_iou=0.3,
            ignore_iof_thr=-1,
            match_low_quality=True,
            assigned_labels_filled=-1,
        ),
        sampler=dict(
            type='RandomSampler',
            num=256,
            pos_fraction=0.5,
            neg_pos_ub=-1,
            add_gt_as_proposals=False,
        ),
    ),
    roi_head=dict(
        type='OBBCascadeRoIHead',
        num_stages=3,
        # One loss weight per cascade stage.
        stage_loss_weights=[1, 0.5, 0.25],
        bbox_roi_extractor=dict(
            type='OrientedSingleRoIExtractor',
            roi_layer=dict(
                type='ROIAlignRotated_v1',
                output_size=7,
                sampling_ratio=2,
            ),
            out_channels=256,
            extend_factor=(1.4, 1.2),
            featmap_strides=[4, 8, 16, 32],
        ),
        # Three bbox heads, one per stage; their settings are identical.
        bbox_head=[
            dict(
                type='SharedFCBBoxHeadRbbox',
                start_bbox_type='obb',
                end_bbox_type='obb',
                in_channels=256,
                fc_out_channels=1024,
                roi_feat_size=7,
                num_classes=11,
                bbox_coder=dict(
                    type='OrientedDeltaXYWHTCoder',
                    target_means=[0., 0., 0., 0., 0.],
                    target_stds=[0.1, 0.1, 0.2, 0.2, 0.1],
                ),
                reg_class_agnostic=True,
                loss_cls=dict(type='CrossEntropyLoss'),
                loss_bbox=dict(
                    type='SmoothL1Loss',
                    beta=1.0,
                    loss_weight=1.0,
                ),
            ),
            dict(
                type='SharedFCBBoxHeadRbbox',
                start_bbox_type='obb',
                end_bbox_type='obb',
                in_channels=256,
                fc_out_channels=1024,
                roi_feat_size=7,
                num_classes=11,
                bbox_coder=dict(
                    type='OrientedDeltaXYWHTCoder',
                    target_means=[0., 0., 0., 0., 0.],
                    target_stds=[0.1, 0.1, 0.2, 0.2, 0.1],
                ),
                reg_class_agnostic=True,
                loss_cls=dict(type='CrossEntropyLoss'),
                loss_bbox=dict(
                    type='SmoothL1Loss',
                    beta=1.0,
                    loss_weight=1.0,
                ),
            ),
            dict(
                type='SharedFCBBoxHeadRbbox',
                start_bbox_type='obb',
                end_bbox_type='obb',
                in_channels=256,
                fc_out_channels=1024,
                roi_feat_size=7,
                num_classes=11,
                bbox_coder=dict(
                    type='OrientedDeltaXYWHTCoder',
                    target_means=[0., 0., 0., 0., 0.],
                    target_stds=[0.1, 0.1, 0.2, 0.2, 0.1],
                ),
                reg_class_agnostic=True,
                loss_cls=dict(type='CrossEntropyLoss'),
                loss_bbox=dict(
                    type='SmoothL1Loss',
                    beta=1.0,
                    loss_weight=1.0,
                ),
            ),
        ],
    ),
    train_cfg=dict(
        # One entry per cascade stage; the three IoU thresholds are equal
        # within a stage and step up 0.5 -> 0.6 -> 0.7 across stages.
        rcnn=[
            dict(
                assigner=dict(
                    type='MaxIoUAssigner',
                    pos_iou_thr=0.5,
                    neg_iou_thr=0.5,
                    min_pos_iou=0.5,
                    match_low_quality=False,
                    ignore_iof_thr=-1,
                    iou_calculator=dict(type='BboxOverlaps2D_rotated_v1'),
                ),
                sampler=dict(
                    type='RandomSamplerRotated',
                    num=512,
                    pos_fraction=0.25,
                    neg_pos_ub=-1,
                    add_gt_as_proposals=True,
                ),
                pos_weight=-1,
                debug=False,
            ),
            dict(
                assigner=dict(
                    type='MaxIoUAssigner',
                    pos_iou_thr=0.6,
                    neg_iou_thr=0.6,
                    min_pos_iou=0.6,
                    match_low_quality=False,
                    ignore_iof_thr=-1,
                    iou_calculator=dict(type='BboxOverlaps2D_rotated_v1'),
                ),
                sampler=dict(
                    type='RandomSamplerRotated',
                    num=512,
                    pos_fraction=0.25,
                    neg_pos_ub=-1,
                    add_gt_as_proposals=True,
                ),
                pos_weight=-1,
                debug=False,
            ),
            dict(
                assigner=dict(
                    type='MaxIoUAssigner',
                    pos_iou_thr=0.7,
                    neg_iou_thr=0.7,
                    min_pos_iou=0.7,
                    match_low_quality=False,
                    ignore_iof_thr=-1,
                    iou_calculator=dict(type='BboxOverlaps2D_rotated_v1'),
                ),
                sampler=dict(
                    type='RandomSamplerRotated',
                    num=512,
                    pos_fraction=0.25,
                    neg_pos_ub=-1,
                    add_gt_as_proposals=True,
                ),
                pos_weight=-1,
                debug=False,
            ),
        ],
    ),
)

# Data pipelines for the three splits. Each split lists its source directory,
# an ordered transform chain, and loader settings.
dataset = dict(
    train=dict(
        type="FAIR1M_1_5_Dataset",
        dataset_dir='/home/lwb/work/code/jdet/data/Fair1m1_5/split_aug_ss/train_1024_200_1.0',
        transforms=[
            dict(type="RotatedResize", min_size=1024, max_size=1024),
            dict(type='RotatedRandomFlip', direction="horizontal", prob=0.5),
            dict(type='RotatedRandomFlip', direction="vertical", prob=0.5),
            # Disabled augmentation, kept for reference:
            # dict(
            #     type="RandomRotateAug",
            #     random_rotate_on=True,
            # ),
            dict(type="Pad", size_divisor=32),
            dict(
                type="Normalize",
                mean=[123.675, 116.28, 103.53],
                std=[58.395, 57.12, 57.375],
                to_bgr=False,
            ),
        ],
        batch_size=1,
        num_workers=4,
        shuffle=True,
        filter_empty_gt=False,
        balance_category=False,
    ),
    val=dict(
        type="FAIR1M_1_5_Dataset",
        # NOTE(review): same directory as the train split — confirm intended.
        dataset_dir='/home/lwb/work/code/jdet/data/Fair1m1_5/split_aug_ss/train_1024_200_1.0',
        transforms=[
            dict(type="RotatedResize", min_size=1024, max_size=1024),
            dict(type="Pad", size_divisor=32),
            dict(
                type="Normalize",
                mean=[123.675, 116.28, 103.53],
                std=[58.395, 57.12, 57.375],
                to_bgr=False,
            ),
        ],
        batch_size=2,
        num_workers=4,
        shuffle=False,
    ),
    test=dict(
        type="ImageDataset",
        images_dir='/home/lwb/work/code/jdet/data/Fair1m1_5/split_aug_ss/test_1024_200_1.0/images',
        transforms=[
            dict(type="RotatedResize", min_size=1024, max_size=1024),
            dict(type="Pad", size_divisor=32),
            dict(
                type="Normalize",
                mean=[123.675, 116.28, 103.53],
                std=[58.395, 57.12, 57.375],
                to_bgr=False,
            ),
        ],
        dataset_type="FAIR1M_1_5",
        num_workers=4,
        batch_size=1,
    ),
)

# Optimizer settings, including the gradient-clipping sub-config.
optimizer = dict(
    type='SGD',
    lr=0.025,
    momentum=0.9,
    weight_decay=0.0001,
    grad_clip=dict(
        max_norm=35,
        norm_type=2,
    ),
)

# Learning-rate schedule: a 'StepLR' entry with linear warmup settings and
# two milestones.
scheduler = dict(
    type='StepLR',
    warmup='linear',
    warmup_iters=500,
    warmup_ratio=0.001,
    milestones=[7, 10],
)

# Logger selection.
logger = dict(type="RunLogger")

# when we use the trained model from cshuan, image is rgb

# Run-level settings. The unit of each interval (epochs vs. iterations) is
# decided by the runner that reads this config — TODO confirm against it.
max_epoch = 12
checkpoint_interval = 1
log_interval = 50
eval_interval = 100

@54wb
Copy link
Author

54wb commented Sep 19, 2022

在jittor的论坛上也看到了这个问题,但是没人解决

@chenjw1009
Copy link

楼主怎么解决的啊

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Labels
None yet
Projects
None yet
Development

No branches or pull requests

2 participants