Tesla v100 for training error #59

54wb · 2022-09-19T08:34:41Z

你好，服务器上有两张卡：一张 3090，一张 V100。在测试同一个程序时，V100 报错但是 3090 没报错，请问这是什么问题呢？我尝试了调小 batch size 也不行；另外使用其他框架在这张卡上训练是正常的。

发生异常: RuntimeError (note: full exception trace is shown but execution is paused at: _run_module_as_main)
[f 0919 16:27:01.178698 16 executor.cc:665]
Execute fused operator(41/49) failed.
[JIT Source]: /home/lwb/.cache/jittor/jt1.3.5/g++7.5.0/py3.7.13/Linux-5.15.0-4xe4/IntelRXeonRGolxda/default/cu11.1.74_sm_70_86/jit/cutt_transpose__T_1__JIT_1__JIT_cuda_1__index_t_int32_hash_e6e42f8f6f3e9195_op.cc
[OP TYPE]: cutt_transpose
[Input]: float32[10,238950,],
[Output]: float32[238950,10,],
[Async Backtrace]: not found, please set env JT_SYNC=1, trace_py_var=3
[Reason]: cudaFuncSetSharedMemConfig(transposePacked<float, 1>, cudaSharedMemBankSizeFourByte ) in file /home/lwb/.cache/jittor/cutt/cutt-1.2/src/calls.h:2, function cuttKernelSetSharedMemConfig
Error message: invalid device function

Async error was detected. To locate the async backtrace and get better error report, please rerun your code with two enviroment variables set:

export JT_SYNC=1
export trace_py_var=3
File "/home/lwb/anaconda3/envs/jdet/lib/python3.7/site-packages/jittor/contrib.py", line 183, in getitem
return getitem(x, slices.where())
File "/home/lwb/work/code/jdet/python/jdet/models/boxes/assigner.py", line 148, in assign_wrt_overlaps
assigned_gt_inds[pos_inds] = argmax_overlaps[pos_inds] + 1
File "/home/lwb/work/code/jdet/python/jdet/models/boxes/assigner.py", line 108, in assign
assign_result = self.assign_wrt_overlaps(overlaps, gt_labels)
File "/home/lwb/work/code/jdet/python/jdet/models/roi_heads/oriented_rpn_head.py", line 314, in _get_targets_single
assign_result = self.assigner.assign(anchors, target_bboxes, target_bboxes_ignore, None if self.sampling else gt_labels)
File "/home/lwb/work/code/jdet/python/jdet/utils/general.py", line 53, in multi_apply
return tuple(map(list, zip(*map_results)))
File "/home/lwb/work/code/jdet/python/jdet/models/roi_heads/oriented_rpn_head.py", line 383, in get_targets
all_bbox_weights, pos_inds_list, neg_inds_list, sampling_results_list) = multi_apply(self._get_targets_single, anchor_list, valid_flag_list, targets)
File "/home/lwb/work/code/jdet/python/jdet/models/roi_heads/oriented_rpn_head.py", line 464, in loss
labels_list, label_weights_list, bbox_targets_list, bbox_weights_list,num_total_pos, num_total_neg = self.get_targets(anchor_list, valid_flag_list, targets)
File "/home/lwb/work/code/jdet/python/jdet/models/roi_heads/oriented_rpn_head.py", line 494, in execute
losses = self.loss(*outs,targets)
File "/home/lwb/anaconda3/envs/jdet/lib/python3.7/site-packages/jittor/__init__.py", line 951, in __call__
return self.execute(*args, **kw)
File "/home/lwb/work/code/jdet/python/jdet/models/networks/cascade_orcnn.py", line 47, in execute
proposals_list, rpn_losses = self.rpn(features,targets)
File "/home/lwb/anaconda3/envs/jdet/lib/python3.7/site-packages/jittor/__init__.py", line 951, in __call__
return self.execute(*args, **kw)
File "/home/lwb/work/code/jdet/python/jdet/runner/runner.py", line 126, in train
losses = self.model(images,targets)
File "/home/lwb/work/code/jdet/python/jdet/runner/runner.py", line 84, in run
self.train()
File "/home/lwb/work/code/jdet/tools/run_net.py", line 47, in main
runner.run()
File "/home/lwb/work/code/jdet/tools/run_net.py", line 56, in
main()
File "/home/lwb/anaconda3/envs/jdet/lib/python3.7/runpy.py", line 85, in _run_code
exec(code, run_globals)
File "/home/lwb/anaconda3/envs/jdet/lib/python3.7/runpy.py", line 96, in _run_module_code
mod_name, mod_spec, pkg_name, script_name)
File "/home/lwb/anaconda3/envs/jdet/lib/python3.7/runpy.py", line 263, in run_path
pkg_name=pkg_name, script_name=fname)
File "/home/lwb/anaconda3/envs/jdet/lib/python3.7/runpy.py", line 85, in _run_code
exec(code, run_globals)
File "/home/lwb/anaconda3/envs/jdet/lib/python3.7/runpy.py", line 193, in _run_module_as_main (Current frame)
"__main__", mod_spec)

这个是config文件，应该不是代码的问题，因为3090是可以正常跑的

# model settings

model = dict(
type='CascadeORCNN',
backbone=dict(
type='Resnet50',
frozen_stages=1,
return_stages=["layer1","layer2","layer3","layer4"],
pretrained= True),
neck=dict(
type='FPN',
in_channels=[256, 512, 1024, 2048],
out_channels=256,
num_outs=5),
rpn = dict(
type = "OrientedRPNHead",
in_channels=256,
num_classes=1,
min_bbox_size=0,
nms_thresh=0.8,
nms_pre=2000,
nms_post=2000,
feat_channels=256,
bbox_type='obb',
reg_dim=6,
background_label=0,
reg_decoded_bbox=False,
pos_weight=-1,
anchor_generator=dict(
type='AnchorGenerator',
scales=[8],
ratios=[0.5, 1.0, 2.0],
strides=[4, 8, 16, 32, 64]),
bbox_coder=dict(
type='MidpointOffsetCoder',
target_means=[.0, .0, .0, .0, .0, .0],
target_stds=[1.0, 1.0, 1.0, 1.0, 0.5, 0.5]),
loss_cls=dict(type='CrossEntropyLossForRcnn', use_sigmoid=True, loss_weight=1.0),
loss_bbox=dict(type='SmoothL1Loss', beta=1.0 / 9.0, loss_weight=1.0),
assigner=dict(
type='MaxIoUAssigner',
pos_iou_thr=0.7,
neg_iou_thr=0.3,
min_pos_iou=0.3,
ignore_iof_thr=-1,
match_low_quality=True,
assigned_labels_filled=-1,
),
sampler=dict(
type='RandomSampler',
num=256,
pos_fraction=0.5,
neg_pos_ub=-1,
add_gt_as_proposals=False)
),
roi_head=dict(
type='OBBCascadeRoIHead',
num_stages=3,
stage_loss_weights=[1,0.5,0.25],
bbox_roi_extractor=dict(
type='OrientedSingleRoIExtractor',
roi_layer=dict(type='ROIAlignRotated_v1', output_size=7, sampling_ratio=2),
out_channels=256,
extend_factor=(1.4, 1.2),
featmap_strides=[4, 8, 16, 32]),
bbox_head=[
dict(
type='SharedFCBBoxHeadRbbox',
start_bbox_type='obb',
end_bbox_type='obb',
in_channels=256,
fc_out_channels=1024,
roi_feat_size=7,
num_classes=11,
bbox_coder=dict(
type='OrientedDeltaXYWHTCoder',
target_means=[0., 0., 0., 0., 0.],
target_stds=[0.1, 0.1, 0.2, 0.2, 0.1]),
reg_class_agnostic=True,
loss_cls=dict(
type='CrossEntropyLoss'),
loss_bbox=dict(type='SmoothL1Loss', beta=1.0,
loss_weight=1.0)),
dict(
type='SharedFCBBoxHeadRbbox',
start_bbox_type='obb',
end_bbox_type='obb',
in_channels=256,
fc_out_channels=1024,
roi_feat_size=7,
num_classes=11,
bbox_coder=dict(
type='OrientedDeltaXYWHTCoder',
target_means=[0., 0., 0., 0., 0.],
target_stds=[0.1, 0.1, 0.2, 0.2, 0.1]),
reg_class_agnostic=True,
loss_cls=dict(
type='CrossEntropyLoss'),
loss_bbox=dict(type='SmoothL1Loss', beta=1.0,
loss_weight=1.0)),
dict(
type='SharedFCBBoxHeadRbbox',
start_bbox_type='obb',
end_bbox_type='obb',
in_channels=256,
fc_out_channels=1024,
roi_feat_size=7,
num_classes=11,
bbox_coder=dict(
type='OrientedDeltaXYWHTCoder',
target_means=[0., 0., 0., 0., 0.],
target_stds=[0.1, 0.1, 0.2, 0.2, 0.1]),
reg_class_agnostic=True,
loss_cls=dict(
type='CrossEntropyLoss'),
loss_bbox=dict(type='SmoothL1Loss', beta=1.0,
loss_weight=1.0))]
),
train_cfg = dict(
rcnn=[
dict(
assigner=dict(
type='MaxIoUAssigner',
pos_iou_thr=0.5,
neg_iou_thr=0.5,
min_pos_iou=0.5,
match_low_quality=False,
ignore_iof_thr=-1,
iou_calculator=dict(type='BboxOverlaps2D_rotated_v1')),
sampler=dict(
type='RandomSamplerRotated',
num=512,
pos_fraction=0.25,
neg_pos_ub=-1,
add_gt_as_proposals=True),
pos_weight=-1,
debug=False),
dict(
assigner=dict(
type='MaxIoUAssigner',
pos_iou_thr=0.6,
neg_iou_thr=0.6,
min_pos_iou=0.6,
match_low_quality=False,
ignore_iof_thr=-1,
iou_calculator=dict(type='BboxOverlaps2D_rotated_v1')),
sampler=dict(
type='RandomSamplerRotated',
num=512,
pos_fraction=0.25,
neg_pos_ub=-1,
add_gt_as_proposals=True),
pos_weight=-1,
debug=False),
dict(
assigner=dict(
type='MaxIoUAssigner',
pos_iou_thr=0.7,
neg_iou_thr=0.7,
min_pos_iou=0.7,
match_low_quality=False,
ignore_iof_thr=-1,
iou_calculator=dict(type='BboxOverlaps2D_rotated_v1')),
sampler=dict(
type='RandomSamplerRotated',
num=512,
pos_fraction=0.25,
neg_pos_ub=-1,
add_gt_as_proposals=True),
pos_weight=-1,
debug=False)
])
)

dataset = dict(
train=dict(
type="FAIR1M_1_5_Dataset",
dataset_dir='/home/lwb/work/code/jdet/data/Fair1m1_5/split_aug_ss/train_1024_200_1.0',
transforms=[
dict(
type="RotatedResize",
min_size=1024,
max_size=1024
),
dict(
type='RotatedRandomFlip',
direction="horizontal",
prob=0.5),
dict(
type='RotatedRandomFlip',
direction="vertical",
prob=0.5),
# dict(
# type="RandomRotateAug",
# random_rotate_on=True,
# ),
dict(
type = "Pad",
size_divisor=32),
dict(
type = "Normalize",
mean = [123.675, 116.28, 103.53],
std = [58.395, 57.12, 57.375],
to_bgr=False,)

    ],
    batch_size=1,
    num_workers=4,
    shuffle=True,
    filter_empty_gt=False,
    balance_category=False
),
val=dict(
    type="FAIR1M_1_5_Dataset",
    dataset_dir='/home/lwb/work/code/jdet/data/Fair1m1_5/split_aug_ss/train_1024_200_1.0',
    transforms=[
        dict(
            type="RotatedResize",
            min_size=1024,
            max_size=1024
        ),
        dict(
            type = "Pad",
            size_divisor=32),
        dict(
            type = "Normalize",
            mean =  [123.675, 116.28, 103.53],
            std = [58.395, 57.12, 57.375],
            to_bgr=False,),
    ],
    batch_size=2,
    num_workers=4,
    shuffle=False
),
test=dict(
    type="ImageDataset",
    images_dir='/home/lwb/work/code/jdet/data/Fair1m1_5/split_aug_ss/test_1024_200_1.0/images',
    transforms=[
        dict(
            type="RotatedResize",
            min_size=1024,
            max_size=1024
        ),
        dict(   
            type = "Pad",
            size_divisor=32),
        dict(
            type = "Normalize",
            mean =  [123.675, 116.28, 103.53],
            std = [58.395, 57.12, 57.375],
            to_bgr=False,),
    ],
    dataset_type="FAIR1M_1_5",
    num_workers=4,
    batch_size=1,
)

)

optimizer = dict(type='SGD', lr=0.025, momentum=0.9, weight_decay=0.0001, grad_clip=dict(max_norm=35, norm_type=2))

scheduler = dict(
type='StepLR',
warmup='linear',
warmup_iters=500,
warmup_ratio=0.001,
milestones=[7, 10])

logger = dict(
type="RunLogger")

# when we use the trained model from cshuan, image is rgb

max_epoch = 12
eval_interval = 100
checkpoint_interval = 1
log_interval = 50

The text was updated successfully, but these errors were encountered:

54wb · 2022-09-19T08:36:02Z

在 Jittor 的论坛上也看到了这个问题，但是没人解决

chenjw1009 · 2023-03-15T05:22:18Z

楼主怎么解决的啊

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Tesla v100 for training error #59

Tesla v100 for training error #59

54wb commented Sep 19, 2022

54wb commented Sep 19, 2022

chenjw1009 commented Mar 15, 2023

Tesla v100 for training error #59

Tesla v100 for training error #59

Comments

54wb commented Sep 19, 2022

model settings

when we use the trained model from cshuan, image is rgb

54wb commented Sep 19, 2022

chenjw1009 commented Mar 15, 2023