## YOLO V2

In [1]:
import torch
import cv2
import numpy as np
import json
import glob
import os

from darknet_v2 import Darknet
from util_model_img import load_classes, letterbox_image, image_to_tensor, predict_transform_v2, write_results

dataset = "coco"
stride = 32
confidence = 0.5
nms_thresh = 0.4
CUDA = torch.cuda.is_available()

inp_dim = 416
num_classes = 80
classes = load_classes('data/coco.names')
weightsfile = 'model_weights/yolo.weights'
cfgfile = "cfg/yolo.cfg"

model = Darknet(cfgfile)
model.load_weights(weightsfile)
if CUDA:
    model = model.cuda()


In [None]:
import time
from pycocotools.coco import COCO
from pycocotools.cocoeval import COCOeval


def detection(val_anno, img_dir, results_json, class_id, fps=False):

    '''
    val_anno: annotations of validation set images
    img_dir: dir of validation set images
    results_json: detection result
    class_id: HA target class - person (0) and AA target class - stop sign (11)
    '''

    file2id = dict()
    with open(val_anno) as f:
        dic = json.load(f)
        images = dic['images']
        for i in range(len(images)):
            image = images[i]
            file2id[image['file_name']] = image['id']

    #imgs = glob.glob("%s/*.png" % img_dir) + glob.glob("%s/*.jpg" % img_dir)
    imgs = [f for f in os.listdir(img_dir) if f.endswith('.jpg') or f.endswith('.png')]

    results = []

    if fps:
        start = time.time()

    for img in imgs:

        image_id = file2id[img]

        img_cv = cv2.imread(os.path.join(img_dir, img))
        img_h, img_w, _ = img_cv.shape
        img_cv = letterbox_image(img_cv, [inp_dim, inp_dim])
        im_dim = torch.FloatTensor((img_w, img_h)).repeat(1,2)
        img_ts = image_to_tensor(img_cv)

        if CUDA:
            img_ts = img_ts.cuda()
            im_dim = im_dim.cuda()
        
        prediction = model(img_ts).data
        prediction = predict_transform_v2(prediction, inp_dim, model.anchors, num_classes, stride, confidence, CUDA)
        output = write_results(prediction, confidence, num_classes, nms=True, nms_conf=nms_thresh)

        if type(output) != int:
            im_dim = im_dim.repeat(output.size(0), 1)
            scaling_factor = torch.min(inp_dim/im_dim,1)[0].view(-1,1).cuda()
            output[:,[1,3]] -= (inp_dim - scaling_factor*im_dim[:,0].view(-1,1))/2
            output[:,[2,4]] -= (inp_dim - scaling_factor*im_dim[:,1].view(-1,1))/2
            output[:,1:5] /= scaling_factor
            for i in range(output.shape[0]):
                output[i, [1,3]] = torch.clamp(output[i, [1,3]], 0.0, im_dim[i,0])
                output[i, [2,4]] = torch.clamp(output[i, [2,4]], 0.0, im_dim[i,1])
                conf = output[i, 5].float().item()
                cls = output[i, -1].int().item()
                x = output[i, 1].int().item()
                y = output[i, 2].int().item()
                w = (output[i, 3] - output[i, 1]).float().item()
                h = (output[i, 4] - output[i, 2]).float().item()

                #print(image_id, cls, [xc, yc, w, h], conf)

                results.append({'image_id': image_id,
                                'category_id': cls,
                                'bbox': [x, y, w, h],
                                'score': conf})
        
    if fps:
        end = time.time()
        total_time = end-start
        print("%d imgs, %f seconds, average: %f fps" % (len(imgs), total_time, len(imgs)/total_time))

    with open(results_json, 'w') as f:
        f.write(json.dumps(results, indent=4))


    cocoGt = COCO(val_anno)
    cocoDt = cocoGt.loadRes(results_json)
    cocoEval = COCOeval(cocoGt, cocoDt, 'bbox')
    cocoEval.params.catIds = [class_id]
    cocoEval.evaluate()
    cocoEval.accumulate()
    cocoEval.summarize()



### INRIA 数据集


In [3]:
detection(
    val_anno="Dataset/HA/test_annotations.json",
    img_dir="Dataset/HA/INRIA/pos/",
    results_json="test_map/yolov2_inria_test.json",
    class_id=0,
    fps=True
)

288 imgs, 10.149807 seconds, average: 28.374923 fps
loading annotations into memory...
Done (t=0.00s)
creating index...
index created!
Loading and preparing results...
DONE (t=0.00s)
creating index...
index created!
Running per image evaluation...
Evaluate annotation type *bbox*
DONE (t=0.10s).
Accumulating evaluation results...
DONE (t=0.02s).
 Average Precision  (AP) @[ IoU=0.50:0.95 | area=   all | maxDets=100 ] = 0.454
 Average Precision  (AP) @[ IoU=0.50      | area=   all | maxDets=100 ] = 0.847
 Average Precision  (AP) @[ IoU=0.75      | area=   all | maxDets=100 ] = 0.466
 Average Precision  (AP) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = -1.000
 Average Precision  (AP) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.217
 Average Precision  (AP) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.498
 Average Recall     (AR) @[ IoU=0.50:0.95 | area=   all | maxDets=  1 ] = 0.312
 Average Recall     (AR) @[ IoU=0.50:0.95 | area=   all | maxDets= 10 ] = 0.540
 Average Rec


#### Adv Patch

In [4]:

detection(
    val_anno="Dataset/HA/test_adv_annotations.json",
    img_dir="Dataset/HA/INRIA/patch_v2/",
    results_json="test_map/yolov2_inria_patchv2.json",
    class_id=0,
    fps=True
)

288 imgs, 6.683018 seconds, average: 43.094304 fps
loading annotations into memory...
Done (t=0.00s)
creating index...
index created!
Loading and preparing results...
DONE (t=0.00s)
creating index...
index created!
Running per image evaluation...
Evaluate annotation type *bbox*
DONE (t=0.08s).
Accumulating evaluation results...
DONE (t=0.02s).
 Average Precision  (AP) @[ IoU=0.50:0.95 | area=   all | maxDets=100 ] = 0.089
 Average Precision  (AP) @[ IoU=0.50      | area=   all | maxDets=100 ] = 0.240
 Average Precision  (AP) @[ IoU=0.75      | area=   all | maxDets=100 ] = 0.047
 Average Precision  (AP) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.004
 Average Precision  (AP) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.097
 Average Precision  (AP) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.110
 Average Recall     (AR) @[ IoU=0.50:0.95 | area=   all | maxDets=  1 ] = 0.093
 Average Recall     (AR) @[ IoU=0.50:0.95 | area=   all | maxDets= 10 ] = 0.161
 Average Recal

#### Adv Cloak

In [6]:

detection(
    val_anno="Dataset/HA/test_adv_annotations.json",
    img_dir="Dataset/HA/INRIA/cloak_v2/",
    results_json="test_map/yolov2_inria_cloakv2.json",
    class_id=0,
    fps=True
)

288 imgs, 6.374250 seconds, average: 45.181783 fps
loading annotations into memory...
Done (t=0.00s)
creating index...
index created!
Loading and preparing results...
DONE (t=0.00s)
creating index...
index created!
Running per image evaluation...
Evaluate annotation type *bbox*
DONE (t=0.07s).
Accumulating evaluation results...
DONE (t=0.02s).
 Average Precision  (AP) @[ IoU=0.50:0.95 | area=   all | maxDets=100 ] = 0.136
 Average Precision  (AP) @[ IoU=0.50      | area=   all | maxDets=100 ] = 0.330
 Average Precision  (AP) @[ IoU=0.75      | area=   all | maxDets=100 ] = 0.085
 Average Precision  (AP) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.006
 Average Precision  (AP) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.125
 Average Precision  (AP) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.185
 Average Recall     (AR) @[ IoU=0.50:0.95 | area=   all | maxDets=  1 ] = 0.140
 Average Recall     (AR) @[ IoU=0.50:0.95 | area=   all | maxDets= 10 ] = 0.215
 Average Recal

#### Adv Tshirt

In [7]:

detection(
    val_anno="Dataset/HA/test_adv_annotations.json",
    img_dir="Dataset/HA/INRIA/tshirt_v2/",
    results_json="test_map/yolov2_inria_tshirtv2.json",
    class_id=0,
    fps=True
)

288 imgs, 6.500369 seconds, average: 44.305178 fps
loading annotations into memory...
Done (t=0.00s)
creating index...
index created!
Loading and preparing results...
DONE (t=0.00s)
creating index...
index created!
Running per image evaluation...
Evaluate annotation type *bbox*
DONE (t=0.07s).
Accumulating evaluation results...
DONE (t=0.02s).
 Average Precision  (AP) @[ IoU=0.50:0.95 | area=   all | maxDets=100 ] = 0.173
 Average Precision  (AP) @[ IoU=0.50      | area=   all | maxDets=100 ] = 0.399
 Average Precision  (AP) @[ IoU=0.75      | area=   all | maxDets=100 ] = 0.109
 Average Precision  (AP) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.000
 Average Precision  (AP) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.125
 Average Precision  (AP) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.262
 Average Recall     (AR) @[ IoU=0.50:0.95 | area=   all | maxDets=  1 ] = 0.168
 Average Recall     (AR) @[ IoU=0.50:0.95 | area=   all | maxDets= 10 ] = 0.261
 Average Recal

#### Natural Patch

In [8]:

detection(
    val_anno="Dataset/HA/test_adv_annotations.json",
    img_dir="Dataset/HA/INRIA/natural_v2/",
    results_json="test_map/yolov2_inria_naturalv2.json",
    class_id=0,
    fps=True
)

288 imgs, 6.785481 seconds, average: 42.443563 fps
loading annotations into memory...
Done (t=0.00s)
creating index...
index created!
Loading and preparing results...
DONE (t=0.00s)
creating index...
index created!
Running per image evaluation...
Evaluate annotation type *bbox*
DONE (t=0.08s).
Accumulating evaluation results...
DONE (t=0.02s).
 Average Precision  (AP) @[ IoU=0.50:0.95 | area=   all | maxDets=100 ] = 0.216
 Average Precision  (AP) @[ IoU=0.50      | area=   all | maxDets=100 ] = 0.476
 Average Precision  (AP) @[ IoU=0.75      | area=   all | maxDets=100 ] = 0.153
 Average Precision  (AP) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.003
 Average Precision  (AP) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.199
 Average Precision  (AP) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.261
 Average Recall     (AR) @[ IoU=0.50:0.95 | area=   all | maxDets=  1 ] = 0.173
 Average Recall     (AR) @[ IoU=0.50:0.95 | area=   all | maxDets= 10 ] = 0.300
 Average Recal

### AA

In [3]:
detection(
    val_anno="Dataset/AA/stop_test_annotations.json",
    img_dir="Dataset/AA/imgs_s/",
    results_json="test_map/yolov2_aa_stopsign.json",
    class_id=11,
    fps=True
)

1000 imgs, 26.070091 seconds, average: 38.358132 fps
loading annotations into memory...
Done (t=0.00s)
creating index...
index created!
Loading and preparing results...
DONE (t=0.01s)
creating index...
index created!
Running per image evaluation...
Evaluate annotation type *bbox*
DONE (t=0.38s).
Accumulating evaluation results...
DONE (t=0.05s).
 Average Precision  (AP) @[ IoU=0.50:0.95 | area=   all | maxDets=100 ] = 0.631
 Average Precision  (AP) @[ IoU=0.50      | area=   all | maxDets=100 ] = 0.900
 Average Precision  (AP) @[ IoU=0.75      | area=   all | maxDets=100 ] = 0.805
 Average Precision  (AP) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = -1.000
 Average Precision  (AP) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.620
 Average Precision  (AP) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.637
 Average Recall     (AR) @[ IoU=0.50:0.95 | area=   all | maxDets=  1 ] = 0.668
 Average Recall     (AR) @[ IoU=0.50:0.95 | area=   all | maxDets= 10 ] = 0.668
 Average Re

In [4]:
detection(
    val_anno="Dataset/AA/person_test_annotations.json",
    img_dir="Dataset/AA/imgs_p/",
    results_json="test_map/yolov2_aa_person.json",
    class_id=0,
    fps=True
)

1000 imgs, 24.380700 seconds, average: 41.016050 fps
loading annotations into memory...
Done (t=0.00s)
creating index...
index created!
Loading and preparing results...
DONE (t=0.01s)
creating index...
index created!
Running per image evaluation...
Evaluate annotation type *bbox*
DONE (t=0.22s).
Accumulating evaluation results...
DONE (t=0.06s).
 Average Precision  (AP) @[ IoU=0.50:0.95 | area=   all | maxDets=100 ] = 0.545
 Average Precision  (AP) @[ IoU=0.50      | area=   all | maxDets=100 ] = 0.972
 Average Precision  (AP) @[ IoU=0.75      | area=   all | maxDets=100 ] = 0.580
 Average Precision  (AP) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = -1.000
 Average Precision  (AP) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.455
 Average Precision  (AP) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.625
 Average Recall     (AR) @[ IoU=0.50:0.95 | area=   all | maxDets=  1 ] = 0.592
 Average Recall     (AR) @[ IoU=0.50:0.95 | area=   all | maxDets= 10 ] = 0.598
 Average Re

## YOLO V3

In [1]:
import time
from pycocotools.coco import COCO
from pycocotools.cocoeval import COCOeval
import torch
import cv2
import numpy as np
import json
import glob

from darknet_v3 import Darknet as Darknet53
from util_model_img import load_classes, letterbox_image, image_to_tensor, write_result

confidence = 0.5
nms_thresh = 0.4
num_classes = 80
inp_dim = 416
device="cuda"
device = torch.device(device)
weightsfile = "model_weights/yolov3.weights"
cfgfile = "cfg/yolov3.cfg"

model = Darknet53(device, cfgfile)
model.load_weights(weightsfile)
model.net_info["height"] = inp_dim
model = model.to(device)
model.eval()

Darknet(
  (module_list): ModuleList(
    (0): Sequential(
      (conv_0): Conv2d(3, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (batch_norm_0): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (leaky_0): LeakyReLU(negative_slope=0.1, inplace=True)
    )
    (1): Sequential(
      (conv_1): Conv2d(32, 64, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False)
      (batch_norm_1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (leaky_1): LeakyReLU(negative_slope=0.1, inplace=True)
    )
    (2): Sequential(
      (conv_2): Conv2d(64, 32, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (batch_norm_2): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (leaky_2): LeakyReLU(negative_slope=0.1, inplace=True)
    )
    (3): Sequential(
      (conv_3): Conv2d(32, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (batch

In [None]:
import time
from pycocotools.coco import COCO
from pycocotools.cocoeval import COCOeval


def detection_v3(val_anno, img_dir, results_json, class_id, CUDA=True, fps=False):

    '''
    val_anno: annotations of validation set images
    img_dir: dir of validation set images
    results_json: detection result
    class_id: HA target class - person (0) and AA target class - stop sign (11)
    '''

    file2id = dict()
    with open(val_anno) as f:
        dic = json.load(f)
        images = dic['images']
        for i in range(len(images)):
            image = images[i]
            file2id[image['file_name']] = image['id']

    #imgs = glob.glob("%s/*.png" % img_dir) + glob.glob("%s/*.jpg" % img_dir)
    imgs = [f for f in os.listdir(img_dir) if f.endswith('.jpg') or f.endswith('.png')]

    results = []

    if fps:
        start = time.time()

    for img in imgs:

        image_id = file2id[img]

        img_cv = cv2.imread(os.path.join(img_dir, img))
        img_h, img_w, _ = img_cv.shape
        img_cv = letterbox_image(img_cv, [inp_dim, inp_dim])
        im_dim = torch.FloatTensor((img_w, img_h)).repeat(1,2)
        img_ts = image_to_tensor(img_cv)

        if CUDA:
            img_ts = img_ts.cuda()
            im_dim = im_dim.cuda()
        
        prediction = model(img_ts).data
        output = write_result(prediction, confidence, num_classes, nms=True, nms_conf=nms_thresh)

        if type(output) != int:
            im_dim = im_dim.repeat(output.size(0), 1)
            scaling_factor = torch.min(inp_dim/im_dim,1)[0].view(-1,1).cuda()
            output[:,[1,3]] -= (inp_dim - scaling_factor*im_dim[:,0].view(-1,1))/2
            output[:,[2,4]] -= (inp_dim - scaling_factor*im_dim[:,1].view(-1,1))/2
            output[:,1:5] /= scaling_factor
            for i in range(output.shape[0]):
                output[i, [1,3]] = torch.clamp(output[i, [1,3]], 0.0, im_dim[i,0])
                output[i, [2,4]] = torch.clamp(output[i, [2,4]], 0.0, im_dim[i,1])
                conf = output[i, 5].float().item()
                cls = output[i, -1].int().item()
                x = output[i, 1].int().item()
                y = output[i, 2].int().item()
                w = (output[i, 3] - output[i, 1]).float().item()
                h = (output[i, 4] - output[i, 2]).float().item()

                #print(image_id, cls, [xc, yc, w, h], conf)

                results.append({'image_id': image_id,
                                'category_id': cls,
                                'bbox': [x, y, w, h],
                                'score': conf})
        
    if fps:
        end = time.time()
        total_time = end-start
        print("%d imgs, %f seconds, average: %f fps" % (len(imgs), total_time, len(imgs)/total_time))

    with open(results_json, 'w') as f:
        f.write(json.dumps(results, indent=4))


    cocoGt = COCO(val_anno)
    cocoDt = cocoGt.loadRes(results_json)
    cocoEval = COCOeval(cocoGt, cocoDt, 'bbox')
    cocoEval.params.catIds = [class_id]
    cocoEval.evaluate()
    cocoEval.accumulate()
    cocoEval.summarize()



### INRIA

In [3]:
detection_v3(
    val_anno="Dataset/HA/test_annotations.json",
    img_dir="Dataset/HA/INRIA/pos/",
    results_json="test_map/yolov3_inria.json",
    class_id=0,
    fps=True
)

288 imgs, 13.358694 seconds, average: 21.558994 fps
loading annotations into memory...
Done (t=0.00s)
creating index...
index created!
Loading and preparing results...
DONE (t=0.00s)
creating index...
index created!
Running per image evaluation...
Evaluate annotation type *bbox*
DONE (t=0.11s).
Accumulating evaluation results...
DONE (t=0.02s).
 Average Precision  (AP) @[ IoU=0.50:0.95 | area=   all | maxDets=100 ] = 0.587
 Average Precision  (AP) @[ IoU=0.50      | area=   all | maxDets=100 ] = 0.961
 Average Precision  (AP) @[ IoU=0.75      | area=   all | maxDets=100 ] = 0.640
 Average Precision  (AP) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = -1.000
 Average Precision  (AP) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.505
 Average Precision  (AP) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.601
 Average Recall     (AR) @[ IoU=0.50:0.95 | area=   all | maxDets=  1 ] = 0.350
 Average Recall     (AR) @[ IoU=0.50:0.95 | area=   all | maxDets= 10 ] = 0.678
 Average Rec

In [4]:
detection_v3(
    val_anno="Dataset/HA/test_adv_annotations.json",
    img_dir="Dataset/HA/INRIA/patch_v3/",
    results_json="test_map/yolov3_inria_patchv3.json",
    class_id=0,
    fps=True
)

288 imgs, 11.480503 seconds, average: 25.086009 fps
loading annotations into memory...
Done (t=0.00s)
creating index...
index created!
Loading and preparing results...
DONE (t=0.00s)
creating index...
index created!
Running per image evaluation...
Evaluate annotation type *bbox*
DONE (t=0.08s).
Accumulating evaluation results...
DONE (t=0.02s).
 Average Precision  (AP) @[ IoU=0.50:0.95 | area=   all | maxDets=100 ] = 0.157
 Average Precision  (AP) @[ IoU=0.50      | area=   all | maxDets=100 ] = 0.341
 Average Precision  (AP) @[ IoU=0.75      | area=   all | maxDets=100 ] = 0.114
 Average Precision  (AP) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.078
 Average Precision  (AP) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.177
 Average Precision  (AP) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.224
 Average Recall     (AR) @[ IoU=0.50:0.95 | area=   all | maxDets=  1 ] = 0.153
 Average Recall     (AR) @[ IoU=0.50:0.95 | area=   all | maxDets= 10 ] = 0.290
 Average Reca

In [5]:
detection_v3(
    val_anno="Dataset/HA/test_adv_annotations.json",
    img_dir="Dataset/HA/INRIA/natural_v3/",
    results_json="test_map/yolov2_inria_naturalv3.json",
    class_id=0,
    fps=True
)

288 imgs, 11.305015 seconds, average: 25.475419 fps
loading annotations into memory...
Done (t=0.00s)
creating index...
index created!
Loading and preparing results...
DONE (t=0.00s)
creating index...
index created!
Running per image evaluation...
Evaluate annotation type *bbox*
DONE (t=0.08s).
Accumulating evaluation results...
DONE (t=0.02s).
 Average Precision  (AP) @[ IoU=0.50:0.95 | area=   all | maxDets=100 ] = 0.218
 Average Precision  (AP) @[ IoU=0.50      | area=   all | maxDets=100 ] = 0.448
 Average Precision  (AP) @[ IoU=0.75      | area=   all | maxDets=100 ] = 0.178
 Average Precision  (AP) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.052
 Average Precision  (AP) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.173
 Average Precision  (AP) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.328
 Average Recall     (AR) @[ IoU=0.50:0.95 | area=   all | maxDets=  1 ] = 0.204
 Average Recall     (AR) @[ IoU=0.50:0.95 | area=   all | maxDets= 10 ] = 0.363
 Average Reca

### AA 攻击

In [6]:
detection_v3(
    val_anno="Dataset/AA/stop_test_annotations.json",
    img_dir="Dataset/AA/imgs_s/",
    results_json="test_map/yolov3_aa_stopsign.json",
    class_id=11,
    fps=True
)

1000 imgs, 39.602939 seconds, average: 25.250651 fps
loading annotations into memory...
Done (t=0.00s)
creating index...
index created!
Loading and preparing results...
DONE (t=0.12s)
creating index...
index created!
Running per image evaluation...
Evaluate annotation type *bbox*
DONE (t=0.20s).
Accumulating evaluation results...
DONE (t=0.04s).
 Average Precision  (AP) @[ IoU=0.50:0.95 | area=   all | maxDets=100 ] = 0.609
 Average Precision  (AP) @[ IoU=0.50      | area=   all | maxDets=100 ] = 0.806
 Average Precision  (AP) @[ IoU=0.75      | area=   all | maxDets=100 ] = 0.791
 Average Precision  (AP) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = -1.000
 Average Precision  (AP) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.756
 Average Precision  (AP) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.464
 Average Recall     (AR) @[ IoU=0.50:0.95 | area=   all | maxDets=  1 ] = 0.649
 Average Recall     (AR) @[ IoU=0.50:0.95 | area=   all | maxDets= 10 ] = 0.654
 Average Re

In [7]:
detection_v3(
    val_anno="Dataset/AA/person_test_annotations.json",
    img_dir="Dataset/AA/imgs_p/",
    results_json="test_map/yolov3_aa_person.json",
    class_id=0,
    fps=True
)

1000 imgs, 41.375729 seconds, average: 24.168759 fps
loading annotations into memory...
Done (t=0.00s)
creating index...
index created!
Loading and preparing results...
DONE (t=0.01s)
creating index...
index created!
Running per image evaluation...
Evaluate annotation type *bbox*
DONE (t=0.35s).
Accumulating evaluation results...
DONE (t=0.05s).
 Average Precision  (AP) @[ IoU=0.50:0.95 | area=   all | maxDets=100 ] = 0.736
 Average Precision  (AP) @[ IoU=0.50      | area=   all | maxDets=100 ] = 0.952
 Average Precision  (AP) @[ IoU=0.75      | area=   all | maxDets=100 ] = 0.881
 Average Precision  (AP) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = -1.000
 Average Precision  (AP) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.637
 Average Precision  (AP) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.830
 Average Recall     (AR) @[ IoU=0.50:0.95 | area=   all | maxDets=  1 ] = 0.787
 Average Recall     (AR) @[ IoU=0.50:0.95 | area=   all | maxDets= 10 ] = 0.806
 Average Re

## YOLO V4

In [1]:
import torch
import cv2
import numpy as np
import json
import glob

from tools.cocotools import get_classes
from model.yolov4 import YOLOv4
from model.decode_np import Decode
from util_model_img import letterbox_image, image_to_tensor


confidence = 0.5
nms_thresh = 0.4
num_anchors = 3
num_classes = 80
inp_dim = 416
device="cuda"
device = torch.device(device)
all_classes = get_classes("data/coco.names")
num_classes = len(all_classes)
yolo = YOLOv4(num_classes, num_anchors)
yolo = yolo.to(device)
yolo.load_state_dict(torch.load("model_weights/pytorch_yolov4_1.pt"))
yolo.eval()
model = Decode(confidence, nms_thresh, (inp_dim, inp_dim), yolo, all_classes, True)


In [None]:
import time
from pycocotools.coco import COCO
from pycocotools.cocoeval import COCOeval


def detection_v4(val_anno, img_dir, results_json, class_id, CUDA=True, fps=False):
    
    '''
    val_anno: annotations of validation set images
    img_dir: dir of validation set images
    results_json: detection result
    class_id: HA target class - person (0) and AA target class - stop sign (11)
    '''

    file2id = dict()
    with open(val_anno) as f:
        dic = json.load(f)
        images = dic['images']
        for i in range(len(images)):
            image = images[i]
            file2id[image['file_name']] = image['id']

    #imgs = glob.glob("%s/*.png" % img_dir) + glob.glob("%s/*.jpg" % img_dir)
    imgs = [f for f in os.listdir(img_dir) if f.endswith('.jpg') or f.endswith('.png')]

    results = []

    if fps:
        start = time.time()

    for img in imgs:

        image_id = file2id[img]

        img_cv = cv2.imread(os.path.join(img_dir, img))
        img_h, img_w, _ = img_cv.shape
        img_cv = letterbox_image(img_cv, [inp_dim, inp_dim]).astype(np.uint8)
        im_dim = torch.FloatTensor((img_w, img_h)).repeat(1,2)
        #img_ts = image_to_tensor(img_cv)
        #print(img_cv.shape)
        image, boxes, scores, classes = model.detect_image(img_cv, draw_image=False)
        
        if boxes is None:
            pass
        else:
            output = np.concatenate((boxes, np.expand_dims(scores,axis=-1), np.expand_dims(classes,axis=-1)), axis=1)

            im_dim = im_dim.repeat(output.shape[0], 1)
            scaling_factor = torch.min(inp_dim/im_dim,1)[0].view(-1,1).cpu().numpy()
            output[:,[0,2]] -= (inp_dim - scaling_factor*im_dim[:,0].view(-1,1).cpu().numpy())/2
            output[:,[1,3]] -= (inp_dim - scaling_factor*im_dim[:,1].view(-1,1).cpu().numpy())/2
            output[:,0:4] /= scaling_factor

            for i in range(output.shape[0]):

                conf = output[i, 4]
                cls = output[i, -1]
                x = output[i, 0]
                y = output[i, 1]
                w = (output[i, 2] - output[i, 0])
                h = (output[i, 3] - output[i, 1])

                #print(image_id, cls, [xc, yc, w, h], conf)

                results.append({'image_id': image_id,
                                'category_id': cls,
                                'bbox': [x, y, w, h],
                                'score': conf})
        
    if fps:
        end = time.time()
        total_time = end-start
        print("%d imgs, %f seconds, average: %f fps" % (len(imgs), total_time, len(imgs)/total_time))

    with open(results_json, 'w') as f:
        f.write(json.dumps(results, indent=4))


    cocoGt = COCO(val_anno)
    cocoDt = cocoGt.loadRes(results_json)
    cocoEval = COCOeval(cocoGt, cocoDt, 'bbox')
    cocoEval.params.catIds = [class_id]
    cocoEval.evaluate()
    cocoEval.accumulate()
    cocoEval.summarize()

### INRIA 数据集

In [3]:
detection_v4(
    val_anno="Dataset/HA/test_annotations.json",
    img_dir="Dataset/HA/INRIA/pos/",
    results_json="test_map/yolov4_inria.json",
    class_id=0,
    fps=True
)

  return 1 / (1 + np.exp(-x))


288 imgs, 17.571758 seconds, average: 16.389937 fps
loading annotations into memory...
Done (t=0.00s)
creating index...
index created!
Loading and preparing results...
DONE (t=0.00s)
creating index...
index created!
Running per image evaluation...
Evaluate annotation type *bbox*
DONE (t=0.12s).
Accumulating evaluation results...
DONE (t=0.02s).
 Average Precision  (AP) @[ IoU=0.50:0.95 | area=   all | maxDets=100 ] = 0.626
 Average Precision  (AP) @[ IoU=0.50      | area=   all | maxDets=100 ] = 0.948
 Average Precision  (AP) @[ IoU=0.75      | area=   all | maxDets=100 ] = 0.710
 Average Precision  (AP) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = -1.000
 Average Precision  (AP) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.496
 Average Precision  (AP) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.648
 Average Recall     (AR) @[ IoU=0.50:0.95 | area=   all | maxDets=  1 ] = 0.361
 Average Recall     (AR) @[ IoU=0.50:0.95 | area=   all | maxDets= 10 ] = 0.702
 Average Rec

In [4]:
detection_v4(
    val_anno="Dataset/HA/test_adv_annotations.json",
    img_dir="Dataset/HA/INRIA/patch_v4/",
    results_json="test_map/yolov4_inria_patchv4.json",
    class_id=0,
    fps=True
)

288 imgs, 15.070274 seconds, average: 19.110469 fps
loading annotations into memory...
Done (t=0.00s)
creating index...
index created!
Loading and preparing results...
DONE (t=0.00s)
creating index...
index created!
Running per image evaluation...
Evaluate annotation type *bbox*
DONE (t=0.10s).
Accumulating evaluation results...
DONE (t=0.02s).
 Average Precision  (AP) @[ IoU=0.50:0.95 | area=   all | maxDets=100 ] = 0.239
 Average Precision  (AP) @[ IoU=0.50      | area=   all | maxDets=100 ] = 0.473
 Average Precision  (AP) @[ IoU=0.75      | area=   all | maxDets=100 ] = 0.236
 Average Precision  (AP) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.110
 Average Precision  (AP) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.204
 Average Precision  (AP) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.366
 Average Recall     (AR) @[ IoU=0.50:0.95 | area=   all | maxDets=  1 ] = 0.209
 Average Recall     (AR) @[ IoU=0.50:0.95 | area=   all | maxDets= 10 ] = 0.391
 Average Reca

In [5]:
detection_v4(
    val_anno="Dataset/HA/test_adv_annotations.json",
    img_dir="Dataset/HA/INRIA/natural_v4/",
    results_json="test_map/yolov4_inria_naturalv4.json",
    class_id=0,
    fps=True
)

288 imgs, 14.958608 seconds, average: 19.253129 fps
loading annotations into memory...
Done (t=0.00s)
creating index...
index created!
Loading and preparing results...
DONE (t=0.00s)
creating index...
index created!
Running per image evaluation...
Evaluate annotation type *bbox*
DONE (t=0.13s).
Accumulating evaluation results...
DONE (t=0.02s).
 Average Precision  (AP) @[ IoU=0.50:0.95 | area=   all | maxDets=100 ] = 0.490
 Average Precision  (AP) @[ IoU=0.50      | area=   all | maxDets=100 ] = 0.810
 Average Precision  (AP) @[ IoU=0.75      | area=   all | maxDets=100 ] = 0.532
 Average Precision  (AP) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.179
 Average Precision  (AP) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.472
 Average Precision  (AP) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.558
 Average Recall     (AR) @[ IoU=0.50:0.95 | area=   all | maxDets=  1 ] = 0.313
 Average Recall     (AR) @[ IoU=0.50:0.95 | area=   all | maxDets= 10 ] = 0.606
 Average Reca

### AA 攻击

In [6]:
detection_v4(
    val_anno="Dataset/AA/stop_test_annotations.json",
    img_dir="Dataset/AA/imgs_s/",
    results_json="test_map/yolov4_aa_stopsign.json",
    class_id=11,
    fps=True
)

1000 imgs, 51.884302 seconds, average: 19.273652 fps
loading annotations into memory...
Done (t=0.00s)
creating index...
index created!
Loading and preparing results...
DONE (t=0.01s)
creating index...
index created!
Running per image evaluation...
Evaluate annotation type *bbox*
DONE (t=0.30s).
Accumulating evaluation results...
DONE (t=0.08s).
 Average Precision  (AP) @[ IoU=0.50:0.95 | area=   all | maxDets=100 ] = 0.793
 Average Precision  (AP) @[ IoU=0.50      | area=   all | maxDets=100 ] = 0.927
 Average Precision  (AP) @[ IoU=0.75      | area=   all | maxDets=100 ] = 0.927
 Average Precision  (AP) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = -1.000
 Average Precision  (AP) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.813
 Average Precision  (AP) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.792
 Average Recall     (AR) @[ IoU=0.50:0.95 | area=   all | maxDets=  1 ] = 0.837
 Average Recall     (AR) @[ IoU=0.50:0.95 | area=   all | maxDets= 10 ] = 0.838
 Average Re

In [7]:
detection_v4(
    val_anno="Dataset/AA/person_test_annotations.json",
    img_dir="Dataset/AA/imgs_p/",
    results_json="test_map/yolov4_aa_person.json",
    class_id=0,
    fps=True
)

1000 imgs, 52.520471 seconds, average: 19.040195 fps
loading annotations into memory...
Done (t=0.00s)
creating index...
index created!
Loading and preparing results...
DONE (t=0.12s)
creating index...
index created!
Running per image evaluation...
Evaluate annotation type *bbox*
DONE (t=0.27s).
Accumulating evaluation results...
DONE (t=0.06s).
 Average Precision  (AP) @[ IoU=0.50:0.95 | area=   all | maxDets=100 ] = 0.734
 Average Precision  (AP) @[ IoU=0.50      | area=   all | maxDets=100 ] = 0.885
 Average Precision  (AP) @[ IoU=0.75      | area=   all | maxDets=100 ] = 0.876
 Average Precision  (AP) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = -1.000
 Average Precision  (AP) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.652
 Average Precision  (AP) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.843
 Average Recall     (AR) @[ IoU=0.50:0.95 | area=   all | maxDets=  1 ] = 0.798
 Average Recall     (AR) @[ IoU=0.50:0.95 | area=   all | maxDets= 10 ] = 0.841
 Average Re