In [None]:
# Mount Google Drive in this Colab runtime; the CSV paths used by the cells
# below all live under /content/drive/MyDrive.
from google.colab import drive
drive.mount("/content/drive")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# %pip (unlike !pip) installs into the environment of the running kernel.
%pip install ensemble-boxes

In [None]:
import numpy as np
import pandas as pd

In [None]:
import numpy as np
import pandas as pd
import time
from multiprocessing import Pool, Process, cpu_count, Manager
from ensemble_boxes import *

def process_single_id(id, res_boxes, weights, params):
    """Fuse the per-model predictions of a single image into one set of boxes.

    Args:
        id: Image identifier; used as the key into ``res_boxes``.
        res_boxes: Mapping ``image_id -> list`` holding one entry per model.
            Each entry is a sequence of rows
            ``[xmin, ymin, xmax, ymax, score, class]``; the coordinates are
            divided by 1024 before fusion.
        weights: Per-model weights handed to the fusion routine unchanged.
        params: Options dict. ``run_type`` selects the routine (``'wbf'``,
            ``'nms'``, ``'soft-nms'`` or ``'nmw'``); each routine reads its own
            thresholds from ``params``. Optional keys: ``verbose`` (default
            ``False``) and ``limit_boxes`` (keep only the first N fused boxes).

    Returns:
        Tuple ``(boxes, scores, labels, ids_list)``: fused boxes (still divided
        by 1024), their scores, the original class labels as a string array,
        and a list repeating ``id`` once per fused box. All four are empty
        when ``res_boxes[id]`` contains no model entries.

    Raises:
        ValueError: If ``params['run_type']`` is not one of the supported values.
    """
    run_type = params['run_type']
    # 'verbose' is optional for ensemble_predictions, so do not require it here.
    verbose = params.get('verbose', False)

    boxes_list = []
    scores_list = []
    labels_list = []
    # Class labels are remapped to consecutive integers for the fusion
    # routines and mapped back afterwards.
    labels_to_use_forward = dict()
    labels_to_use_backward = dict()

    for dt in res_boxes[id]:
        boxes = []
        scores = []
        labels = []

        for row in dt:
            lbl = row[5]
            scr = float(row[4])
            box_x1 = float(row[0] / 1024)
            box_y1 = float(row[1] / 1024)
            box_x2 = float(row[2] / 1024)
            box_y2 = float(row[3] / 1024)

            if box_x1 >= box_x2:
                if verbose:
                    print('Problem with box x1 and x2: {}. Skip it'.format(row))
                continue
            if box_y1 >= box_y2:
                if verbose:
                    print('Problem with box y1 and y2: {}. Skip it'.format(row))
                continue
            if scr <= 0:
                if verbose:
                    print('Problem with box score: {}. Skip it'.format(row))
                continue

            boxes.append([box_x1, box_y1, box_x2, box_y2])
            scores.append(scr)
            if lbl not in labels_to_use_forward:
                cur_point = len(labels_to_use_forward)
                labels_to_use_forward[lbl] = cur_point
                labels_to_use_backward[cur_point] = lbl
            labels.append(labels_to_use_forward[lbl])

        # reshape keeps a model without usable boxes at shape (0, 4) instead of (0,).
        boxes_list.append(np.array(boxes, dtype=np.float32).reshape(-1, 4))
        scores_list.append(np.array(scores, dtype=np.float32))
        labels_list.append(np.array(labels, dtype=np.int32))

    # Empty predictions for all models: return the same 4-tuple shape as the
    # normal path so that callers can always unpack four values.
    if len(boxes_list) == 0:
        return (np.zeros((0, 4), dtype=np.float32),
                np.zeros((0,), dtype=np.float32),
                np.array([], dtype=str),
                [])

    if run_type == 'wbf':
        merged_boxes, merged_scores, merged_labels = weighted_boxes_fusion(
            boxes_list, scores_list, labels_list,
            weights=weights, iou_thr=params['intersection_thr'],
            skip_box_thr=params['skip_box_thr'],
            conf_type=params['conf_type'])
    elif run_type == 'nms':
        merged_boxes, merged_scores, merged_labels = nms(
            boxes_list, scores_list, labels_list,
            weights=weights, iou_thr=params['iou_thr'])
    elif run_type == 'soft-nms':
        merged_boxes, merged_scores, merged_labels = soft_nms(
            boxes_list, scores_list, labels_list,
            weights=weights, iou_thr=params['iou_thr'],
            sigma=params['sigma'], thresh=params['thresh'])
    elif run_type == 'nmw':
        merged_boxes, merged_scores, merged_labels = non_maximum_weighted(
            boxes_list, scores_list, labels_list,
            weights=weights, iou_thr=params['intersection_thr'],
            skip_box_thr=params['skip_box_thr'])
    else:
        raise ValueError(
            "Unknown run_type: {!r}. Expected 'wbf', 'nms', 'soft-nms' or 'nmw'".format(run_type))

    if 'limit_boxes' in params:
        limit_boxes = params['limit_boxes']
        merged_boxes = merged_boxes[:limit_boxes]
        merged_scores = merged_scores[:limit_boxes]
        merged_labels = merged_labels[:limit_boxes]

    # Rename labels back. int() because the fused labels may come back as floats.
    merged_labels = np.array(
        [labels_to_use_backward[int(m)] for m in merged_labels], dtype=str)

    # Create IDs array
    ids_list = [id] * len(merged_labels)

    return merged_boxes.copy(), merged_scores.copy(), merged_labels.copy(), ids_list.copy()


def process_part_of_data(proc_number, return_dict, ids_to_use, res_boxes, weights, params):
    """Worker entry point: fuse every image in ``ids_to_use``.

    The per-image tuples ``(boxes, scores, labels, ids_list)`` produced by
    ``process_single_id`` are collected in order and stored in
    ``return_dict[proc_number]``.
    """
    print('Start process: {} IDs to proc: {}'.format(proc_number, len(ids_to_use)))
    fused = []
    for image_id in ids_to_use:
        boxes, scores, labels, ids_list = process_single_id(image_id, res_boxes, weights, params)
        fused.append((boxes, scores, labels, ids_list))
    return_dict[proc_number] = list(fused)


def ensemble_predictions(pred_filenames, weights, params):
    """Fuse the predictions stored in several CSV files into one DataFrame.

    Every CSV must provide the columns ``image_id``, ``class``, ``score``,
    ``xmin``, ``ymin``, ``xmax`` and ``ymax``. Files whose weight is 0 are not
    read. The image ids of the first file that is read form the reference set;
    rows of later files whose ``image_id`` is outside that set are dropped.
    The images are split into contiguous chunks, and each chunk is fused in its
    own worker process via ``process_part_of_data``.

    Args:
        pred_filenames: Paths of the prediction CSVs, one per model.
        weights: One weight per entry of ``pred_filenames``.
        params: Options forwarded to ``process_single_id``; the optional key
            ``verbose`` enables extra printing here.

    Returns:
        DataFrame with columns ``image_id``, ``class``, ``score``, ``xmin``,
        ``ymin``, ``xmax``, ``ymax``, ``width``, ``height``. Coordinates are
        multiplied by 1024 and ``width``/``height`` are the constant 1024.
        The frame is empty when nothing was fused.

    Raises:
        RuntimeError: If a worker process ends without storing a result.
    """
    image_size = 1024
    columns = ['image_id', 'class', 'score', 'xmin', 'ymin', 'xmax', 'ymax', 'width', 'height']
    verbose = params.get('verbose', False)

    start_time = time.time()
    procs_to_use = max(cpu_count() // 2, 1)
    print('Use processes: {}'.format(procs_to_use))
    weights = np.array(weights)

    res_boxes = dict()
    ref_ids = None
    for j in range(len(pred_filenames)):
        if weights[j] == 0:
            continue
        print('Read {}...'.format(pred_filenames[j]))
        # np.str was removed from NumPy; the builtin str is the equivalent dtype.
        s = pd.read_csv(pred_filenames[j], dtype={'image_id': str, 'class': str})
        unique_ids = sorted(s['image_id'].unique())
        if ref_ids is None:
            ref_ids = tuple(unique_ids)
        elif ref_ids != tuple(unique_ids):
            print('Different IDs in ensembled CSVs! {} != {}'.format(len(ref_ids), len(unique_ids)))
            s = s[s['image_id'].isin(ref_ids)]
        s = s.sort_values('image_id').reset_index(drop=True)

        ids = s['image_id'].values
        preds = s[['xmin', 'ymin', 'xmax', 'ymax', 'score', 'class']].values
        single_res = dict()
        for i in range(len(ids)):
            single_res.setdefault(ids[i], []).append(preds[i])

        # Give every reference image exactly one slot per model, empty when
        # this model has no rows for it. Without the empty slot the per-image
        # list is shorter than `weights` and the two no longer line up.
        for el in ref_ids:
            res_boxes.setdefault(el, []).append(single_res.get(el, []))

    # Reduce weights if needed
    weights = weights[weights != 0]
    ids_to_use = sorted(res_boxes.keys())

    results = []
    if ids_to_use:
        # The context manager shuts the manager process down once the
        # results have been copied out of the shared dict.
        with Manager() as manager:
            return_dict = manager.dict()
            jobs = []
            for i in range(procs_to_use):
                start = i * len(ids_to_use) // procs_to_use
                end = (i + 1) * len(ids_to_use) // procs_to_use
                if i == procs_to_use - 1:
                    end = len(ids_to_use)
                p = Process(target=process_part_of_data,
                            args=(i, return_dict, ids_to_use[start:end], res_boxes, weights, params))
                jobs.append(p)
                p.start()

            for job in jobs:
                job.join()

            for i, job in enumerate(jobs):
                if i not in return_dict:
                    raise RuntimeError(
                        'Worker {} ended without a result (exit code {})'.format(i, job.exitcode))
                results += return_dict[i]

    all_ids = []
    all_boxes = []
    all_scores = []
    all_labels = []
    for boxes, scores, labels, ids_list in results:
        # Images without any fused box contribute nothing; skipping them also
        # keeps differently shaped empty arrays out of np.concatenate.
        if boxes is None or len(boxes) == 0:
            continue
        all_boxes.append(boxes)
        all_scores.append(scores)
        all_labels.append(labels)
        all_ids.append(ids_list)

    if all_boxes:
        all_ids = np.concatenate(all_ids)
        all_boxes = np.concatenate(all_boxes)
        all_scores = np.concatenate(all_scores)
        all_labels = np.concatenate(all_labels)
        if verbose:
            print(all_ids.shape, all_boxes.shape, all_scores.shape, all_labels.shape)

        res = pd.DataFrame(all_ids, columns=['image_id'])
        res['class'] = all_labels
        res['score'] = all_scores
        res['xmin'] = all_boxes[:, 0] * image_size
        res['ymin'] = all_boxes[:, 1] * image_size
        res['xmax'] = all_boxes[:, 2] * image_size
        res['ymax'] = all_boxes[:, 3] * image_size
        res['width'] = np.full(len(all_boxes), image_size, dtype=all_boxes.dtype)
        res['height'] = np.full(len(all_boxes), image_size, dtype=all_boxes.dtype)
    else:
        # Nothing to concatenate: return an empty frame with the usual columns.
        res = pd.DataFrame(columns=columns)

    print('Run time: {:.2f}'.format(time.time() - start_time))
    return res


def ensemble(benchmark_csv, weights, params,expno):
    """Fuse the CSVs in ``benchmark_csv`` and save the result for experiment ``expno``.

    The fused predictions come from ``ensemble_predictions`` and are written,
    without the index, to ``handensemble<expno>.csv`` in the handlabelEnsemble
    folder on Drive.
    """
    fused = ensemble_predictions(benchmark_csv, weights, params)
    out_path = "/content/drive/MyDrive/Dhaka-AI 2020/handlabelEnsemble/handensemble"+str(expno)+".csv"
    fused.to_csv(out_path, index=False)

In [None]:
import pandas as pd 
import io
import numpy as np

In [None]:
def ensemble_conf(conf_threshold, expno):
  """Clean up the fused predictions of experiment ``expno``.

  Reads ``handensemble<expno>.csv``, casts the box and image-size columns to
  int, drops rows that repeat the same image_id and box, keeps only rows whose
  score is strictly greater than ``conf_threshold``, and writes the result to
  ``handensemble<expno>_retouched.csv``.
  """
  preds = pd.read_csv("/content/drive/MyDrive/Dhaka-AI 2020/handlabelEnsemble/handensemble"+str(expno)+".csv")
  stem = "handensemble"+str(expno)

  # Pixel values are stored as whole numbers.
  for column in ('xmin', 'ymin', 'xmax', 'ymax', 'width', 'height'):
    preds[column] = preds[column].astype(int)

  kept = preds.drop_duplicates(subset=['image_id','xmin','ymin','xmax','ymax'])
  kept = kept[kept['score']>conf_threshold]
  # Renumber the rows and discard the old index column that reset_index adds.
  kept = kept.reset_index().drop(columns=['index'])

  kept.to_csv("/content/drive/MyDrive/Dhaka-AI 2020/handlabelEnsemble/"+stem+"_retouched.csv",index=False)

In [None]:
def ensemble_normalized(conf_thres):
  """Rewrite ``final_output.csv`` with box coordinates divided by 1024.

  The file is read from the handlabelEnsemble folder on Drive, the columns
  ``xmin``, ``ymin``, ``xmax`` and ``ymax`` are divided by 1024, the columns
  ``width`` and ``height`` are dropped, and the file is overwritten in place.

  A file that has neither ``width`` nor ``height`` is left untouched: this
  function removes those columns as its last step, so their absence means the
  file was already processed and dividing again would corrupt the boxes.

  Args:
    conf_thres: Not used; accepted so that existing calls keep working.
  """
  path = "/content/drive/MyDrive/Dhaka-AI 2020/handlabelEnsemble/final_output.csv"
  df = pd.read_csv(path)

  if 'width' not in df.columns and 'height' not in df.columns:
    print('final_output.csv has no width/height columns, looks already normalized. Skip it')
    return

  # Whole-column division instead of per-row `df[col].iloc[i] = ...`: that
  # chained assignment does not write back under pandas Copy-on-Write, and it
  # needed the chained-assignment warning to be switched off globally.
  for column in ('xmin', 'ymin', 'xmax', 'ymax'):
    df[column] = df[column] / 1024

  df = df.drop(columns=['width','height'])
  df.to_csv(path, index=False)

In [None]:
def compute_overlap(boxes, query_boxes):
    """Compute the pairwise intersection-over-union of two sets of boxes.

    Only columns 0-3 are read, as ``x1, y1, x2, y2``; extra columns (for
    example a score) are ignored.

    Args
        boxes: (N, 4+) ndarray of float
        query_boxes: (K, 4+) ndarray of float
    Returns
        overlaps: (N, K) float64 ndarray of overlap between boxes and
            query_boxes; 0 where a pair does not intersect with positive
            width and height
    """
    boxes = np.asarray(boxes)
    query_boxes = np.asarray(query_boxes)
    N = boxes.shape[0]
    K = query_boxes.shape[0]
    overlaps = np.zeros((N, K), dtype=np.float64)
    if N == 0 or K == 0:
        return overlaps

    # Column vectors for `boxes`, row vectors for `query_boxes`, so that every
    # arithmetic expression below broadcasts to an (N, K) grid of pairs.
    bx1, by1, bx2, by2 = (boxes[:, c][:, None] for c in range(4))
    qx1, qy1, qx2, qy2 = (query_boxes[:, c][None, :] for c in range(4))

    iw = np.minimum(bx2, qx2) - np.maximum(bx1, qx1)
    ih = np.minimum(by2, qy2) - np.maximum(by1, qy1)
    intersects = (iw > 0) & (ih > 0)

    intersection = iw * ih
    union = (bx2 - bx1) * (by2 - by1) + (qx2 - qx1) * (qy2 - qy1) - intersection

    # Divide only where the boxes really intersect; all other cells keep the
    # initial 0 and no division by an empty union is attempted.
    np.divide(intersection.astype(np.float64), union.astype(np.float64),
              out=overlaps, where=intersects)
    return overlaps

In [None]:
def get_real_annotations(table):
    """Group ground-truth boxes by image id and class label.

    Args:
        table: DataFrame with the columns ``image_id``, ``class``, ``xmin``,
            ``ymin``, ``xmax`` and ``ymax``.

    Returns:
        Nested dict ``res[image_id][label]`` -> list of
        ``[xmin, ymin, xmax, ymax]`` (float32 values). Ids and labels are
        converted to strings.
    """
    # np.str was removed from NumPy; the builtin str is the equivalent dtype.
    ids = table['image_id'].values.astype(str)
    labels = table['class'].values.astype(str)
    xmin = table['xmin'].values.astype(np.float32)
    xmax = table['xmax'].values.astype(np.float32)
    ymin = table['ymin'].values.astype(np.float32)
    ymax = table['ymax'].values.astype(np.float32)

    res = dict()
    for i in range(len(ids)):
        box = [xmin[i], ymin[i], xmax[i], ymax[i]]
        res.setdefault(ids[i], dict()).setdefault(labels[i], []).append(box)

    return res


def get_detections(table):
    """Group predicted boxes by image id and class label.

    Args:
        table: DataFrame with the columns ``image_id``, ``class``, ``score``,
            ``xmin``, ``ymin``, ``xmax`` and ``ymax``.

    Returns:
        Nested dict ``res[image_id][label]`` -> list of
        ``[xmin, ymin, xmax, ymax, score]`` (float32 values). Ids and labels
        are converted to strings.
    """
    # np.str was removed from NumPy; the builtin str is the equivalent dtype.
    ids = table['image_id'].values.astype(str)
    labels = table['class'].values.astype(str)
    scores = table['score'].values.astype(np.float32)
    xmin = table['xmin'].values.astype(np.float32)
    xmax = table['xmax'].values.astype(np.float32)
    ymin = table['ymin'].values.astype(np.float32)
    ymax = table['ymax'].values.astype(np.float32)

    res = dict()
    for i in range(len(ids)):
        box = [xmin[i], ymin[i], xmax[i], ymax[i], scores[i]]
        res.setdefault(ids[i], dict()).setdefault(labels[i], []).append(box)

    return res


def _compute_ap(recall, precision):
    """ Compute the average precision, given the recall and precision curves.
    Code originally from https://github.com/rbgirshick/py-faster-rcnn.
    # Arguments
        recall:    The recall curve (list).
        precision: The precision curve (list).
    # Returns
        The average precision as computed in py-faster-rcnn.
    """
    # correct AP calculation
    # first append sentinel values at the end
    mrec = np.concatenate(([0.], recall, [1.]))
    mpre = np.concatenate(([0.], precision, [0.]))

    # compute the precision envelope
    for i in range(mpre.size - 1, 0, -1):
        mpre[i - 1] = np.maximum(mpre[i - 1], mpre[i])

    # to calculate area under PR curve, look for points
    # where X axis (recall) changes value
    i = np.where(mrec[1:] != mrec[:-1])[0]

    # and sum (\Delta recall) * prec
    ap = np.sum((mrec[i + 1] - mrec[i]) * mpre[i + 1])
    return ap


def mean_average_precision_for_boxes(ann, pred, iou_threshold=0.5, exclude_not_in_annotations=False, verbose=True):
    """
    Score detections against ground truth as TP / (TP + FP + FN).

    Despite its name this function does not compute average precision: it
    counts true and false positives over all classes and annotated images and
    returns a single ratio.

    A detection is a true positive when the annotation it overlaps most (same
    image, same class) reaches ``iou_threshold`` and has not been claimed by an
    earlier detection; otherwise it is a false positive. False negatives are
    the ground-truth boxes that were never claimed. Only images listed in the
    annotations and classes present in the annotations are scored.

    :param ann: path to CSV-file with annotations, or array-like of shape (N, 6) with the columns
        image_id, class, xmin, ymin, xmax, ymax
    :param pred: path to CSV-file with predictions (detections), or array-like of shape (N, 7) with the columns
        image_id, class, score, xmin, ymin, xmax, ymax (a DataFrame with these column names also works)
    :param iou_threshold: IoU between boxes which count as 'match'. Default: 0.5
    :param exclude_not_in_annotations: drop predictions for image IDs which do not exist in annotations. Default: False
    :param verbose: print detailed run info. Default: True
    :return: float, TP / (TP + FP + FN); 0.0 when there is nothing to score.
    """

    if isinstance(ann, str):
        valid = pd.read_csv(ann)
    else:
        valid = pd.DataFrame(ann, columns=['image_id', 'class', 'xmin', 'ymin', 'xmax', 'ymax'])

    if isinstance(pred, str):
        preds = pd.read_csv(pred)
    else:
        # 'score' is required by get_detections, so it must be part of the frame.
        preds = pd.DataFrame(pred, columns=['image_id', 'class', 'score', 'xmin', 'ymin', 'xmax', 'ymax'])

    ann_unique = valid['image_id'].unique()
    preds_unique = preds['image_id'].unique()
    # Rows without a class are skipped below as the negative class; they are
    # not boxes and must not be counted as missed ground truth.
    total_ground_truth_boxes = int(valid['class'].notna().sum())
    false_positives = 0
    true_positives = 0

    if verbose:
        print('Number of files in annotations: {}'.format(len(ann_unique)))
        print('Number of files in predictions: {}'.format(len(preds_unique)))

    # Exclude files not in annotations!
    if exclude_not_in_annotations:
        preds = preds[preds['image_id'].isin(ann_unique)]
        preds_unique = preds['image_id'].unique()
        if verbose:
            print('Number of files in detection after reduction: {}'.format(len(preds_unique)))

    # np.str was removed from NumPy; the builtin str is the equivalent dtype.
    unique_classes = valid['class'].unique().astype(str)
    if verbose:
        print('Unique classes: {}'.format(len(unique_classes)))

    all_detections = get_detections(preds)
    all_annotations = get_real_annotations(valid)
    if verbose:
        print('Detections length: {}'.format(len(all_detections)))
        print('Annotations length: {}'.format(len(all_annotations)))

    # The lookup dicts are keyed by string ids, so compare like with like even
    # when the CSV stored the ids as numbers.
    image_ids = [str(image_id) for image_id in ann_unique]

    for label in sorted(unique_classes):

        # Negative class
        if str(label) == 'nan':
            continue

        for image_id in image_ids:
            detections = all_detections.get(image_id, {}).get(label, [])
            if len(detections) == 0:
                continue
            annotations = all_annotations.get(image_id, {}).get(label, [])
            annotations = np.array(annotations, dtype=np.float64)
            detected_annotations = set()

            for d in detections:
                if len(annotations) == 0:
                    false_positives += 1
                    continue

                overlaps = compute_overlap(np.expand_dims(np.array(d, dtype=np.float64), axis=0), annotations)
                assigned_annotation = int(np.argmax(overlaps[0]))
                max_overlap = overlaps[0, assigned_annotation]

                if max_overlap >= iou_threshold and assigned_annotation not in detected_annotations:
                    true_positives += 1
                    detected_annotations.add(assigned_annotation)
                else:
                    false_positives += 1

    false_negatives = total_ground_truth_boxes - true_positives
    denominator = true_positives + false_positives + false_negatives
    if denominator == 0:
        # No ground truth and no scored detections: nothing to divide by.
        return 0.0
    mean_ap = true_positives / denominator
    return mean_ap

In [None]:
def calculate_map():
  """Print the score of ``final_output.csv`` averaged over six IoU thresholds.

  ``mean_average_precision_for_boxes`` is evaluated against the hand-labelled
  ground truth at IoU 0.5, 0.55, 0.6, 0.65, 0.7 and 0.75, and the mean of the
  six values is printed.
  """
  annotations_file = '/content/drive/MyDrive/Dhaka-AI 2020/handlabelEnsemble/handlabel ground truth.csv'
  detections_file = '/content/drive/MyDrive/Dhaka-AI 2020/handlabelEnsemble/final_output.csv'

  iou_thresholds = (0.5, 0.55, 0.6, 0.65, 0.7, 0.75)
  total = 0
  for threshold in iou_thresholds:
    total = total + mean_average_precision_for_boxes(annotations_file, detections_file, threshold)
  mAP = total / 6
  print('MAP: ')
  print(mAP)

In [None]:
# Rescale the boxes in final_output.csv (the file is rewritten in place), then
# score that file against the hand-labelled ground truth. The 0.2 argument is
# accepted by ensemble_normalized but not used by it.
ensemble_normalized(0.2)
calculate_map()


Number of files in annotations: 149
Number of files in predictions: 146
Unique classes: 18
Detections length: 146
Annotations length: 149
Number of files in annotations: 149
Number of files in predictions: 146
Unique classes: 18
Detections length: 146
Annotations length: 149
Number of files in annotations: 149
Number of files in predictions: 146
Unique classes: 18
Detections length: 146
Annotations length: 149
Number of files in annotations: 149
Number of files in predictions: 146
Unique classes: 18
Detections length: 146
Annotations length: 149
Number of files in annotations: 149
Number of files in predictions: 146
Unique classes: 18
Detections length: 146
Annotations length: 149
Number of files in annotations: 149
Number of files in predictions: 146
Unique classes: 18
Detections length: 146
Annotations length: 149
MAP: 
0.3147721239337679


In [None]:
  # Fusion settings read by process_single_id for run_type 'wbf'.
  params = {
      'run_type': 'wbf',
      'skip_box_thr': 0.001,
      'intersection_thr': 0.5,
      'conf_type': 'avg',
      'limit_boxes': 30000,
      'verbose': False,
  }
  # Prediction files listed here previously, kept for reference:
  #   eff5_512_e66_set1.csv       eff6_512_e71_set1.csv   yolov5_set1_exp32.csv
  #   eff5_640_e63_set1.csv       yolov5_set1_exp123.csv  yolov5_set1_exp60.csv
  #   eff5_prevbest_e69_set1.csv  yolov5_set1_exp155.csv  yolov5_set1_exp63.csv
  in_dir = '/content/drive/MyDrive/Dhaka-AI 2020/handlabelEnsemble/'
  benchmark_csv = [
        in_dir + 'TestdataHandlabel_pred_eff_model_1.csv',
        in_dir + 'TestdataHandlabel_pred_eff_model_2.csv',
        in_dir + 'TestdataHandlabel_pred_eff_model_3.csv',
        in_dir + 'TestdataHandlabel_pred_eff_model_4.csv',
        in_dir + 'hand_exp32.csv',
        in_dir + 'hand_exp60.csv',
        in_dir + 'hand_exp63.csv',
        in_dir + 'hand_exp123.csv',
        in_dir + 'hand_exp155.csv',
  ]
  expno=1
  for w in [1]:
    # One weight per entry of benchmark_csv, in the same order.
    weights = [15,15,45,30,5,5,15,25,20]
    print(benchmark_csv)
    print(weights)
    assert len(benchmark_csv) == len(weights)
    ensemble(benchmark_csv, weights, params, expno)
    ensemble_conf(0.2, expno)
    # ensemble_normalized takes a single argument and calculate_map takes
    # none; the previous calls passed expno as well and raised TypeError.
    # NOTE(review): both functions work on final_output.csv, which nothing in
    # this cell writes (ensemble_conf writes handensemble<expno>_retouched.csv).
    # Confirm that final_output.csv is the file meant to be scored here.
    ensemble_normalized(0.2)
    calculate_map()
    expno+=1