In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm
import sys
sys.path.append('../src')  
from detection_util import create_predictions
from gdsc_score import get_leaderboard_score
from gdsc_util import download_directory, download_file, load_sections_df, set_up_logging, PROJECT_DIR
from PredictionEvaluator import PredictionEvaluator
pd.options.display.max_rows = 150
from ensemble_boxes import *

In [2]:
def scale_box(df):
    """Return a copy of ``df`` with box corners normalised by the image size.

    Four columns are added (or overwritten if already present):
    ``x1 = xmin / width``, ``y1 = ymin / height``,
    ``x2 = xmax / width`` and ``y2 = ymax / height``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``xmin``, ``ymin``, ``xmax``, ``ymax``,
        ``width`` and ``height``.

    Returns
    -------
    pandas.DataFrame
        A new frame; the frame passed in is not modified.

    Raises
    ------
    KeyError
        If one of the required columns is missing.
    """
    # Column-wise division instead of a row-wise ``apply``: the values are the
    # same, but there is no Python call per row and an empty frame is handled
    # (``apply(axis=1)`` does not return a Series for an empty frame).
    return df.assign(
        x1=df['xmin'] / df['width'],
        y1=df['ymin'] / df['height'],
        x2=df['xmax'] / df['width'],
        y2=df['ymax'] / df['height'],
    )

def de_scale_box(df):
    """Return a copy of ``df`` with normalised corners converted back to pixels.

    ``xmin``/``xmax`` are multiplied by ``width`` and ``ymin``/``ymax`` by
    ``height``; each product is rounded with ``np.round`` (the result stays a
    float column).

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``xmin``, ``ymin``, ``xmax``, ``ymax``,
        ``width`` and ``height``.

    Returns
    -------
    pandas.DataFrame
        A new frame with the four corner columns replaced; the frame passed in
        is not modified.

    Raises
    ------
    KeyError
        If one of the required columns is missing.
    """
    # Vectorised on purpose: a row-wise ``apply`` does not yield a Series for an
    # empty frame, and this function is called once per image.
    return df.assign(
        xmin=np.round(df['xmin'] * df['width']),
        ymin=np.round(df['ymin'] * df['height']),
        xmax=np.round(df['xmax'] * df['width']),
        ymax=np.round(df['ymax'] * df['height']),
    )

def bulid_section_id(x):
    """Build the section identifier string for one box.

    ``x`` is any mapping-like row (dict, pandas Series) providing
    ``file_name``, ``xmin``, ``xmax``, ``ymin`` and ``ymax``. The values are
    formatted as they are, in the order file name, xmin, xmax, ymin, ymax.
    """
    # Note the x-before-y ordering of the coordinates in the identifier.
    corner_values = (x[key] for key in ('xmin', 'xmax', 'ymin', 'ymax'))
    return "{}@{}-{}-{}-{}".format(x['file_name'], *corner_values)

def get_score(test, ground_truth_path='actual_test.csv', confidence_scores=None,
              thresholds=(0.5, 0.6, 0.7)):
    """Score ``test`` at several confidence cut-offs to find the best one.

    For every cut-off, the rows of ``test`` whose ``detection_score`` is at
    least the cut-off are passed to ``get_leaderboard_score`` together with a
    ``PredictionEvaluator`` built from the ground truth.

    Parameters
    ----------
    test : pandas.DataFrame
        Predictions; must contain a ``detection_score`` column. Any further
        columns required by ``get_leaderboard_score`` are not visible here.
    ground_truth_path : str, optional
        Passed to ``load_sections_df``. Defaults to ``'actual_test.csv'``.
    confidence_scores : iterable of float, optional
        Cut-offs to try. Defaults to ``np.arange(0.35, 0.7, 0.05)``.
    thresholds : iterable of float, optional
        Threshold list handed to ``get_leaderboard_score``
        (presumably IoU thresholds -- TODO confirm against gdsc_score).

    Returns
    -------
    pandas.DataFrame
        One row per cut-off with the columns

        * ``confidence`` -- the cut-off,
        * ``board_score`` -- ``score`` entry of the leaderboard result,
        * ``ground_labels_count`` -- number of ground-truth rows,
        * ``actual_count_worms`` -- number of rows in ``test`` *before*
          filtering (the column name is kept for compatibility),
        * ``pred_count_worms`` -- number of rows left after filtering.

        The three count columns hold integers.
    """
    ground_truth = load_sections_df(ground_truth_path)
    if confidence_scores is None:
        confidence_scores = np.arange(0.35, 0.7, 0.05)

    columns = ['confidence', 'board_score', 'ground_labels_count',
               'actual_count_worms', 'pred_count_worms']

    # Collect plain rows and build the frame once at the end instead of growing
    # a DataFrame row by row.
    rows = []
    for cutoff in tqdm(confidence_scores):
        cf_df = test[test['detection_score'] >= cutoff]

        # A fresh evaluator and threshold list per cut-off, as before; hoist
        # them only after confirming neither is mutated during scoring.
        evaluator = PredictionEvaluator(ground_truth)
        score_json = get_leaderboard_score(cf_df, list(thresholds), evaluator,
                                           detailed_evaluation=False)
        rows.append([cutoff, score_json['score'], ground_truth.shape[0],
                     test.shape[0], cf_df.shape[0]])

    return pd.DataFrame(rows, columns=columns)

In [19]:
# Inputs for this fusion round: two prediction files plus the ground-truth table.
# Model A is currently the result of an earlier fusion; the single-model file it
# replaced is kept on the next line for reference.
# NOTE(review): the only visible writer of 'Ensemble_4a_4b.csv' is a commented-out
# `to_csv` call near the end of the notebook, so the file must already exist on disk.
# model_a = pd.read_csv('../Model_evaluation/preds/exp_4_a_24_test.csv')
model_a = pd.read_csv('Ensemble_4a_4b.csv')
# Capital `M` here: the lower-case `model_b` used by later cells is only created
# by the merge cell that follows.
Model_b = pd.read_csv('../Model_evaluation/preds/exp_5_b_20_test.csv')
# Semicolon-separated; later cells read `file_name`, `height` and `width` from it.
actual_data = pd.read_csv('actual_test.csv', sep = ';')

In [20]:
# Reduce the ground truth to one (file_name, height, width) row per image.
actual_data = actual_data[['file_name', 'height', 'width']].drop_duplicates().copy()

# Attach the image size to every prediction.
# * Size columns a prediction file already carries are dropped first; otherwise
#   the merge would suffix the clashing columns (width_x / width_y) and a re-run
#   of this cell would not be idempotent.
# * validate='many_to_one' raises if an image is listed with more than one size,
#   which would otherwise silently duplicate its predictions.
# * The merge is an inner join: predictions for files absent from `actual_data`
#   are dropped.
model_a = pd.merge(model_a.drop(columns=['height', 'width'], errors='ignore'),
                   actual_data, on='file_name', validate='many_to_one')
model_b = pd.merge(Model_b.drop(columns=['height', 'width'], errors='ignore'),
                   actual_data, on='file_name', validate='many_to_one')

In [21]:
# When model A's file already had its own width/height, the merge suffixes the
# clashing columns; restore the left-hand (`_x`) pair under the plain names.
has_plain_width = 'width' in model_a.columns
if not has_plain_width:
    suffixed_to_plain = {'width_x' : 'width' , 'height_x' : 'height' }
    model_a = model_a.rename(columns = suffixed_to_plain)

In [22]:
# Add the normalised corner columns (x1, y1, x2, y2) to both prediction frames;
# the fusion loop below reads exactly these columns.
model_a = scale_box(model_a)
model_b = scale_box(model_b)

In [23]:
# Whole-dataset lists with one [x1, y1, x2, y2] array per predicted box.
# NOTE(review): neither list is referenced by the later cells shown here (the
# fusion loop rebuilds per-image lists itself) -- candidate for removal.
model_a_lst = list(model_a[['x1', 'y1', 'x2', 'y2']].values)
model_b_lst = list(model_b[['x1', 'y1', 'x2', 'y2']].values)

In [24]:

# Fuse the boxes of model A and model B image by image with
# `weighted_boxes_fusion`, then convert the fused boxes back to pixel units.

# Fusion settings are identical for every image, so they are defined once.
weights = [1, 2]        # per-model weights, in the order [model A, model B]
iou_thr = 0.5
skip_box_thr = 0.0001

df_holder = []
all_images = actual_data['file_name'].unique()
for img in all_images:

    temp_df_a = model_a[model_a['file_name'] == img]
    temp_df_b = model_b[model_b['file_name'] == img]

    # A model may have no detections for an image. Only frames that do have
    # rows can supply the image size; with none at all there is nothing to fuse.
    populated = [frame for frame in (temp_df_a, temp_df_b) if not frame.empty]
    if not populated:
        continue
    height = populated[0]['height'].values[0]
    width = populated[0]['width'].values[0]

    # sanity check: every model must report the same height AND the same width.
    # (Written out explicitly: `not (h_equal) and (w_equal)` only fired when the
    # heights differed while the widths matched.)
    if any(frame['height'].values[0] != height or frame['width'].values[0] != width
           for frame in populated):
        raise ValueError("something went wrong")

    lst_temp_a = list(temp_df_a[['x1', 'y1', 'x2', 'y2']].values)
    lst_temp_b = list(temp_df_b[['x1', 'y1', 'x2', 'y2']].values)

    cf_a = temp_df_a['detection_score'].values
    cf_b = temp_df_b['detection_score'].values

    # Single-class problem: every box gets the same label.
    labels_a = np.repeat(1, temp_df_a.shape[0])
    labels_b = np.repeat(1, temp_df_b.shape[0])

    boxes_list = [lst_temp_a, lst_temp_b]
    scores_list = [cf_a, cf_b]
    labels_list = [labels_a, labels_b]

    boxes, scores, labels = weighted_boxes_fusion(boxes_list, scores_list, labels_list,
                                                  weights=weights, iou_thr=iou_thr, skip_box_thr=skip_box_thr)

    # Nothing survived the fusion for this image: skip it rather than pushing
    # an empty frame through the row-wise helpers below.
    if len(boxes) == 0:
        continue

    # rebuilding df
    rebuild_df = pd.DataFrame(boxes, columns = ['xmin', 'ymin', 'xmax', 'ymax'])
    rebuild_df['width'] = width
    rebuild_df['height'] = height
    rebuild_df['detection_score'] = scores
    rebuild_df['file_name'] = img
    rebuild_df = de_scale_box(rebuild_df)
    rebuild_df['section_id'] = rebuild_df.apply(bulid_section_id, axis=1)
    df_holder.append(rebuild_df)

if not df_holder:
    raise ValueError("no fused boxes were produced for any image")

ensemble_df = pd.concat(df_holder, axis = 0)

# Round trip through CSV: gives a fresh 0..n-1 index and CSV-parsed dtypes, i.e.
# the same frame a later `pd.read_csv` of the saved ensemble would yield.
ensemble_df.to_csv('temp.csv', index= False)
ensemble_df = pd.read_csv('temp.csv')
 


In [9]:
# Sweep the confidence cut-offs for the fused predictions.
# NOTE(review): this cell's execution count (9) is lower than that of the
# loading/fusion cells (19-24), so the table below appears to come from the
# earlier 4A + 4B run, not from the frame currently held in `ensemble_df`.
get_score(ensemble_df) # 4A + 4B

100%|████████████████████████████████████████████████████████████████████████████████████| 7/7 [01:15<00:00, 10.74s/it]


Unnamed: 0,confidence,board_score,ground_labels_count,actual_count_worms,pred_count_worms
0,0.35,236.6,6220.0,9594.0,6662.0
1,0.4,238.74,6220.0,9594.0,6523.0
2,0.45,240.64,6220.0,9594.0,6417.0
3,0.5,241.88,6220.0,9594.0,6300.0
4,0.55,243.21,6220.0,9594.0,6181.0
5,0.6,243.73,6220.0,9594.0,6061.0
6,0.65,243.19,6220.0,9594.0,5925.0


In [10]:

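# 4A + 4B (first-stage ensemble): writes the file that the loading cell reads as model A.
# Left commented out, presumably so that re-running the notebook for a later
# fusion round does not overwrite it; uncomment only while `ensemble_df` holds
# the 4A + 4B result.
# ensemble_df.to_csv('Ensemble_4a_4b.csv', index =  False)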

In [26]:
# Persist the fused predictions of this round: model A was loaded from
# 'Ensemble_4a_4b.csv' and model B from the exp_5_b file, hence the name.
ensemble_df.to_csv('Ensemble_4a_4b_5b.csv', index =  False)