In [1]:
import boto3
import json
from tqdm import tqdm
import pandas as pd
import numpy as np
from io import BytesIO
import ast
import random
import PIL.Image as Image
import time

from functools import partial
from aquabyte.lib.db.snowflake import snowflake_query_to_df
SNOWFLAKE_DSN = '/dsn/snowflake/mochi'
snowflake_query_to_df = partial(snowflake_query_to_df, ssm_name=SNOWFLAKE_DSN)

import os

In [2]:
ROOT_DIR = "/workspace/mnt/"
TRAIN_SAVE_PATH = f"{ROOT_DIR}data/train2017/"
VAL_SAVE_PATH = f"{ROOT_DIR}data/val2017/"
ANNOTATION_SAVE_PATH = f"{ROOT_DIR}data/annotations/"
VAL_FRAC = 0.1
os.makedirs(TRAIN_SAVE_PATH, exist_ok=True)
os.makedirs(VAL_SAVE_PATH, exist_ok=True)
os.makedirs(ANNOTATION_SAVE_PATH, exist_ok=True)

# latest plali queues
PLALI_QUEUE_NAMES = [
    "fish_detector_visibility_belsvik_and_ras_f3r",
    "fish_bbox_fish_bbox_laksefjord_smolt_ras",
    "fish_bbox_smolt_ras_gtsf_thumbnails_2024_01_19",
    #"fish_bbox_imr_austevoll_jellyfish",
    "fish_bbox_fish_bbox_100day_sampled",
    #"fish_bbox_novasea_slaughter_line",
    #"fish_bbox_synthetic_images",
    
    #"fish_bbox_toy_fish",
    #"fish_bbox_penflix_plali_samples",
    #"fish_bbox_in_air_gtsf",
]

# these constants are used to load the dataset used to train the previous model
OLD_BUCKET = "s3://aquabyte-frames-resized-inbound/"
NEW_BUCKET = "s3://aquabyte-research/pwais/mft-pg/datasets_s3/high_recall_fish1/images/"
S3_CSV_PATH = "s3://aquabyte-research/pwais/mft-pg/datasets_s3/high_recall_fish1/hrf_with_keypoint_visibility_dataset.csv"

category_id_to_name_json = '{"0": "HIGH", "1": "LOW", "2": "MEDIUM", "3": "PARTIAL"}' 
id_to_category = json.loads(category_id_to_name_json)
category_to_id = dict((c, i) for i, c in id_to_category.items())

s3 = boto3.client('s3')

## load annotations from PLALI

In [3]:
# make the additional sql using PLALI_QUEUE_NAMES
selection_criteria = " or ".join([f"startswith(pw.name, '{q}')" for q in PLALI_QUEUE_NAMES])

sql=f"""   
select
    --load json and select first element of pi.images
    pi.images[0]::varchar as images,
    pa.annotation:annotations as annotation,
    -- find if the annotation is a skip by checking if skipReasons exist as a key
    pa.annotation:skipReasons is not null as is_skip,
    pa.plali_image_id,
    --pa.annotator_email,
    --pa.annotation_time,
    pw.name
from
    prod.plali_workflows as pw
    join prod.plali_images as pi on pi.workflow_id = pw.id
    left join prod.plali_annotations as pa on pa.plali_image_id = pi.id
where true
    and is_skip = false
    and ({selection_criteria})
"""

df_annotations = snowflake_query_to_df(sql)
df_annotations['annotation'] = df_annotations['annotation'].apply(lambda x: json.loads(x) if x is not None else [])
# change the name by removing the suffix, anything that comes after the last underscore
df_annotations['name'] = df_annotations['name'].apply(lambda x: x.rsplit('_q', 1)[0])

Unnamed: 0,images,annotation,is_skip,plali_image_id,name
0,s3://aquabyte-datasets-images/aquabyte-frames-...,"[{'category': 'fish_visibility', 'height': 188...",False,adf5f53c-5cc1-4430-9f16-a64d8f1f6e31,fish_detector_visibility_belsvik_and_ras_f3r_q...
1,s3://aquabyte-datasets-images/aquabyte-frames-...,"[{'category': 'fish_visibility', 'height': 276...",False,8cdf64f9-f1fa-4dc9-bd5e-9d5bc802132a,fish_detector_visibility_belsvik_and_ras_f3r_q...
2,s3://aquabyte-datasets-images/aquabyte-frames-...,"[{'category': 'fish_visibility', 'height': 818...",False,262009bb-704c-4536-82c4-9fe64806501c,fish_detector_visibility_belsvik_and_ras_f3r_q...
3,s3://aquabyte-datasets-images/aquabyte-frames-...,"[{'category': 'fish_visibility', 'height': 709...",False,9b86dae4-c4ca-4d98-913d-7e50a62e984a,fish_detector_visibility_belsvik_and_ras_f3r_q...
4,s3://aquabyte-datasets-images/aquabyte-frames-...,"[{'category': 'fish_visibility', 'height': 724...",False,055ae01c-4ba5-4b2f-94f4-df2786f6e172,fish_detector_visibility_belsvik_and_ras_f3r_q...
...,...,...,...,...,...
4860,s3://aquabyte-datasets-images/aquabyte-frames-...,"[{'category': 'fish_visibility', 'height': 171...",False,5dd00ae1-20d7-45a1-9d47-0c0a7aa80f90,fish_detector_visibility_belsvik_and_ras_f3r_q...
4861,s3://aquabyte-datasets-images/aquabyte-frames-...,"[{'category': 'fish_visibility', 'height': 887...",False,b8ca6f9b-d225-42b2-a7ec-78ab84d436af,fish_detector_visibility_belsvik_and_ras_f3r_q...
4862,s3://aquabyte-datasets-images/aquabyte-frames-...,"[{'category': 'fish_visibility', 'height': 124...",False,9bf98f7b-7c0a-4e5c-a736-d56c9f8f8ab3,fish_detector_visibility_belsvik_and_ras_f3r_q...
4863,s3://aquabyte-datasets-images/aquabyte-frames-...,"[{'category': 'fish_visibility', 'height': 682...",False,d9edb3d7-4045-4261-98f8-2a550e5b4512,fish_detector_visibility_belsvik_and_ras_f3r_q...


## load the original high recall fish detector training set

In [4]:
# # use boto to load a csv file from s3
bucket, key = S3_CSV_PATH[5:].split('/', 1)
obj = s3.get_object(Bucket=bucket, Key=key)
df_hr_dataset = pd.read_csv(BytesIO(obj['Body'].read()))
# load the literals using ast for the following columns: images, metadata, label_set, original_annotation
df_hr_dataset['images'] = df_hr_dataset['images'].apply(lambda x: ast.literal_eval(x)[0])
df_hr_dataset['metadata'] = df_hr_dataset['metadata'].apply(lambda x: ast.literal_eval(x))
df_hr_dataset['label_set'] = df_hr_dataset['label_set'].apply(lambda x: ast.literal_eval(x))
df_hr_dataset['original_annotation'] = df_hr_dataset['original_annotation'].apply(lambda x: ast.literal_eval(x))
df_hr_dataset['annotation'] = df_hr_dataset['annotation'].apply(lambda x: ast.literal_eval(x))
df_hr_dataset['pen_id'] = df_hr_dataset['images'].apply(lambda x: x.split('/pen-id=')[1].split('/')[0])
df_hr_dataset['captured_at'] = df_hr_dataset['images'].apply(lambda x: x.split('/at=')[1].split('/')[0])

# replace the image path with the new bucket in images
df_hr_dataset['images'] = df_hr_dataset['images'].apply(lambda x: x.replace(OLD_BUCKET, NEW_BUCKET))
df_hr_dataset['is_skip'] = False

#   drop columns starting with "Unnamed"
df_hr_dataset = df_hr_dataset.loc[:, ~df_hr_dataset.columns.str.contains('^Unnamed')]

Unnamed: 0,id,plali_image_id,original_annotation,annotator_email,plali_image_id.1,images,metadata,workflow_id,name,label_set,full_count,full_and_partial_count,annotation,path_on_quad,pen_id,captured_at,is_skip
0,1865758d-36c7-4422-a408-a4a5c0dadfd6,7ec50a97-7ef8-43e6-851f-0bbcca5a8bb2,"{'isPartial': True, 'annotations': [{'label': ...",labeler41@cogitotech.com,7ec50a97-7ef8-43e6-851f-0bbcca5a8bb2,s3://aquabyte-research/pwais/mft-pg/datasets_s...,"{'tags': ['medium_turbidity'], 'crops': [{'id'...",00000000-0000-0000-0000-000000000112,fish_detection_v2,"{'labelSets': [{'name': 'FISH', 'type': 'bbox'...",2,12,"[{'label': 'HIGH', 'width': 408, 'xCrop': 54, ...",/data8tb/biomass-retrain/Image00000.jpg,56,2020-07-06T13:23:13.686606000Z,False
1,b58fb82e-23bf-4329-a333-2f470d2bc8ba,501bc9e3-c6da-4295-8277-c147ca15a4cf,"{'isPartial': True, 'annotations': [{'label': ...",labeler39@cogitotech.com,501bc9e3-c6da-4295-8277-c147ca15a4cf,s3://aquabyte-research/pwais/mft-pg/datasets_s...,"{'tags': ['medium_turbidity'], 'crops': [{'id'...",00000000-0000-0000-0000-000000000112,fish_detection_v2,"{'labelSets': [{'name': 'FISH', 'type': 'bbox'...",2,6,"[{'label': 'PARTIAL', 'width': 177, 'xCrop': 2...",/data8tb/biomass-retrain/Image00001.jpg,56,2020-07-09T08:17:06.626281000Z,False
2,239df62a-664e-40ba-ba3b-0b0ac04efb85,8a1a3d87-8a2d-47e5-8670-337fb9d317dc,"{'isPartial': True, 'annotations': [{'label': ...",labeler32@cogitotech.com,8a1a3d87-8a2d-47e5-8670-337fb9d317dc,s3://aquabyte-research/pwais/mft-pg/datasets_s...,"{'tags': ['medium_turbidity'], 'crops': [], 'p...",00000000-0000-0000-0000-000000000112,fish_detection_v2,"{'labelSets': [{'name': 'FISH', 'type': 'bbox'...",1,3,"[{'label': 'PARTIAL', 'width': 161, 'xCrop': 0...",/data8tb/biomass-retrain/Image00002.jpg,100,2020-07-13T16:25:59.040183000Z,False
3,77e5b95e-ebb9-4444-813d-3894a2452956,9288fc60-ba5b-4da3-a147-b45235ebc826,"{'isPartial': True, 'annotations': [{'label': ...",labeler39@cogitotech.com,9288fc60-ba5b-4da3-a147-b45235ebc826,s3://aquabyte-research/pwais/mft-pg/datasets_s...,"{'tags': ['low_turbidity', 'small_fish'], 'cro...",00000000-0000-0000-0000-000000000112,fish_detection_v2,"{'labelSets': [{'name': 'FISH', 'type': 'bbox'...",5,25,"[{'label': 'PARTIAL', 'width': 208, 'xCrop': 3...",/data8tb/biomass-retrain/Image00003.jpg,173,2021-01-03T15:04:32.449639000Z,False
4,a1ab3c2e-dcef-423f-a041-01fd971c996e,57d6a0f8-2a1e-4587-8883-c6948a2ea069,"{'isPartial': True, 'annotations': [{'label': ...",labeler36@cogitotech.com,57d6a0f8-2a1e-4587-8883-c6948a2ea069,s3://aquabyte-research/pwais/mft-pg/datasets_s...,"{'tags': ['high_turbidity'], 'crops': [{'id': ...",00000000-0000-0000-0000-000000000112,fish_detection_v2,"{'labelSets': [{'name': 'FISH', 'type': 'bbox'...",1,7,"[{'label': 'PARTIAL', 'width': 414, 'xCrop': 9...",/data8tb/biomass-retrain/Image00004.jpg,95,2020-06-22T09:57:03.681510000Z,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10748,26237944-1e11-4f4f-8667-dc9d0796f56b,409ac351-7017-4d1a-89f0-7d8a127aa192,{'skipReasons': ['is_dark']},labeler41@cogitotech.com,409ac351-7017-4d1a-89f0-7d8a127aa192,s3://aquabyte-research/pwais/mft-pg/datasets_s...,"{'tags': ['low_turbidity'], 'crops': [], 'pen_...",00000000-0000-0000-0000-000000000112,fish_detection_v2,"{'labelSets': [{'name': 'FISH', 'type': 'bbox'...",0,0,[],/data8tb/biomass-retrain/Image10748.jpg,194,2021-01-03T16:19:24.080397000Z,False
10749,73161391-987b-472b-9243-2fd3a1932a63,e6691b4e-a676-4cf4-9049-7002a8d5be70,"{'isPartial': False, 'annotations': [{'label':...",labeler39@cogitotech.com,e6691b4e-a676-4cf4-9049-7002a8d5be70,s3://aquabyte-research/pwais/mft-pg/datasets_s...,"{'tags': ['medium_turbidity', 'trout'], 'crops...",00000000-0000-0000-0000-000000000112,fish_detection_v2,"{'labelSets': [{'name': 'FISH', 'type': 'bbox'...",5,6,"[{'label': 'MEDIUM', 'width': 214, 'xCrop': 18...",/data8tb/biomass-retrain/Image10749.jpg,86,2020-06-10T14:16:17.568247000Z,False
10750,f2220bfd-5eec-449c-bcc5-1a1f6f8b4420,23306b76-40e7-40b5-bc64-7c9c14581db0,"{'isPartial': True, 'annotations': [{'label': ...",labeler41@cogitotech.com,23306b76-40e7-40b5-bc64-7c9c14581db0,s3://aquabyte-research/pwais/mft-pg/datasets_s...,"{'tags': ['medium_turbidity', 'trout'], 'crops...",00000000-0000-0000-0000-000000000112,fish_detection_v2,"{'labelSets': [{'name': 'FISH', 'type': 'bbox'...",5,9,"[{'label': 'PARTIAL', 'width': 235, 'xCrop': 2...",/data8tb/biomass-retrain/Image10750.jpg,86,2020-06-10T14:19:23.612808000Z,False
10751,f763d581-d1a9-4579-a539-e53d7a0a9339,97f8b345-0dce-4e26-8def-7fed3daab825,"{'isPartial': True, 'annotations': [{'label': ...",labeler41@cogitotech.com,97f8b345-0dce-4e26-8def-7fed3daab825,s3://aquabyte-research/pwais/mft-pg/datasets_s...,"{'tags': ['high_turbidity'], 'crops': [{'id': ...",00000000-0000-0000-0000-000000000112,fish_detection_v2,"{'labelSets': [{'name': 'FISH', 'type': 'bbox'...",3,6,"[{'label': 'HIGH', 'width': 203, 'xCrop': 246,...",/data8tb/biomass-retrain/Image10751.jpg,4,2020-06-13T07:30:19.091491000Z,False


In [5]:
# #check if original annotation is the same as annotation, which is indeed the case
# df_hr_dataset['is_same'] = df_hr_dataset.apply(lambda x: x['original_annotation'].get('annotations', []) == x['annotation'], axis=1)

# # get subset of the dataset where original_annotation is not the same as annotation
# df_hr_dataset_not_same = df_hr_dataset[~df_hr_dataset['is_same']]
# df_hr_dataset_not_same

## merge datasets, download all images, create coco jsons

In [6]:
# concat the two dataframes, keeping only the columns that are in df_annotations
df_hr_dataset = df_hr_dataset[df_annotations.columns]
df_hr_dataset = pd.concat([df_hr_dataset, df_annotations], ignore_index=True)
# sort by plali_image_id
df_hr_dataset = df_hr_dataset.sort_values(by='plali_image_id').reset_index(drop=True)

Unnamed: 0,images,annotation,is_skip,plali_image_id,name
0,s3://aquabyte-research/pwais/mft-pg/datasets_s...,"[{'label': 'PARTIAL', 'width': 207, 'xCrop': 3...",False,00000e98-e5cc-4788-a144-bd1fbd13822f,fish_detection_v2
1,s3://aquabyte-research/pwais/mft-pg/datasets_s...,"[{'label': 'MEDIUM', 'width': 217, 'xCrop': 23...",False,0007dcbf-4210-4e75-b53f-a54dd892fb0d,fish_detection_v2
2,s3://aquabyte-research/pwais/mft-pg/datasets_s...,"[{'label': 'HIGH', 'width': 244, 'xCrop': 203,...",False,0010c967-c5b3-4e17-a1da-4a9e1c4b9f09,fish_detection_v2
3,s3://aquabyte-research/pwais/mft-pg/datasets_s...,"[{'label': 'PARTIAL', 'width': 307, 'xCrop': 0...",False,00119e52-5dda-4b42-8dc2-4b760b4f1321,fish_detection_v2
4,s3://aquabyte-datasets-images/aquabyte-frames-...,"[{'category': 'fish_visibility', 'height': 101...",False,0018c809-742e-4b31-8ade-718238561bd3,fish_detector_visibility_belsvik_and_ras_f3r_q...
...,...,...,...,...,...
15613,s3://aquabyte-datasets-images/aquabyte-frames-...,[],False,,fish_detector_visibility_belsvik_and_ras_f3r_q...
15614,s3://aquabyte-datasets-images/aquabyte-frames-...,[],False,,fish_detector_visibility_belsvik_and_ras_f3r_q...
15615,s3://aquabyte-datasets-images/aquabyte-frames-...,[],False,,fish_detector_visibility_belsvik_and_ras_f3r_q...
15616,s3://aquabyte-datasets-images/aquabyte-frames-...,[],False,,fish_detector_visibility_belsvik_and_ras_f3r_q...


In [7]:
# download the images from images column to the local machine, save them in the IMAGE_SAVE_PATH
local_paths = []
not_found = []
is_val = []
rand = random.Random(1337)

for i, row in tqdm(df_hr_dataset.iterrows(), total=df_hr_dataset.shape[0]):
    is_val_image = rand.random() < VAL_FRAC
    remote_path = row['images']
    file_extension = remote_path.split('.')[-1]
    if is_val_image:
        local_path = f"{VAL_SAVE_PATH}{row.plali_image_id}.{file_extension}"
        is_val.append(True)
    else:
        local_path = f"{TRAIN_SAVE_PATH}{row.plali_image_id}.{file_extension}"
        is_val.append(False)
    bucket = remote_path.split('/')[2]
    key = '/'.join(remote_path.split('/')[3:])
    local_paths.append(local_path)
    # check if the file already exists
    if os.path.exists(local_path):
        continue
    # download the file, catch if the file is not found
    try:
        s3.download_file(bucket, key, local_path)
    except Exception as e:
        not_found.append(row.plali_image_id)
        print(f"{remote_path}\n{e}")
        print(f"{bucket}/{key}")
df_hr_dataset['local_path'] = local_paths
df_hr_dataset['is_val_image'] = is_val
# drop the rows where the image was not found
df_hr_dataset_found = df_hr_dataset[~df_hr_dataset['plali_image_id'].isin(not_found)]
df_hr_dataset_found = df_hr_dataset_found.reset_index(drop=True).copy()


 30%|█████████████████████████████████████████████████▎                                                                                                                 | 4722/15618 [00:01<00:02, 4351.59it/s]

s3://aquabyte-research/pwais/mft-pg/datasets_s3/high_recall_fish1/images/environment=production/site-id=39/pen-id=56/date=2020-07-08/hour=12/at=2020-07-08T12:56:46.231615000Z/left_frame.resize_512_512.jpg
An error occurred (404) when calling the HeadObject operation: Not Found
aquabyte-research/pwais/mft-pg/datasets_s3/high_recall_fish1/images/environment=production/site-id=39/pen-id=56/date=2020-07-08/hour=12/at=2020-07-08T12:56:46.231615000Z/left_frame.resize_512_512.jpg


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 15618/15618 [00:01<00:00, 8231.40it/s]


In [8]:
# loop through the images and get the image height and width
image_heights = []
image_widths = []
for i, row in tqdm(df_hr_dataset_found.iterrows(), total=df_hr_dataset_found.shape[0]):
    local_path = row['local_path']
    im = Image.open(local_path)
    image_heights.append(im.height)
    image_widths.append(im.width)
df_hr_dataset_found['image_height'] = image_heights
df_hr_dataset_found['image_width'] = image_widths


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 15617/15617 [00:03<00:00, 4187.93it/s]


In [9]:
# loop through df_hr_dataset_found.annotations and check each annotation has a label, width, height, xCrop, and yCrop, if there is a missing value, drop the dict
def clean_annotations(annotations:list):
    for annotation in annotations:
        if any([annotation.get('label') is None, annotation.get('width') is None, annotation.get('height') is None, annotation.get('xCrop') is None, annotation.get('yCrop') is None]):
            print(f"dropping annotation {annotation} from {annotations} because it is missing a value")
            time.sleep(1)
            annotations.remove(annotation)
    return annotations

df_hr_dataset_found['annotation'] = df_hr_dataset_found['annotation'].apply(clean_annotations)
#save the dataframe to a csv file
df_hr_dataset_found.to_csv(f"{ROOT_DIR}hrf_with_keypoint_visibility_dataset.csv", index=False)

dropping annotation {'category': 'fish_visibility', 'label': 'LOW'} from [{'category': 'fish_visibility', 'height': 1502, 'label': 'PARTIAL', 'width': 1260, 'xCrop': 2836, 'yCrop': 805}, {'category': 'fish_visibility', 'height': 717, 'label': 'MEDIUM', 'width': 1919, 'xCrop': 535, 'yCrop': 585}, {'category': 'fish_visibility', 'height': 681, 'label': 'LOW', 'width': 1544, 'xCrop': 82, 'yCrop': 1209}, {'category': 'fish_visibility', 'height': 612, 'label': 'PARTIAL', 'width': 1370, 'xCrop': 2562, 'yCrop': 1833}, {'category': 'fish_visibility', 'height': 526, 'label': 'PARTIAL', 'width': 1192, 'xCrop': 1126, 'yCrop': 0}, {'category': 'fish_visibility', 'height': 357, 'label': 'PARTIAL', 'width': 743, 'xCrop': 234, 'yCrop': 1093}, {'category': 'fish_visibility', 'height': 363, 'label': 'PARTIAL', 'width': 701, 'xCrop': 2538, 'yCrop': 838}, {'category': 'fish_visibility', 'height': 299, 'label': 'PARTIAL', 'width': 733, 'xCrop': 948, 'yCrop': 362}, {'category': 'fish_visibility', 'height':

In [10]:
# adapted from https://github.com/aquabyte-new/research-exploration/blob/master/pwais/mft-pg/mft_utils/coco_dataset.py
annos_val = []
images_val = []
annotations = []
images = []
val_images = []

for _, row in df_hr_dataset_found.iterrows():
    is_val_image = row.is_val_image
    # coco image index is often 1 indexed https://docs.voxel51.com/api/fiftyone.utils.coco.html#fiftyone.utils.coco.add_coco_labels
    img_idx = len(val_images)+1 if is_val_image else len(images)+1
    for bbox in row.annotation:
        anno_id = len(annos_val)+1 if is_val_image else len(annotations)+1
        category_id = int(category_to_id[bbox['label']])
        bbox_x, bbox_y, bbox_w, bbox_h = bbox['xCrop'], bbox['yCrop'], bbox['width'], bbox['height']
        anno = {
            "id": anno_id,
            "image_id": img_idx,
            "category_id": category_id,
            "bbox": [bbox_x, bbox_y, bbox_w, bbox_h],
            # "keypoints": [],
            # "num_keypoints": 0,
            "score": -1,
            "area": bbox_w * bbox_h,
            "iscrowd": 0,
        }
        if is_val_image:
            annos_val.append(anno)
        else:
            annotations.append(anno)

    img_path = row.local_path
    img_fname = os.path.basename(img_path)

    image = {
        "id": img_idx,
        "file_name": img_fname,
        "height": row.image_height,
        "width": row.image_width,
        "source_name": row["name"],
    }
    if is_val_image:
        val_images.append(image)
    else:
        images.append(image)
            
categories = [
{'id': int(id_), 'name': name}
for name, id_ in category_to_id.items()
]

train_json = {
'categories': categories,
'images': images,
'annotations': annotations,
}

val_json = {
'categories': categories,
'images': val_images,
'annotations': annos_val,
}

with open(f"{ANNOTATION_SAVE_PATH}train.json", "w") as f:
    json.dump(train_json, f)

with open(f"{ANNOTATION_SAVE_PATH}val.json", "w") as f:
    json.dump(val_json, f)