# COCO Dataset Generator

In [None]:
# Mounting Google Drive at /content/drive/ (requires the Google Colab runtime)
from google.colab import drive
drive.mount('/content/drive/')

Mounted at /content/drive/


In [None]:
# Specify YOUR working directory. The cells below read the dataset from
# f'{main_dir}/data/' and append main_dir to sys.path.
main_dir = 'YOUR_MAIN_DIR' # e.g. "/content/drive/MyDrive/Deep_Learning_Itay_Sagi/Project/Sartorius_Cell_Instance_Segmentation"

In [None]:
import sys
import os

# Derive the dataset directories from main_dir and make main_dir importable.
if not os.path.exists(main_dir):
  # Fail fast: every later cell needs data_dir, so continuing after a mere
  # print would only surface as a confusing NameError further down.
  raise FileNotFoundError(f'Could not locate drive content directory: {main_dir}')

data_dir = f'{main_dir}/data/'
semi_data_dir = f'{data_dir}train_semi_supervised/'
# Guarded so that re-running this cell does not pile up duplicate entries.
if main_dir not in sys.path:
  sys.path.append(main_dir)



# importing libraries


In [None]:
import pandas as pd
import numpy as np
from tqdm import tqdm
from tqdm.notebook import tqdm
from datetime import datetime
import json,itertools
from typing import Optional
from sklearn.model_selection import StratifiedKFold
import matplotlib.pyplot as plt
import cv2, random


# Train:

In [None]:
# Load the training table from data_dir, then show its shape and first rows
train_df = pd.read_csv(data_dir+'train.csv')
print(train_df.shape)
train_df.head(4)

(73585, 9)


Unnamed: 0,id,annotation,width,height,cell_type,plate_time,sample_date,sample_id,elapsed_timedelta
0,0030fd0e6378,118145 6 118849 7 119553 8 120257 8 120961 9 1...,704,520,shsy5y,11h30m00s,2019-06-16,shsy5y[diff]_E10-4_Vessel-714_Ph_3,0 days 11:30:00
1,0030fd0e6378,189036 1 189739 3 190441 6 191144 7 191848 8 1...,704,520,shsy5y,11h30m00s,2019-06-16,shsy5y[diff]_E10-4_Vessel-714_Ph_3,0 days 11:30:00
2,0030fd0e6378,173567 3 174270 5 174974 5 175678 6 176382 7 1...,704,520,shsy5y,11h30m00s,2019-06-16,shsy5y[diff]_E10-4_Vessel-714_Ph_3,0 days 11:30:00
3,0030fd0e6378,196723 4 197427 6 198130 7 198834 8 199538 8 2...,704,520,shsy5y,11h30m00s,2019-06-16,shsy5y[diff]_E10-4_Vessel-714_Ph_3,0 days 11:30:00


In [None]:

def rle_decode(mask_rle, shape):
    """Decode a run-length encoded string into a binary mask.

    Parameters
    ----------
    mask_rle : str
        Whitespace-separated "start length start length ..." pairs. Starts
        are 1-based positions into the flattened mask.
    shape : tuple
        (height, width) of the mask to return.

    Returns
    -------
    numpy.ndarray
        uint8 array of the given shape: 1 inside the runs, 0 elsewhere.
    """
    tokens = mask_rle.split()
    # Even positions hold the starts, odd positions the run lengths.
    starts = np.asarray(tokens[0::2], dtype=int) - 1  # to 0-based offsets
    lengths = np.asarray(tokens[1::2], dtype=int)
    stops = starts + lengths

    flat = np.zeros(shape[0] * shape[1], dtype=np.uint8)
    for begin, stop in zip(starts, stops):
        flat[begin:stop] = 1
    # The runs index the flattened array in the default (row-major) order.
    return flat.reshape(shape)

# From https://newbedev.com/encode-numpy-array-using-uncompressed-rle-for-coco-dataset
def binary_mask_to_rle(binary_mask):
    """Encode a mask as an uncompressed run-length dictionary.

    The mask is flattened in column-major (Fortran) order and the length of
    each run of equal consecutive values is recorded. When the first pixel
    equals 1, a leading 0 is emitted before the first run.

    Returns
    -------
    dict
        {'counts': list of run lengths, 'size': list(binary_mask.shape)}
    """
    run_lengths = []
    column_major = binary_mask.ravel(order='F')
    for position, (pixel, run) in enumerate(itertools.groupby(column_major)):
        if position == 0 and pixel == 1:
            run_lengths.append(0)
        run_lengths.append(sum(1 for _ in run))
    return {'counts': run_lengths, 'size': list(binary_mask.shape)}

In [None]:
def coco_structure(train_df, cat_ids=None):
    """Convert an annotation table into a COCO-style dictionary.

    Parameters
    ----------
    train_df : pandas.DataFrame
        One row per annotation; the columns read here are ``id`` (image id),
        ``annotation`` (RLE string), ``width``, ``height`` and ``cell_type``.
    cat_ids : dict, optional
        Mapping ``cell_type name -> category id``. Pass the same mapping for
        every split to guarantee identical category ids across the generated
        files. When omitted, ids 1..K are assigned to the *sorted* cell types
        found in ``train_df``, so the result does not depend on row order.

    Returns
    -------
    dict
        ``{'categories': [...], 'images': [...], 'annotations': [...]}``.
        Each annotation holds an uncompressed RLE segmentation, an
        ``[x, y, width, height]`` bbox, the pixel area, and a 1-based id that
        is unique within the returned dictionary.
    """
    if cat_ids is None:
        # Sorted (not order of first appearance) so that separately converted
        # train/validation subsets agree on the name -> id mapping.
        cell_types = sorted(train_df.cell_type.unique(), key=str)
        cat_ids = {name: cat_id for cat_id, name in enumerate(cell_types, start=1)}
    cats = [{'name': name, 'id': cat_id} for name, cat_id in cat_ids.items()]

    images = [
        {
            'id': image_id,
            'width': int(row.width),
            'height': int(row.height),
            'file_name': f'train/{image_id}.png',
        }
        for image_id, row in train_df.groupby('id').agg('first').iterrows()
    ]

    annotations = []
    # total= lets tqdm render a real progress bar; a bare generator has no length.
    for _, row in tqdm(train_df.iterrows(), total=len(train_df)):
        mk = rle_decode(row.annotation, (row.height, row.width))
        ys, xs = np.where(mk)
        if xs.size == 0:
            # An empty mask has no bounding box; skip it instead of crashing.
            continue
        x1, x2 = xs.min(), xs.max()
        y1, y2 = ys.min(), ys.max()
        annotations.append({
            'segmentation': binary_mask_to_rle(mk),
            'bbox': [int(x1), int(y1), int(x2 - x1 + 1), int(y2 - y1 + 1)],
            'area': int(np.sum(mk)),
            'image_id': row['id'],
            'category_id': cat_ids[row.cell_type],
            'iscrowd': 0,
            # 1-based running id: the DataFrame index may contain 0 (which
            # pycocotools' evaluation treats as "no match") or repeat.
            'id': len(annotations) + 1,
        })
    return {'categories': cats, 'images': images, 'annotations': annotations}

In [None]:
# Re-read the training table and reduce it to one row per image id
# (first value of each column within the group); reset_index() turns `id`
# back into a column and gives train_meta a fresh 0..N-1 index.
train_df = pd.read_csv(data_dir+'train.csv')
train_meta = train_df.groupby('id').first().reset_index()

In [None]:
# Assign every image in train_meta to one of n_splits folds, stratified by cell_type.
n_splits=20
skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)
# Only the held-out indices of each split are used; fold numbers start at 1.
# val_idx holds positions, which match the labels used by .loc because
# train_meta was given a fresh index by reset_index().
for fold, (_, val_idx) in enumerate(skf.split(X=train_meta, y=train_meta['cell_type']), 1):
    train_meta.loc[val_idx, 'fold'] = fold
    
# Store the fold labels as small unsigned integers, then show images per fold.
train_meta['fold'] = train_meta['fold'].astype(np.uint8)
train_meta.groupby('fold').size()

fold
1     61
2     61
3     61
4     61
5     61
6     61
7     60
8     60
9     60
10    60
dtype: int64

In [None]:
# Fold roles. Fold numbers start at 1, so test_selected=0 matches no fold and
# the test split is intentionally empty.
val_selected=1     # 95/5 split
test_selected=0
# Every fold that is neither validation nor test goes to training. Derived from
# n_splits instead of a hard-coded range so that changing the fold count or the
# validation fold can neither drop folds nor leak validation images into train.
train_selected = [fold_id for fold_id in range(1, n_splits+1)
                  if fold_id not in (val_selected, test_selected)]

# Image ids belonging to each split
train_ids = train_meta[train_meta["fold"].isin(train_selected)].id
val_ids = train_meta[train_meta["fold"]==val_selected].id
test_ids = train_meta[train_meta["fold"]==test_selected].id

# Annotation rows belonging to each split
df_train = train_df[train_df.id.isin(train_ids)]
df_valid = train_df[train_df.id.isin(val_ids)]
df_test = train_df[train_df.id.isin(test_ids)]

# COCO dictionaries for each split
train_json = coco_structure(df_train)
valid_json = coco_structure(df_valid)
test_json = coco_structure(df_test)

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

In [None]:
# Annotations per cell type in the test split (empty: test_selected=0 matches no fold, folds are numbered from 1)
df_test.groupby('cell_type').size()

Series([], dtype: int64)

In [None]:
# Annotations per cell type in the validation split
df_valid.groupby('cell_type').size()

cell_type
astro     1120
cort       988
shsy5y    5191
dtype: int64

In [None]:
# Annotations per cell type in the training split
df_train.groupby('cell_type').size()

cell_type
astro      9402
cort       9789
shsy5y    47095
dtype: int64

In [None]:
# Write the train and validation COCO dictionaries next to the dataset,
# one JSON file per split (train first, then validation).
coco_outputs = {
    "coco_train_95_5.json": train_json,
    "coco_val_95_5.json": valid_json,
}
for out_name, coco_dict in coco_outputs.items():
    with open(data_dir+out_name, 'w', encoding='utf-8') as f:
        json.dump(coco_dict, f, ensure_ascii=True, indent=4)



# Semi-supervised images:

In [None]:
 def semi_coco_structure(semi_data_dir, width=704, height=520):
     """Build a COCO-style dictionary (categories + images) for a directory.

     Parameters
     ----------
     semi_data_dir : str
         Directory whose entries are listed; every entry becomes one image
         record, and its ``file_name`` is the entry joined onto this directory.
     width, height : int, optional
         Size written into every image record (defaults: 704 x 520).

     Returns
     -------
     dict
         ``{'categories': [...], 'images': [...]}``; no annotations are
         produced. Image ids are consecutive integers starting at 0, assigned
         in sorted file-name order.
     """
     ## categories:
     cats =  [{"id": 1, "name": "shsy5y", "supercategory": "none"}, {"id": 2, "name": "cort", "supercategory": "none"}, {"id": 3, "name": "astro", "supercategory": "none"}]

     ## images:
     # os.listdir returns entries in arbitrary order; sorting makes the
     # file -> id assignment reproducible across runs and machines.
     files = sorted(os.listdir(semi_data_dir))
     images = [
         {'id': image_id, 'width': width, 'height': height, 'file_name': os.path.join(semi_data_dir, file)}
         for image_id, file in enumerate(files)
     ]

     return {'categories':cats, 'images':images}

In [None]:
# Building the COCO structure (categories + image records) from the semi-supervised image directory
semi_root = semi_coco_structure(semi_data_dir)

# Example: the first image record
semi_root['images'][0]

In [None]:
# Saving COCO structure dataset in json file.
# The context manager closes the file even if json.dump raises.
with open(data_dir+"semi_annot.json", "w", encoding='utf-8') as semi_out_file:
    json.dump(semi_root, semi_out_file, ensure_ascii=True, indent=4)