In [1]:
import numpy as np
import pandas as pd
import json
import os
from PIL import Image

In [192]:
# Load the annotation export and give the two exported columns short names.
# Right after this step 'count' still holds the raw OUTPUT:path text.
df = (
    pd.read_csv('./datasets/train-all.csv', index_col="Unnamed: 0")
    .rename(columns={'INPUT:image': 'image', 'OUTPUT:path': 'count'})
)

In [193]:
# Replace the raw annotation text with the number of 'shape' occurrences in it,
# and reduce every image path to its bare file name.
df['count'] = df['count'].map(lambda annotation: annotation.count('shape'))
df['image'] = df['image'].map(lambda path: os.path.basename(path))

In [185]:
# Preview the annotation table (the notebook renders a truncated view).
df

Unnamed: 0,image,count,rle
0,DJI_0037-400.jpg,1,1942571 3 1945269 9 1947967 15 1950665 21 1953...
1,DJI_0080-1300.jpg,1,1504852 3 1507554 9 1510257 15 1512959 21 1515...
2,DJI_0075-1000.jpg,1,3120742 2 3123444 6 3126147 10 3128850 13 3131...
3,DJI_0018-1075.jpg,1,841684 2 844385 7 847086 12 849787 17 852488 2...
4,DJI_0071-2675.jpg,2,36947 5 39641 10 42335 12 45030 13 47724 15 50...
...,...,...,...
5242,DJI_0071-2175.jpg,1,1942439 4 1945137 11 1947835 18 1950533 25 195...
5244,DJI_0037-0.jpg,1,1047560 1 1050263 3 1052966 5 1055669 7 105837...
5246,DJI_0049-1975.jpg,1,2651023 19 2653720 42 2656418 49 2659115 58 26...
5247,DJI_0083-900.jpg,1,1875487 1 1878189 3 1880891 6 1880933 2 188359...


In [38]:
def rle_decode(mask_rle, shape):
    '''
    Decode a run-length encoded mask string into a 2-D array.

    mask_rle: space-separated "start length" pairs; starts are 1-based
              offsets into the flattened (row-major) mask
    shape: (height, width) of the array to return
    Returns a numpy uint8 array, 1 - mask, 0 - background
    '''
    tokens = mask_rle.split()
    # Even positions are run starts (shifted to 0-based), odd positions are run lengths
    run_starts = np.asarray(tokens[0::2], dtype=int) - 1
    run_lengths = np.asarray(tokens[1::2], dtype=int)
    run_ends = run_starts + run_lengths

    flat_mask = np.zeros(shape[0] * shape[1], dtype=np.uint8)
    for begin, end in zip(run_starts, run_ends):
        flat_mask[begin:end] = 1
    return flat_mask.reshape(shape)

In [117]:
def rle_encode(img):
    '''
    Run-length encode a mask as a "start length start length ..." string.

    img: numpy array; non-zero pixels are mask, zeros are background
    Returns space-separated "start length" pairs, with 1-based starts into the
    row-major flattened image. An all-background image yields ''.

    The input is binarised before encoding. Without that, a mask holding values
    other than 0/1 (for example the sum of overlapping masks) would be split at
    every value change and could yield an odd, unparsable number of tokens.
    '''
    # Collapse to 0/1 so edges are only detected at mask/background boundaries
    pixels = (np.asarray(img).ravel() != 0).astype(np.uint8)
    # Zero padding guarantees every run has both a rising and a falling edge
    pixels = np.concatenate([[0], pixels, [0]])
    runs = np.where(pixels[1:] != pixels[:-1])[0] + 1
    # runs alternates (start, end); turn each end into a run length
    runs[1::2] -= runs[::2]

    return ' '.join(str(x) for x in runs)

In [194]:
# Rows per image file name, keeping only the names that occur more than once.
duplicates = df.groupby('image').size().loc[lambda sizes: sizes > 1]

In [108]:
# Show how many rows each duplicated image name has.
duplicates

image
DJI_0008-0.jpg       2
DJI_0008-100.jpg     2
DJI_0008-1075.jpg    3
DJI_0008-1100.jpg    3
DJI_0008-1125.jpg    3
                    ..
DJI_0265-675.jpg     2
DJI_0265-725.jpg     2
DJI_0265-75.jpg      2
DJI_0265-750.jpg     2
DJI_0265-775.jpg     2
Length: 1340, dtype: int64

In [186]:
# Directory holding the training images. get_resolution builds a path by plain
# string concatenation with the file name, so the trailing slash is required.
image_folder = './datasets/train/train/'

def replace_duplicates(df, rows):
    '''
    Swap every duplicated image entry for its merged replacement row.

    df: DataFrame with an 'image' column
    rows: iterable of row records (e.g. dicts keyed by column name) to add
    Returns a new DataFrame in which all rows whose 'image' value occurs more
    than once have been removed and `rows` appended at the end. The input
    frame is not modified. When `rows` is non-empty the index is renumbered
    from 0; when it is empty the surviving rows keep their original index.
    '''
    # keep=False drops *all* members of a duplicate group, not just the extras
    df = df.drop_duplicates(subset='image', keep=False)

    rows = list(rows)
    if not rows:
        return df

    # DataFrame.append was removed in pandas 2.0; one concat also avoids
    # re-copying the whole frame once per appended row.
    return pd.concat([df, pd.DataFrame(rows)], ignore_index=True)
        
def get_resolution(image_path: str):
    '''
    Return the (height, width) of an image stored under `image_folder`.

    image_path: file name that is appended to the module-level `image_folder`
    '''
    with Image.open(image_folder + image_path) as picture:
        # PIL reports size as (width, height); callers expect (height, width)
        n_cols, n_rows = picture.size
    return (n_rows, n_cols)

In [191]:
def zip_duplicate_groups(groups):
    '''
    Merge the RLE masks of every duplicated image into one mask per image.

    groups: pandas Series (or any object with an `.index`) whose index holds
            the file names of images that appear in several rows of the
            module-level `df`
    Returns a list of dicts with keys 'image', 'count' and 'rle', one per
    image. 'rle' encodes the union of all masks of that image and 'count' is
    the number of rows that were merged.
    '''
    unified = []

    for image in groups.index:
        dup = df[df['image'] == image]
        height, width = get_resolution(image)
        complete_mask = np.zeros((height, width), dtype=np.uint8)

        for rle in dup['rle']:
            mask = rle_decode(rle, (height, width))
            # Union, not sum: where masks overlap a sum yields 2, and the
            # resulting non-binary mask cannot be run-length encoded correctly.
            complete_mask = np.maximum(complete_mask, mask)

        complete_rle = rle_encode(complete_mask)
        # NOTE(review): 'count' is the number of merged rows, not the sum of
        # the per-row 'count' values - confirm this is the intended meaning.
        unified.append({'image': image, 'count': len(dup), 'rle': complete_rle})

    return unified

In [195]:
# Build one merged record per duplicated image (opens every such image to read its size).
deduplicated = zip_duplicate_groups(duplicates)

In [196]:
# Drop all rows of duplicated images and add the merged records instead.
# Not idempotent: re-running this cell on the already merged df drops nothing
# (each image is unique by then) and adds the merged records a second time.
df = replace_duplicates(df, deduplicated)

In [197]:
# Sanity check: recount rows per image name; no name should occur more than once now.
reduplicates = df.groupby('image').size().loc[lambda sizes: sizes > 1]

In [198]:
# Expected to be an empty Series when the merge removed every duplicate.
reduplicates


Series([], dtype: int64)

In [199]:
# Preview the merged table (one row per image).
df

Unnamed: 0,image,count,rle
0,DJI_0034-1125.jpg,1,71441 5 74146 13 76850 22 79555 30 82259 39 84...
1,DJI_0034-3225.jpg,1,1009140 1 1011842 4 1014544 7 1017247 9 101994...
2,DJI_0013-750.jpg,1,2624555 5 2628395 14 2632234 24 2636074 33 263...
3,DJI_0071-1275.jpg,2,1423967 1 1426669 3 1429371 6 1432074 8 143477...
4,DJI_0075-200.jpg,1,1593618 3 1596316 9 1599015 15 1601714 21 1604...
...,...,...,...
1719,DJI_0265-675.jpg,2,4349894 2 4353731 4 4357568 6 4361405 8 436524...
1720,DJI_0265-725.jpg,2,4367901 3 4371741 7 4375580 12 4379420 17 4383...
1721,DJI_0265-75.jpg,2,4145490 2 4149330 6 4153170 10 4157010 14 4160...
1722,DJI_0265-750.jpg,2,3446320 2 3450158 6 3453996 10 3457835 14 3461...


In [201]:
# Persist the merged table; the index is written as the first (unnamed) column.
df.to_csv('./datasets/merged-train-all.csv')