In [1]:
import cv2
import numpy as np
import matplotlib.pyplot as plt
import seaborn
import pandas as pd
from collections import namedtuple, defaultdict
import os
import pickle
import sys
import operator
import glob
import csv 
from math import sqrt

import PIL
from PIL import Image, ImageDraw, ImageFilter

import skimage
import skimage.io
import skimage.measure

import shapely
import shapely.geometry
from shapely.geometry import Polygon
%matplotlib inline

In [2]:
SOURCEDIR = os.path.join('../data/','sealion')

DATADIR = '.'

VERBOSITY = namedtuple('VERBOSITY', ['QUITE', 'NORMAL', 'VERBOSE', 'DEBUG'])(0,1,2,3)

In [3]:
SeaLionCoord = namedtuple('SeaLionCoord', ['tid', 'cls', 'x', 'y'])

In [4]:
class SeaLionData(object):
    
    def __init__(self, sourcedir=SOURCEDIR, datadir=DATADIR, verbosity=VERBOSITY.NORMAL):
        self.sourcedir = sourcedir
        self.datadir = datadir
        self.verbosity = verbosity
        
        self.cls_nb = 5
        
        self.cls_names = (
            'adult_males',
            'subadult_males',
            'adult_females',
            'juveniles',
            'pups',
            'NOT_A_SEA_LION')
            
        self.cls = namedtuple('ClassIndex', self.cls_names)(*range(0,6))
    
        # backported from @bitsofbits. Average actual color of dot centers.
        self.cls_colors = (
            (243,8,5),          # red
            (244,8,242),        # magenta
            (87,46,10),         # brown 
            (25,56,176),        # blue
            (38,174,21),        # green
            )
    
            
        self.dot_radius = 3
        
        self.train_nb = 947
        
        self.test_nb = 18636
       
        self.paths = {
            # Source paths
            'sample'     : os.path.join(sourcedir, 'sample_submission.csv'),
            'counts'     : os.path.join(sourcedir, 'Train', 'train.csv'),
            'train'      : os.path.join(sourcedir, 'Train', '{tid}.jpg'),
            'dotted'     : os.path.join(sourcedir, 'TrainDotted', '{tid}.jpg'),
            'test'       : os.path.join(sourcedir, 'Test', '{tid}.jpg'),
            # Data paths
            'coords'     : os.path.join(datadir, 'coords.csv'),  
            }
        
        # From MismatchedTrainImages.txt
        self.bad_train_ids = (
            3, 7, 9, 21, 30, 34, 71, 81, 89, 97, 151, 184, 215, 234, 242, 
            268, 290, 311, 331, 344, 380, 384, 406, 421, 469, 475, 490, 499, 
            507, 530, 531, 605, 607, 614, 621, 638, 644, 687, 712, 721, 767, 
            779, 781, 794, 800, 811, 839, 840, 869, 882, 901, 903, 905, 909, 
            913, 927, 946)
            
        self._counts = None

        
    @property
    def trainshort_ids(self):
        return (0,1,2,4,5,6,8,10)  # Trainshort1
        #return range(41,51)         # Trainshort2
        
    @property 
    def train_ids(self):
        """List of all valid train ids"""
        tids = range(0, self.train_nb)
        tids = list(set(tids) - set(self.bad_train_ids) )  # Remove bad ids
        tids.sort()
        return tids
                    
    @property 
    def test_ids(self):
        return range(0, self.test_nb)
    
    def path(self, name, **kwargs):
        """Return path to various source files"""
        path = self.paths[name].format(**kwargs)
        return path        

    @property
    def counts(self) :
        """A map from train_id to list of sea lion class counts"""
        if self._counts is None :
            counts = {}
            fn = self.path('counts')
            with open(fn) as f:
                f.readline()
                for line in f:
                    tid_counts = list(map(int, line.split(',')))
                    counts[tid_counts[0]] = tid_counts[1:]
            self._counts = counts
        return self._counts

    def rmse(self, tid_counts) :
        true_counts = self.counts
        
        error = np.zeros(shape=[5] )
        
        for tid in tid_counts:
            true_counts = self.counts[tid]
            obs_counts = tid_counts[tid]
            diff = np.asarray(true_counts) - np.asarray(obs_counts)
            error += diff*diff
        #print(error)
        error /= len(tid_counts)
        rmse = np.sqrt(error).sum() / 5
        return rmse 
        

    def load_train_image(self, train_id, border=0, mask=False):
        """Return image as numpy array
         
        border -- add a black border of this width around image
        mask -- If true mask out masked areas from corresponding dotted image
        """
        img = self._load_image('train', train_id, border)
        if mask :
            # The masked areas are not uniformly black, presumable due to 
            # jpeg compression artifacts
            dot_img = self._load_image('dotted', train_id, border).astype(np.uint16).sum(axis=-1)
            img = np.copy(img)
            img[dot_img<40] = 0
        return img
   

    def load_dotted_image(self, train_id, border=0):
        return self._load_image('dotted', train_id, border)
 
 
    def load_test_image(self, test_id, border=0):    
        return self._load_image('test', test_id, border)


    def _load_image(self, itype, tid, border=0) :
        fn = self.path(itype, tid=tid)
        img = np.asarray(Image.open(fn))
        if border :
            height, width, channels = img.shape
            bimg = np.zeros( shape=(height+border*2, width+border*2, channels), dtype=np.uint8)
            bimg[border:-border, border:-border, :] = img
            img = bimg
        return img
    

    def coords(self, train_id):
        """Extract coordinates of dotted sealions and return list of SeaLionCoord objects)"""
        
        # Empirical constants
        MIN_DIFFERENCE = 16
        MIN_AREA = 9
        MAX_AREA = 100
        MAX_AVG_DIFF = 50
        MAX_COLOR_DIFF = 32
       
        src_img = np.asarray(self.load_train_image(train_id, mask=True), dtype = np.float)
        dot_img = np.asarray(self.load_dotted_image(train_id), dtype = np.float)

        img_diff = np.abs(src_img-dot_img)
        
        # Detect bad data. If train and dotted images are very different then somethings wrong.
        avg_diff = img_diff.sum() / (img_diff.shape[0] * img_diff.shape[1])
        if avg_diff > MAX_AVG_DIFF: return {}
        
        img_diff = np.max(img_diff, axis=-1)   
           
        img_diff[img_diff<MIN_DIFFERENCE] = 0
        img_diff[img_diff>=MIN_DIFFERENCE] = 255

        sealions = []
        
        for cls, color in enumerate(self.cls_colors):
            # color search backported from @bitsofbits.
            color_array = np.array(color)[None, None, :]
            has_color = np.sqrt(np.sum(np.square(dot_img * (img_diff > 0)[:,:,None] - color_array), axis=-1)) < MAX_COLOR_DIFF 
            contours = skimage.measure.find_contours(has_color.astype(float), 0.5)
            
            if self.verbosity == VERBOSITY.DEBUG :
                print()
                fn = 'diff_{}_{}.png'.format(train_id,cls)
                print('Saving train/dotted difference: {}'.format(fn))
                Image.fromarray((has_color*255).astype(np.uint8)).save(fn)

            for cnt in contours :
                p = Polygon(shell=cnt)
                area = p.area 
                if(area > MIN_AREA and area < MAX_AREA) :
                    y, x= p.centroid.coords[0] # DANGER : skimage and cv2 coordinates transposed?
                    x = int(round(x))
                    y = int(round(y))
                    sealions.append( SeaLionCoord(train_id, cls, x, y) )
                
        if self.verbosity >= VERBOSITY.VERBOSE :
            counts = [0,0,0,0,0]
            for c in sealions :
                counts[c.cls] +=1
            print()
            print('train_id','true_counts','counted_dots', 'difference', sep='\t')   
            true_counts = self.counts[train_id]
            print(train_id, true_counts, counts, np.array(true_counts) - np.array(counts) , sep='\t' )
          
        if self.verbosity == VERBOSITY.DEBUG :
            img = np.copy(sld.load_dotted_image(train_id))
            r = self.dot_radius
            dy,dx,c = img.shape
            for tid, cls, cx, cy in sealions :                    
                for x in range(cx-r, cx+r+1) : img[cy, x, :] = 255
                for y in range(cy-r, cy+r+1) : img[y, cx, :] = 255    
            fn = 'cross_{}.png'.format(train_id)
            print('Saving crossed dots: {}'.format(fn))
            Image.fromarray(img).save(fn)
     
        return self.convert(sealions)
        

    def save_coords(self, train_ids=None): 
        if train_ids is None: train_ids = self.train_ids
        fn = self.path('coords')
        self._progress('Saving sealion coordinates to {}'.format(fn))
        with open(fn, 'w') as csvfile:
            writer =csv.writer(csvfile)
            writer.writerow( SeaLionCoord._fields )
            for tid in train_ids :
                self._progress()
                for coord in self.coords(tid):
                    writer.writerow(coord)
        self._progress('done')
        
    def load_coords(self):
        fn = self.path('coords')
        self._progress('Loading sea lion coordinates from {}'.format(fn))
        with open(fn) as f:
            f.readline()
            return [SeaLionCoord(*[int(n) for n in line.split(',')]) for line in f]

    
            
    def save_sea_lion_chunks(self, coords, chunksize=128):
        self._progress('Saving image chunks...')
        self._progress('\n', verbosity=VERBOSITY.VERBOSE)
        
        last_tid = -1
        
        for tid, cls, x, y in coords :
            if tid != last_tid:
                img = self.load_train_image(tid, border=chunksize//2, mask=True)
                last_tid = tid

            fn = 'chunk_{tid}_{cls}_{x}_{y}_{size}.png'.format(size=chunksize, tid=tid, cls=cls, x=x, y=y)
            self._progress(' Saving '+fn, end='\n', verbosity=VERBOSITY.VERBOSE)
            Image.fromarray( img[y:y+chunksize, x:x+chunksize, :]).save(fn)
            self._progress()
        self._progress('done')
        
            
    def _progress(self, string=None, end=' ', verbosity=VERBOSITY.NORMAL):
        if self.verbosity < verbosity: return
        if not string :
            print('.', end='')
        elif string == 'done':
            print(' done') 
        else:
            print(string, end=end)
        sys.stdout.flush()

    def convert(self, slcoord):
        dots = defaultdict(list)
        for c in slcoord:
            name = self.cls_names[c.cls]
            dots[name].append((c.x, c.y))
        return dots
# end SeaLionData


    red: adult males
    magenta: subadult males
    brown: adult females
    blue: juveniles
    green: pups


In [5]:
HSVColor = namedtuple("HSVColor", ["hue_offset", "hue_min", "hue_max", "sat_min", "sat_max", "val_min", "val_max"])

In [6]:
sealion_types = ["adult_males", 
    "subadult_males",
    "adult_females",
    "juveniles",
    "pups"]

In [7]:
dot_colors = {
    "adult_males": HSVColor(90, 80, 100, 210, 255, 160, 255),     # red
    "subadult_males": HSVColor(0, 145, 155, 230, 255, 220, 255),  # magenta
    "adult_females": HSVColor(0, 7, 23, 130, 255, 50, 105),      # brown
    "juveniles": HSVColor(0, 103, 123, 155, 240, 115, 200),       # blue
    "pups": HSVColor(0, 47, 67, 200, 255, 148, 200)               # green
}

In [8]:
def get_diff_mask(im_bgr, im_dotted_bgr):
    """ Return a boolean mask with the difference between the dotted and non-dotted image.
    """
    diff_dotted_bgr = cv2.absdiff(im_bgr, im_dotted_bgr)
    diff_dotted_gray = np.max(diff_dotted_bgr, axis=2)
    
    # Find the best threshold: check how many pixels turn white for the next threshold.
    variation_by_threshold = np.diff(np.cumsum(np.bincount(diff_dotted_gray.ravel())), 1)
    
    threshold_diff = 50
    for i in range(variation_by_threshold.shape[0] - 1, 0, -1):
        if variation_by_threshold[i] > 110:
            threshold_diff = i
            break

    diff_dotted_mask = diff_dotted_gray > threshold_diff
    return diff_dotted_mask, diff_dotted_gray

In [9]:
def detect_dots(im_bgr, im_dotted_bgr, mask_u8, dot_colors, train_id = 0):    
    # Erode mask to avoid issue on the edges of the mask
    radius = 5
    kernel = cv2.getStructuringElement(cv2.MORPH_ELLIPSE, (radius, radius))
    mask_eroded_u8 = cv2.erode(mask_u8, kernel)
    _, mask_eroded_u8 = cv2.threshold(mask_eroded_u8, 1, 255, cv2.THRESH_BINARY)
    
    # Apply mask on both image
    im_bgr_masked = cv2.bitwise_and(im_bgr, im_bgr, mask=mask_eroded_u8)
    im_dotted_bgr_masked = cv2.bitwise_and(im_dotted_bgr, im_dotted_bgr, mask=mask_eroded_u8)
    
    # Compute the diff between dotted and non-dotted images. 
    diff_dotted_mask, diff_dotted_gray = get_diff_mask(im_bgr_masked, im_dotted_bgr_masked)
    diff_dotted_mask_u8 = (diff_dotted_mask * 255).astype(np.uint8)
    total_diff = np.sum(diff_dotted_mask_u8)
    if total_diff > 2000 * 40 * 40:
        print("[{i}] - Error in diff image".format(i=train_id))
        return {}, diff_dotted_mask_u8, diff_dotted_gray
    
    # Find the dot for each type of sealion.
    n_labels, im_labels, stats, centroids = cv2.connectedComponentsWithStats(diff_dotted_mask_u8, connectivity=8)

    min_size = 3*3
    max_size = 15*15
    ratio_min = 0.5
    ratio_max = 2.0
    dots = defaultdict(list)
    for blob_id, stat in enumerate(stats):
        ratio = stat[2]/(1.0 * stat[3] + 1)
        if min_size < stat[4] < max_size and ratio_min < ratio < ratio_max:
            half_size = 1
            center = int(round(centroids[blob_id][0])), int(round(centroids[blob_id][1]))
            if center[0] < half_size or center[1] < half_size or center[0] > im_dotted_bgr.shape[1] - half_size or center[1] > im_dotted_bgr.shape[0] - half_size:
                print("Error coord")
                continue
            dot_bgr = im_dotted_bgr[center[1] - half_size:center[1] + half_size + 1,
                                    center[0] - half_size:center[0] + half_size + 1, :]
            dot_hsv = cv2.cvtColor(dot_bgr, cv2.COLOR_BGR2HSV)
            hue = dot_hsv[:,:,0]
            sat = dot_hsv[:,:,1]
            val = dot_hsv[:,:,2]
            sat_median = np.median(sat)
            val_median = np.median(val)
            n_nn = 0
            for sealion, color in dot_colors.items():
                hue_off = (hue.astype(np.uint16) + color.hue_offset) % 180
                hue_median = np.median(hue_off)
                if color.hue_min <= hue_median <= color.hue_max and \
                    color.val_min <= val_median <= color.val_max and \
                    color.sat_min <= sat_median <= color.sat_max:
                        dots[sealion].append(centroids[blob_id])
                else:
                    n_nn += 1
            if n_nn == len(dot_colors):
                continue
                #print("Not found: ", train_id, center, hue, np.median(hue), sat_median, val_median)
                    

    return dots, diff_dotted_mask_u8, diff_dotted_gray

In [10]:
def draw_detected(im, dots):
    im_draw = im.copy()
    n = 5
    colors = plt.cm.rainbow(np.linspace(0, 1, n))
    colors_rgb = [(int(c[0]), int(c[1]), int(c[2])) for c in colors * 255]
    for i, (sealion, centroids) in enumerate(dots.items()):
        for c in centroids:
            center = (int(round(c[0])), int(round(c[1])))
            cv2.circle(im_draw, center, 9, colors_rgb[i], 1)
            cv2.circle(im_draw, center, 1, colors_rgb[i], -1)
    return im_draw

In [11]:
def print_numbers(train_id, dots):
    ns = []
    for sealion in sealion_types:
        n = 0 if sealion not in dots else len(dots[sealion])
        ns.append(n)
    print("[{i}] - {ns}".format(i=train_id, ns=ns))

In [12]:
def get_train_id(filename):
    directory, basename = os.path.split(filename)
    train_id, ext = os.path.splitext(basename)
    return int(train_id)

In [13]:
def merge(dotsA, dotsB, errorsA, errorsB):
    mergedDots = dict()
    mergedErrors = []
    for i, sealion in enumerate(sealion_types): 
        if np.abs(errorsA[i]) <= np.abs(errorsB[i]):
            if sealion in dotsA:
                mergedDots[sealion] = dotsA[sealion]
                mergedErrors.append(errorsA[i])
        else:
            mergedDots[sealion] = dotsB[sealion]
            mergedErrors.append(errorsB[i])
    return mergedDots, mergedErrors

In [14]:
def getErrors(dots, grountruth):
    errors = []
    for sealion in sealion_types:
        gt = grountruth[sealion].get_values()[0]
        detected = 0 if sealion not in dots else len(dots[sealion])
        errors.append(detected - gt)
    return errors

In [15]:
def compare(dotsA, dotsB):
    differences = []
    for k in sealion_types:
        if k in dotsA:
            if k not in dotsB:
                differences.append(len(dotsA[k]))
            else:
                differences.append(compareCoords(dotsA[k], dotsB[k]))
        elif k in dotsB:
            differences.append(len(dotsB[k]))
        else:
            differences.append(0)
    return differences

In [16]:
def compareCoords(coordsA, coordsB):
    threshold = 3*3+3*3
    distances = np.zeros((len(coordsA), len(coordsB)))
    for a, cA in enumerate(coordsA):
        for b, cB in enumerate(coordsB):
            dx = cA[0] - cB[0]
            dy = cA[1] - cB[1]
            distances[a,b] = dx*dx + dy*dy
    
    mi = min(len(coordsA), len(coordsB))
    ma = max(len(coordsA), len(coordsB))
    
    mapping = dict()
    for i in range(mi):
        best = np.argmin(distances.ravel())
        best_row = best // len(coordsB)
        best_col = best % len(coordsB)
        if distances[best_row, best_col] < threshold:
            mapping[best_col] = best_row
            distances[best_row, :] = 99999
            distances[:, best_col] = 99999
        else:
            break
    
    matched = len(mapping)
    unmatched = ma - matched
    return unmatched

In [17]:
a = [(0, 0), (10, 10), (20, 20), (30, 30)]
b = [(11, 11), (1, 1), (32, 28), (20, 19)]
c = [(15, 5), (1, 1), (32, 24), (20, 19), (550, 54)]

In [18]:
compareCoords(a, a)

0

In [19]:
compareCoords(a, b)

0

In [20]:
compareCoords(a, c)

3

In [21]:
def process_all(train_ids=None):
    train_dir = "/home/lowik/sealion/data/sealion/Train/"
    dotted_dir = "/home/lowik/sealion/data/sealion/TrainDotted/"
    mask_dir = "/home/lowik/sealion/data/sealion/TrainMask/"
    debug_dir = "/home/lowik/sealion/data/sealion/TrainDebug/"
    dots_dir = "/home/lowik/sealion/data/sealion/TrainDots/"
    os.makedirs(debug_dir, exist_ok=True)
    os.makedirs(dots_dir, exist_ok=True)
    df = pd.read_csv(os.path.join(train_dir, "train.csv"))
    train_id_with_errors = []
    train_id_no_diff = []
    train_id_with_diff = []
    train_id_ok = []
    files_to_process = [filename for filename in os.listdir(train_dir) if filename.endswith("jpg")]
    sld = SeaLionData()
    for filename in files_to_process:
        train_id = get_train_id(filename)
        if train_ids is None or train_id in train_ids:
            im_bgr = cv2.imread(os.path.join(train_dir, filename))
            im_dotted_bgr = cv2.imread(os.path.join(dotted_dir, filename))
            im_mask = cv2.imread(os.path.join(mask_dir, filename), cv2.IMREAD_GRAYSCALE)
            
            dotsA, diff_dotted_mask_u8, diff_dotted_gray = detect_dots(im_bgr, im_dotted_bgr, im_mask, dot_colors, train_id)
            dotsB = sld.coords(train_id)
            
            with open(os.path.join(dots_dir, str(train_id) + "A.pkl"), "wb") as ofile:
                pickle.dump(dotsA, ofile, pickle.HIGHEST_PROTOCOL)
                
            with open(os.path.join(dots_dir, str(train_id) + "B.pkl"), "wb") as ofile:
                pickle.dump(dotsB, ofile, pickle.HIGHEST_PROTOCOL)
            
            grountruth = df[df.train_id==int(train_id)]
            errorsA = getErrors(dotsA, grountruth)
            errorsB = getErrors(dotsB, grountruth)
            
            difference = compare(dotsA, dotsB)
            if np.sum(np.abs(difference)) == 0:
                print("[{i}] - No Difference A vs B".format(i=train_id))
                train_id_no_diff.append(train_id)
            else:
                print("[{i}] - Difference A vs B: {d}".format(i=train_id, d=difference))
                train_id_with_diff.append(train_id)
            
            mergeDots, mergeErrors = merge(dotsA, dotsB, errorsA, errorsB)
            sumErrors = np.sum(np.abs(mergeErrors))
            
            im_draw = draw_detected(im_dotted_bgr, mergeDots)
            cv2.imwrite(os.path.join(debug_dir, str(train_id) + ".jpg"), im_draw)
            if sumErrors > 0:
                train_id_with_errors.append(train_id)
                print("[{i}] - Total errors: {total} - {err}".format(i=train_id, total=sumErrors, err=mergeErrors))
            else:
                train_id_ok.append(train_id)
                print("[{i}] - ok".format(i=train_id))

            # Save dots
            with open(os.path.join(dots_dir, str(train_id) + ".pkl"), "wb") as ofile:
                pickle.dump(mergeDots, ofile, pickle.HIGHEST_PROTOCOL)
    return train_id_with_errors, train_id_ok, train_id_no_diff, train_id_with_diff

In [None]:
train_id_with_errors, train_id_ok, train_id_no_diff, train_id_with_diff = process_all()

In [22]:
def reprocess_all(train_ids=None):
    train_dir = "/home/lowik/sealion/data/sealion/Train/"
    dotted_dir = "/home/lowik/sealion/data/sealion/TrainDotted/"
    mask_dir = "/home/lowik/sealion/data/sealion/TrainMask/"
    debug_dir = "/home/lowik/sealion/data/sealion/TrainDebug/"
    dots_dir = "/home/lowik/sealion/data/sealion/TrainDots/"
    os.makedirs(debug_dir, exist_ok=True)
    os.makedirs(dots_dir, exist_ok=True)
    df = pd.read_csv(os.path.join(train_dir, "train.csv"))
    train_id_with_errors = []
    train_id_no_diff = []
    train_id_with_diff = []
    train_id_ok = []
    files_to_process = [filename for filename in os.listdir(train_dir) if filename.endswith("jpg")]
    sld = SeaLionData()
    for filename in files_to_process:
        train_id = get_train_id(filename)
        if train_ids is None or train_id in train_ids:
            #im_bgr = cv2.imread(os.path.join(train_dir, filename))
            #im_dotted_bgr = cv2.imread(os.path.join(dotted_dir, filename))
            #im_mask = cv2.imread(os.path.join(mask_dir, filename), cv2.IMREAD_GRAYSCALE)
            
            with open(os.path.join(dots_dir, str(train_id) + "A.pkl"), "rb") as ofile:
                dotsA = pickle.load(ofile)
                
            with open(os.path.join(dots_dir, str(train_id) + "B.pkl"), "rb") as ofile:
                dotsB = pickle.load(ofile)
            
            grountruth = df[df.train_id==int(train_id)]
            errorsA = getErrors(dotsA, grountruth)
            errorsB = getErrors(dotsB, grountruth)
            
            difference = compare(dotsA, dotsB)
            if np.sum(np.abs(difference)) == 0:
                print("[{i}] - No Difference A vs B".format(i=train_id))
                train_id_no_diff.append(train_id)
            else:
                print("[{i}] - Difference A vs B: {d}".format(i=train_id, d=difference))
                train_id_with_diff.append(train_id)
            
            mergeDots, mergeErrors = merge(dotsA, dotsB, errorsA, errorsB)
            sumErrors = np.sum(np.abs(mergeErrors))
            
            #im_draw = draw_detected(im_dotted_bgr, mergeDots)
            #cv2.imwrite(os.path.join(debug_dir, str(train_id) + ".jpg"), im_draw)
            if sumErrors > 0:
                train_id_with_errors.append(train_id)
                print("[{i}] - Total errors: {total} - {err}".format(i=train_id, total=sumErrors, err=mergeErrors))
            else:
                train_id_ok.append(train_id)
                print("[{i}] - ok".format(i=train_id))

            # Save dots
            #with open(os.path.join(dots_dir, str(train_id) + ".pkl"), "wb") as ofile:
            #    pickle.dump(mergeDots, ofile, pickle.HIGHEST_PROTOCOL)
    return train_id_with_errors, train_id_ok, train_id_no_diff, train_id_with_diff

In [23]:
train_id_with_errors, train_id_ok, train_id_no_diff, train_id_with_diff = reprocess_all()

[866] - No Difference A vs B
[866] - ok
[867] - No Difference A vs B
[867] - ok
[868] - No Difference A vs B
[868] - ok
[869] - Difference A vs B: [2, 3, 0, 0, 0]
[869] - ok
[87] - No Difference A vs B
[87] - Total errors: 1 - [0, 0, -1, 0]
[870] - No Difference A vs B
[870] - ok
[871] - Difference A vs B: [1, 1, 123, 16, 172]
[871] - Total errors: 7 - [0, 0, -3, 0, -4]
[872] - No Difference A vs B
[872] - ok
[873] - No Difference A vs B
[873] - Total errors: 1 - [0, 0, -1]
[874] - Difference A vs B: [0, 0, 0, 0, 1]
[874] - ok
[875] - No Difference A vs B
[875] - ok
[876] - No Difference A vs B
[876] - ok
[877] - No Difference A vs B
[877] - ok
[878] - No Difference A vs B
[878] - ok
[879] - Difference A vs B: [0, 0, 0, 2, 0]
[879] - ok
[88] - No Difference A vs B
[88] - ok
[880] - No Difference A vs B
[880] - ok
[881] - Difference A vs B: [0, 0, 97, 30, 114]
[881] - Total errors: 1 - [0, 0, 0, -1, 0]
[882] - Difference A vs B: [2, 4, 5, 16, 0]
[882] - Total errors: 1 - [0, 0, -1, 0]
[

[813] - No Difference A vs B
[813] - Total errors: 1 - [0, 0, 0, -1]
[814] - No Difference A vs B
[814] - Total errors: 1 - [0, 0, -1]
[815] - No Difference A vs B
[815] - ok
[816] - Difference A vs B: [1, 0, 1, 23, 0]
[816] - ok
[817] - No Difference A vs B
[817] - ok
[818] - No Difference A vs B
[818] - ok
[819] - No Difference A vs B
[819] - ok
[82] - No Difference A vs B
[82] - ok
[820] - No Difference A vs B
[820] - ok
[821] - No Difference A vs B
[821] - ok
[822] - Difference A vs B: [8, 2, 171, 9, 210]
[822] - Total errors: 1 - [0, 0, 0, -1, 0]
[823] - No Difference A vs B
[823] - Total errors: 3 - [0, -3, 0, 0]
[824] - No Difference A vs B
[824] - ok
[825] - No Difference A vs B
[825] - ok
[826] - No Difference A vs B
[826] - ok
[827] - No Difference A vs B
[827] - Total errors: 4 - [-4, 0, 0]
[828] - Difference A vs B: [0, 0, 2, 0, 0]
[828] - ok
[829] - Difference A vs B: [0, 0, 1, 0, 0]
[829] - ok
[83] - No Difference A vs B
[83] - Total errors: 4 - [0, 0, 1, -3]
[830] - Diff

[745] - Difference A vs B: [2, 0, 210, 16, 180]
[745] - Total errors: 1 - [0, 0, -1, 0, 0]
[746] - No Difference A vs B
[746] - ok
[747] - Difference A vs B: [8, 0, 156, 12, 223]
[747] - ok
[748] - No Difference A vs B
[748] - Total errors: 1 - [-1]
[749] - No Difference A vs B
[749] - ok
[75] - No Difference A vs B
[75] - ok
[750] - No Difference A vs B
[750] - Total errors: 4 - [-2, 0, 0, -2]
[751] - No Difference A vs B
[751] - Total errors: 3 - [-3]
[752] - No Difference A vs B
[752] - ok
[753] - No Difference A vs B
[753] - ok
[754] - No Difference A vs B
[754] - Total errors: 2 - [0, 0, -2]
[755] - No Difference A vs B
[755] - ok
[536] - No Difference A vs B
[536] - ok
[537] - Difference A vs B: [1, 0, 0, 0, 0]
[537] - ok
[538] - Difference A vs B: [3, 0, 28, 2, 98]
[538] - Total errors: 2 - [-1, -1, 0, 0, 0]
[539] - No Difference A vs B
[539] - Total errors: 1 - [-1, 0]
[54] - No Difference A vs B
[54] - ok
[540] - No Difference A vs B
[540] - ok
[541] - No Difference A vs B
[54

[453] - No Difference A vs B
[453] - ok
[454] - No Difference A vs B
[454] - ok
[455] - No Difference A vs B
[455] - ok
[456] - No Difference A vs B
[456] - ok
[457] - No Difference A vs B
[457] - ok
[458] - No Difference A vs B
[458] - ok
[459] - No Difference A vs B
[459] - ok
[46] - No Difference A vs B
[46] - ok
[460] - No Difference A vs B
[460] - Total errors: 1 - [0, 0, 0, -1]
[461] - Difference A vs B: [0, 0, 0, 2, 0]
[461] - ok
[462] - No Difference A vs B
[462] - Total errors: 2 - [-1, 0, 0, 0, -1]
[463] - No Difference A vs B
[463] - ok
[464] - No Difference A vs B
[464] - ok
[465] - Difference A vs B: [0, 0, 4, 0, 0]
[465] - Total errors: 1 - [0, 0, 0, -1]
[466] - No Difference A vs B
[466] - ok
[467] - No Difference A vs B
[467] - ok
[468] - No Difference A vs B
[468] - ok
[469] - No Difference A vs B
[469] - ok
[47] - No Difference A vs B
[47] - Total errors: 2 - [0, -2, 0, 0, 0]
[470] - No Difference A vs B
[470] - ok
[471] - No Difference A vs B
[471] - ok
[472] - No Di

[388] - Difference A vs B: [0, 0, 0, 2, 0]
[388] - Total errors: 1 - [-1, 0, 0]
[389] - No Difference A vs B
[389] - ok
[39] - No Difference A vs B
[39] - ok
[390] - No Difference A vs B
[390] - ok
[391] - No Difference A vs B
[391] - ok
[392] - No Difference A vs B
[392] - ok
[393] - No Difference A vs B
[393] - ok
[394] - No Difference A vs B
[394] - Total errors: 4 - [0, -4]
[395] - No Difference A vs B
[395] - Total errors: 3 - [0, -2, -1]
[396] - No Difference A vs B
[396] - ok
[397] - No Difference A vs B
[397] - ok
[398] - Difference A vs B: [1, 0, 0, 1, 0]
[398] - Total errors: 3 - [-3, 0]
[399] - No Difference A vs B
[399] - ok
[4] - No Difference A vs B
[4] - ok
[40] - No Difference A vs B
[40] - Total errors: 2 - [0, 0, -1, -1]
[400] - No Difference A vs B
[400] - ok
[401] - Difference A vs B: [0, 0, 0, 2, 0]
[401] - ok
[402] - Difference A vs B: [1, 3, 245, 40, 0]
[402] - ok
[403] - Difference A vs B: [0, 0, 18, 50, 0]
[403] - ok
[404] - No Difference A vs B
[404] - ok
[405

[101] - No Difference A vs B
[101] - ok
[102] - No Difference A vs B
[102] - ok
[103] - No Difference A vs B
[103] - ok
[104] - No Difference A vs B
[104] - ok
[105] - No Difference A vs B
[105] - Total errors: 5 - [0, -1, -4, 0]
[106] - No Difference A vs B
[106] - ok
[107] - No Difference A vs B
[107] - ok
[108] - No Difference A vs B
[108] - Total errors: 1 - [-1, 0, 0]
[109] - No Difference A vs B
[109] - ok
[11] - No Difference A vs B
[11] - Total errors: 2 - [0, -2, 0, 0]
[110] - No Difference A vs B
[110] - Total errors: 2 - [0, 0, 0, 0, -2]
[111] - Difference A vs B: [0, 0, 0, 0, 2]
[111] - ok
[112] - Difference A vs B: [2, 0, 336, 127, 0]
[112] - ok
[113] - No Difference A vs B
[113] - ok
[114] - No Difference A vs B
[114] - ok
[115] - No Difference A vs B
[115] - ok
[116] - No Difference A vs B
[116] - ok
[117] - No Difference A vs B
[117] - ok
[118] - No Difference A vs B
[118] - ok
[119] - Difference A vs B: [1, 3, 9, 2, 11]
[119] - ok
[12] - No Difference A vs B
[12] - ok


In [33]:
ok_or_no_diff = list(set(train_id_ok).union(set(train_id_no_diff)))

In [32]:
import json

In [34]:
with open("../data/sealion/dots_ok.json", "w") as ofile:
    ok_filenames = [str(i) + ".jpg" for i in ok_or_no_diff]
    json.dump(ok_filenames, ofile, indent=2, sort_keys=True)

In [35]:
!head ../data/sealion/dots_ok.json

[
  "0.jpg",
  "1.jpg",
  "2.jpg",
  "4.jpg",
  "5.jpg",
  "6.jpg",
  "7.jpg",
  "8.jpg",
  "10.jpg",
