# Figuring out what data we want.
The *turtle* parent category has WnID *n01662784*, see http://www.image-net.org/synset?wnid=n01662784

In [1]:
import pandas as pd

In [2]:
# Load the WordNet hierarchy: the file is read as space-separated columns
# without a header row; the two columns are then named parent / child.
parent_child_raw = pd.read_csv(
    './data/WnID_parent_child_relations.txt',
    sep=' ',
    header=None,
)
WnID_parent_child = parent_child_raw.rename(columns={0: 'parent', 1: 'child'})

In [3]:
# Walk down the hierarchy one generation at a time, starting at the turtle
# synset. Each level is renamed so the frames can later be joined on the
# generation they share.
is_turtle_child = WnID_parent_child['parent'] == 'n01662784'
turtle_children = WnID_parent_child.loc[is_turtle_child]

is_turtle_grandchild = WnID_parent_child['parent'].isin(turtle_children['child'])
turtle_grandchildren = WnID_parent_child.loc[is_turtle_grandchild].rename(
    columns={'parent': 'child', 'child': 'grandchild'})

is_turtle_grandgrandchild = WnID_parent_child['parent'].isin(
    turtle_grandchildren['grandchild'])
turtle_grandgrandchildren = WnID_parent_child.loc[is_turtle_grandgrandchild].rename(
    columns={'parent': 'grandchild', 'child': 'grandgrandchild'})

In [4]:
# One more generation down: rows whose parent is a grand-grand-child.
print('There are no grand-grand-grand-children:')
has_grandgrandchild_parent = WnID_parent_child['parent'].isin(
    turtle_grandgrandchildren['grandgrandchild'])
turtle_grandgrandgrandchildren = WnID_parent_child.loc[has_grandgrandchild_parent].rename(
    columns={'parent': 'grandgrandchild', 'child': 'grandgrandgrandchild'})
display(turtle_grandgrandgrandchildren)

There are no grand-grand-grand-children:


Unnamed: 0,grandgrandchild,grandgrandgrandchild


In [5]:
# Join the three generations into one row per lineage. Left joins keep
# branches that stop before the deepest level.
turtle_lineage = (
    turtle_children
    .merge(turtle_grandchildren, on='child', how='left')
    .merge(turtle_grandgrandchildren, on='grandchild', how='left')
)
lineage_levels = ['parent', 'child', 'grandchild', 'grandgrandchild']
display(turtle_lineage.set_index(lineage_levels))

parent,child,grandchild,grandgrandchild
n01662784,n01663401,n01664065,
n01662784,n01663401,n01663782,
n01662784,n01663401,n01664369,n01664674
n01662784,n01663401,n01664369,n01664492
n01662784,n01663401,n01664990,
n01662784,n01663401,n01665541,
n01662784,n01665932,n01666228,
n01662784,n01665932,n01666585,
n01662784,n01667114,n01667432,
n01662784,n01667778,n01668091,


For parents with a single child, we decided to go with the parent class. An example is the WnID *n01667778*: it refers to *Terrapin* (http://www.image-net.org/synset?wnid=n01667778), which has only the *Diamondback Terrapin* as a child. This choice is also useful for the bounding box data: in such cases, if boxes are available, they are available for the parent but not for the child.

In [6]:
# Pick one WnID per lineage row. Start from the deepest level and fall back
# to the parent level wherever that parent appears only once in its column.
# Work on an explicit copy: assigning through .loc on a column taken directly
# from turtle_lineage can write back into turtle_lineage (or raise a
# chained-assignment warning), depending on the pandas version.
single_grandchild = ~turtle_lineage['grandchild'].duplicated(keep=False)
single_child = ~turtle_lineage['child'].duplicated(keep=False)

kept_WnIDs = turtle_lineage['grandgrandchild'].copy()
kept_WnIDs.loc[single_grandchild] = turtle_lineage.loc[single_grandchild, 'grandchild']
kept_WnIDs.loc[single_child] = turtle_lineage.loc[single_child, 'child']

Now we can take a look at the associated species.

In [7]:
# Map every WnID to its class label (tab-separated file without a header).
class_names_raw = pd.read_csv('./data/WnID_to_classname.txt', sep='\t', header=None)
WnID_imagenet_class_relation = class_names_raw.rename(
    columns={0: 'WnID', 1: 'Label'}).set_index('WnID')

# Keep only the selected WnIDs and use the first comma-separated label,
# title-cased, as the species name.
kept_rows = WnID_imagenet_class_relation.loc[kept_WnIDs.values]
first_label = kept_rows['Label'].str.split(',', expand=True)[0]
WnID_turtle_species_relation = first_label.str.title()

turtle_information = (
    WnID_turtle_species_relation
    .rename('Species')
    .reset_index()
    .set_index('Species')
)

In [8]:
# Report how many species were selected and show the Species -> WnID table.
n_species = len(turtle_information)
print('We have data for {} species.'.format(n_species))
display(turtle_information)

We have data for 22 species.


Unnamed: 0_level_0,WnID
Species,Unnamed: 1_level_1
Loggerhead,n01664065
Green Turtle,n01663782
Pacific Ridley,n01664674
Atlantic Ridley,n01664492
Hawksbill Turtle,n01664990
Leatherback Turtle,n01665541
Common Snapping Turtle,n01666228
Alligator Snapping Turtle,n01666585
Mud Turtle,n01667114
Terrapin,n01667778


We downloaded the tar files for these WnIDs. The files are provided in ``./data/archives/images``.

## Now we look at bounding boxes

In [9]:
# Collect, in file order, every listed WnID that is one of the kept WnIDs.
bounding_box_WnIDs = []
with open('./data/WnID_bounding_boxes.txt', 'r') as f:
    for raw_line in f:
        candidate = raw_line.strip()
        if candidate in kept_WnIDs.values:
            bounding_box_WnIDs.append(candidate)

In [10]:
# Flag the species whose WnID has bounding box annotations.
turtle_information['bboxes'] = turtle_information['WnID'].isin(bounding_box_WnIDs)

In [11]:
# Summarise which species come with bounding boxes.
n_species_with_boxes = len(bounding_box_WnIDs)
species_with_boxes = turtle_information.loc[turtle_information['bboxes']]
print('We have bounding boxes for {} species'.format(n_species_with_boxes))
print('Specifically, the following species.')
display(species_with_boxes)

We have bounding boxes for 5 species
Specifically, the following species.


Unnamed: 0_level_0,WnID,bboxes
Species,Unnamed: 1_level_1,Unnamed: 2_level_1
Loggerhead,n01664065,True
Leatherback Turtle,n01665541,True
Mud Turtle,n01667114,True
Terrapin,n01667778,True
Box Turtle,n01669191,True


We downloaded the tar.gz files for these bounding boxes. The files are provided in ``./data/archives/bounding_boxes``.

# Train-test-validation split and file extraction

In [12]:
import tarfile
import os

Before extracting the files, we put together a table of what we got.

In [13]:
# List the member names of every image archive without extracting anything.
image_level_information = []
for img_archive in os.listdir('./data/archives/images'):
    with tarfile.open('./data/archives/images/' + img_archive) as tar:
        image_level_information.extend(member.name for member in tar)

In [14]:
# List the XML annotation members of every bounding box archive; any other
# member is ignored.
bbox_information = []
for img_archive in os.listdir('./data/archives/bounding_boxes'):
    with tarfile.open('./data/archives/bounding_boxes/' + img_archive) as tar:
        bbox_information += [
            member.name for member in tar if member.name.endswith('.xml')]

In [15]:
# Report the size of both archive listings.
n_images_total = len(image_level_information)
n_bbox_files_total = len(bbox_information)
print('We have a total of {} images and bounding box data for {} images.'.format(
    n_images_total, n_bbox_files_total))

We have a total of 21643 images and bounding box data for 2437 images.


In [16]:
# One row per image, indexed by the image name without its extension.
image_level_statistics = pd.DataFrame(image_level_information, columns=['img_name'])
# Remove the '.JPEG' suffix explicitly. str.rstrip('.JPEG') strips any trailing
# run of the characters '.', 'J', 'P', 'E', 'G' rather than the suffix, which
# only gives the right result as long as the names end in a digit.
image_level_statistics['img_name'] = image_level_statistics['img_name'].map(
    lambda name: name[:-len('.JPEG')] if name.endswith('.JPEG') else name)
# The WnID is the part of the image name before the first underscore.
image_level_statistics['WnID'] = image_level_statistics['img_name'].str.split('_', expand=True)[0]
image_level_statistics = image_level_statistics.set_index('img_name')

# Image names that have an annotation: last path component of each XML member
# without its '.xml' suffix. bbox_information only holds names ending in
# '.xml', so slicing the suffix off is exact (again avoiding str.rstrip).
img_with_bbox = (
    pd.Series(bbox_information)
    .str.rsplit('/', n=1).str[-1]
    .str[:-len('.xml')]
    .str.strip()
    .values
)
image_level_statistics['bboxes'] = image_level_statistics.index.isin(img_with_bbox)

In [17]:
# Number of images for which an annotation file could be matched.
n_matched_bbox_images = image_level_statistics['bboxes'].sum()
print('We do not have an image for every bounding box. We could make the connection for {} images.'.format(
    n_matched_bbox_images))

We do not have an image for every bounding box. We could make the connection for 2151 images.


In [18]:
# Per-WnID aggregates of the image table: the number of images (count) and
# the number of images with bounding boxes (sum of the boolean flag).
per_wnid = image_level_statistics.groupby('WnID')
n_images_per_wnid = per_wnid.count().rename(columns={'bboxes': 'n_images'}).reset_index()
n_bbox_imgs_per_wnid = per_wnid.sum().rename(columns={'bboxes': 'n_bbox_imgs'}).reset_index()

# Merge both counts into the species table and index it by species again.
turtle_information = (
    turtle_information.reset_index()
    .merge(n_images_per_wnid, on='WnID')
    .merge(n_bbox_imgs_per_wnid, on='WnID')
    .set_index('Species')
)

In [19]:
# Show the image counts per species, then the annotated-image counts for the
# species that have bounding boxes.
images_per_species = turtle_information['n_images']
has_boxes = turtle_information['bboxes']
bbox_images_per_species = turtle_information.loc[has_boxes, 'n_bbox_imgs'].astype(int)

print('Number of images per species')
display(images_per_species)
print('------\n')
print('Number of images with bounding boxes')
display(bbox_images_per_species)

Number of images per species


Species
Loggerhead                   1208
Green Turtle                 1199
Pacific Ridley               1121
Atlantic Ridley              1001
Hawksbill Turtle             1195
Leatherback Turtle            969
Common Snapping Turtle       1172
Alligator Snapping Turtle     773
Mud Turtle                    940
Terrapin                     1056
Red-Bellied Terrapin          615
Slider                       1288
Cooter                       1213
Box Turtle                   1328
Painted Turtle               1192
European Tortoise            1079
Giant Tortoise               1183
Gopher Tortoise              1264
Desert Tortoise              1163
Texas Tortoise                356
Spiny Softshell               160
Smooth Softshell              168
Name: n_images, dtype: int64

------

Number of images with bounding boxes


Species
Loggerhead            436
Leatherback Turtle    415
Mud Turtle            408
Terrapin              414
Box Turtle            478
Name: n_bbox_imgs, dtype: int64

## Now we decide for each image whether it is supposed to go into train, validation, or test set.
Since the final evaluation metric is top-1 and top-3 error, we do not need bounding boxes in the test set. This is helpful because bounding boxes are available neither for all species nor for all images within a species.

We are aiming for a stratified 60-20-20 split across all images. Meanwhile, the bounding box images should have an 80-20-0 split.

In [20]:
# Every image starts out in the training set; the following cells reassign
# some of them to 'val' and 'test'.
image_level_statistics['sample'] = 'train'

First we do the split stratified by bounding boxes. Within each species, we only have to assign 20% of the annotated images to the validation set here.

In [21]:
# Move 20% of the annotated images of each annotated species to validation.
# The same fixed seed is used for every species so the draw is reproducible.
seed = 91420171 # date and first seed.
has_bbox = image_level_statistics['bboxes']
for wnid in bounding_box_WnIDs:
    annotated_of_species = image_level_statistics.loc[
        has_bbox & (image_level_statistics['WnID'] == wnid)]
    val_names = annotated_of_species.sample(frac=0.2, random_state=seed).index
    image_level_statistics.loc[val_names, 'sample'] = 'val'

Next, we have to take 20% of the images per species and put them in the test sample, making sure that we only draw from those that are not annotated.

In [22]:
# Images without bounding boxes; this mask does not change inside the loop.
filt_non_bb = ~image_level_statistics['bboxes']

for wnid in image_level_statistics['WnID'].unique():
    filt_wnid = (image_level_statistics['WnID'] == wnid)
    n_total = filt_wnid.sum()
    n_non_bb = (filt_wnid & filt_non_bb).sum()
    if n_non_bb == 0:
        # Every image of this species is annotated: there is nothing to draw
        # the test set (or extra validation images) from, and the fraction
        # below would divide by zero.
        continue

    # Test set: 20% of ALL images of the species, drawn only from the
    # non-annotated ones. Expressed as a fraction of the non-annotated pool
    # this is 0.2 * n_total / n_non_bb. It is capped at 1 because sampling
    # without replacement cannot return more rows than the pool holds.
    seed = 91420172
    frac = min(1.0, 0.2 * (n_total / n_non_bb))
    draw = image_level_statistics.loc[filt_non_bb & filt_wnid].sample(
        frac=frac, random_state=seed).index
    image_level_statistics.loc[draw, 'sample'] = 'test'

    # Validation set: top up to 20% of all images of the species using
    # non-annotated training images, so the annotated images keep the split
    # they already have. The masks are recomputed here because the test draw
    # above just changed the 'sample' column.
    seed = 91420173
    filt_training = (image_level_statistics['sample'] == 'train')
    filt_validation = (image_level_statistics['sample'] == 'val')
    filt_candidates = filt_non_bb & filt_wnid & filt_training
    n_add_to_validation = int(0.2 * n_total - (filt_wnid & filt_validation).sum())
    # Clamp to [0, number of candidates]: a negative or too large n would make
    # DataFrame.sample raise.
    n_add_to_validation = int(min(max(n_add_to_validation, 0), filt_candidates.sum()))
    draw = image_level_statistics.loc[filt_candidates].sample(
        n=n_add_to_validation, random_state=seed).index
    image_level_statistics.loc[draw, 'sample'] = 'val'

Now we have to check that this all worked.

In [23]:
print('The 80-20 training validation split for the bounding box data worked well')
# Share of annotated images per WnID that ended up in training / validation.
bbox_rows = image_level_statistics.loc[image_level_statistics['bboxes']]
bbox_images_per_wnid = bbox_rows.groupby('WnID')['sample'].count()
bbox_split_counts = pd.concat(
    [bbox_rows.loc[bbox_rows['sample'] == split_name]
         .groupby('WnID')['sample'].count().rename(column_label)
     for split_name, column_label in (('train', 'training'), ('val', 'validation'))],
    axis=1)
bbox_split_counts.divide(bbox_images_per_wnid, axis='index')

The 80-20 training validation split for the bounding box data worked well


Unnamed: 0_level_0,training,validation
WnID,Unnamed: 1_level_1,Unnamed: 2_level_1
n01664065,0.800459,0.199541
n01665541,0.8,0.2
n01667114,0.79902,0.20098
n01667778,0.799517,0.200483
n01669191,0.799163,0.200837


In [24]:
print('...and so did the overall 60-20-20 split')
# Share of all images per WnID that ended up in each of the three samples.
images_per_wnid = image_level_statistics.groupby('WnID')['sample'].count()
overall_split_counts = pd.concat(
    [image_level_statistics.loc[image_level_statistics['sample'] == split_name]
         .groupby('WnID')['sample'].count().rename(column_label)
     for split_name, column_label in (
         ('train', 'training'), ('val', 'validation'), ('test', 'test'))],
    axis=1)
overall_split_counts.divide(images_per_wnid, axis='index')

...and so did the overall 60-20-20 split


Unnamed: 0_level_0,training,validation,test
WnID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
n01663782,0.6005,0.199333,0.200167
n01664065,0.600166,0.199503,0.200331
n01664492,0.6004,0.1998,0.1998
n01664674,0.600357,0.199822,0.199822
n01664990,0.6,0.2,0.2
n01665541,0.600619,0.199174,0.200206
n01666228,0.600683,0.199659,0.199659
n01666585,0.600259,0.199224,0.200517
n01667114,0.6,0.2,0.2
n01667778,0.600379,0.199811,0.199811


With this, we can move on to extract the files into their appropriate destinations.

In [25]:
# Create the split folders. makedirs(..., exist_ok=True) keeps the cell
# re-runnable; os.mkdir raises as soon as a folder already exists.
for sample_name in ('train', 'val', 'test'):
    os.makedirs(os.path.join('./data/images', sample_name), exist_ok=True)

def get_sample(member):
    """Return the sample ('train', 'val' or 'test') assigned to a tar member.

    The member name without its '.JPEG' suffix is looked up in the index of
    image_level_statistics. The suffix is removed explicitly because
    str.rstrip('.JPEG') strips a set of characters, not a suffix.
    """
    img_name = member.name
    if img_name.endswith('.JPEG'):
        img_name = img_name[:-len('.JPEG')]
    return image_level_statistics.loc[img_name, 'sample']

for tar_file in os.listdir('./data/archives/images'):
    # The archive is named after its WnID: drop the '.tar' suffix (not rstrip,
    # which would strip any trailing '.', 't', 'a' or 'r' characters).
    wnid = tar_file[:-len('.tar')] if tar_file.endswith('.tar') else tar_file
    for sample_name in ('train', 'val', 'test'):
        os.makedirs(os.path.join('./data/images', sample_name, wnid), exist_ok=True)
    with tarfile.open(os.path.join('./data/archives/images', tar_file)) as tar:
        for member in tar:
            # Each image goes into the folder of the sample it was assigned to.
            tar.extract(member,
                        os.path.join('./data/images', get_sample(member), wnid))

Quick check that this worked

In [26]:
import numpy as np

In [27]:
# Recompute the split proportions from the extracted files on disk.
for wnid in image_level_statistics['WnID'].unique():
    folder_size = np.array([
        len(os.listdir(split_dir + wnid))
        for split_dir in ('./data/images/train/',
                          './data/images/val/',
                          './data/images/test/')])
    print(wnid)
    print(folder_size / folder_size.sum())

n01664674
[ 0.60035682  0.19982159  0.19982159]
n01665541
[ 0.6006192   0.19917441  0.2002064 ]
n01664065
[ 0.60016556  0.19950331  0.20033113]
n01668665
[ 0.60015528  0.19953416  0.20031056]
n01666228
[ 0.60068259  0.1996587   0.1996587 ]
n01664492
[ 0.6003996  0.1998002  0.1998002]
n01670802
[ 0.60016906  0.19949281  0.20033812]
n01672611
[ 0.60119048  0.19642857  0.20238095]
n01672432
[ 0.6  0.2  0.2]
n01670535
[ 0.60055607  0.19925857  0.20018536]
n01668436
[ 0.6  0.2  0.2]
n01663782
[ 0.60050042  0.19933278  0.20016681]
n01671125
[ 0.60047468  0.19936709  0.20015823]
n01667778
[ 0.60037879  0.19981061  0.19981061]
n01666585
[ 0.60025873  0.1992238   0.20051746]
n01671705
[ 0.6011236  0.1994382  0.1994382]
n01668892
[ 0.60016488  0.19950536  0.20032976]
n01667114
[ 0.6  0.2  0.2]
n01671479
[ 0.60017197  0.19948409  0.20034394]
n01669191
[ 0.6001506   0.19954819  0.2003012 ]
n01664990
[ 0.6  0.2  0.2]
n01669654
[ 0.60067114  0.19966443  0.19966443]


Looks good!

Finally, extract the bounding box data

In [28]:
# Create the split folders. makedirs(..., exist_ok=True) keeps the cell
# re-runnable; os.mkdir raises as soon as a folder already exists.
for sample_name in ('train', 'val'):
    os.makedirs(os.path.join('./data/bounding_boxes', sample_name), exist_ok=True)

def get_sample(img_name):
    """Return the sample assigned to the image called img_name (no extension)."""
    return image_level_statistics.loc[img_name, 'sample']

for tar_file in os.listdir('./data/archives/bounding_boxes'):
    # The archive is named after its WnID: drop the '.tar.gz' suffix (not
    # rstrip, which strips a set of characters rather than a suffix).
    wnid = tar_file[:-len('.tar.gz')] if tar_file.endswith('.tar.gz') else tar_file
    for sample_name in ('train', 'val'):
        os.makedirs(os.path.join('./data/bounding_boxes', sample_name, wnid), exist_ok=True)
    with tarfile.open(os.path.join('./data/archives/bounding_boxes', tar_file)) as tar:
        for member in tar:
            file_name = member.name.split('/')[-1]
            if not file_name.endswith('.xml'):
                # Only XML annotation files are of interest.
                continue
            img_name = file_name[:-len('.xml')]
            if img_name not in image_level_statistics.index:
                # Annotation without a matching image: skip it.
                continue
            # Drop the directory part so the file lands directly in the
            # target folder.
            member.name = file_name
            tar.extract(member,
                        os.path.join('./data/bounding_boxes', get_sample(img_name), wnid))

Check whether this worked as well

In [29]:
# Recompute the train/validation proportions of the extracted annotation
# files for every WnID that has annotated images.
annotated_wnids = image_level_statistics.loc[image_level_statistics['bboxes'], 'WnID'].unique()
for wnid in annotated_wnids:
    folder_size = np.array([
        len(os.listdir(split_dir + wnid))
        for split_dir in ('./data/bounding_boxes/train/',
                          './data/bounding_boxes/val/')])
    print(wnid)
    print(folder_size / folder_size.sum())

n01665541
[ 0.8  0.2]
n01664065
[ 0.80045872  0.19954128]
n01667778
[ 0.79951691  0.20048309]
n01667114
[ 0.79901961  0.20098039]
n01669191
[ 0.79916318  0.20083682]


Nice :)

# Get some more statistics for the images.

### Check for broken data

In [30]:
from PIL import Image

In [31]:
import warnings

# Maps the path of every image whose opening raised a warning to that warning.
broken_data = dict()

for sample in ('train', 'val', 'test'):
    for wnid in image_level_statistics['WnID'].unique():
        path = os.path.join('data', 'images', sample, wnid)
        for img in os.listdir(path):
            img_path = os.path.join(path, img)
            with warnings.catch_warnings():
                # Turn warnings into exceptions so they can be caught below.
                warnings.filterwarnings('error')
                try:
                    # The context manager closes the file again; without it
                    # every checked image leaves an open file handle behind.
                    with Image.open(img_path):
                        pass
                except Warning as e:
                    broken_data[img_path] = e

print('We have {} broken images.'.format(len(broken_data)))
print(broken_data)

We have 4 broken images.


Since we have so few broken images, we are just going to delete them.

In [36]:
# Remove every broken image from the statistics table and from disk.
for path in broken_data.keys():
    # The paths were built with os.path.join, so take them apart with os.path
    # as well instead of splitting on '/'. The '.JPEG' suffix is removed
    # explicitly because str.rstrip('.JPEG') strips a set of characters.
    file_name = os.path.basename(path)
    img_name = file_name[:-len('.JPEG')] if file_name.endswith('.JPEG') else file_name
    # Both guards make the cell safe to re-run after the images are gone.
    if img_name in image_level_statistics.index:
        image_level_statistics.drop(img_name, inplace=True)
    if os.path.exists(path):
        os.remove(path)
    print('Dropped and deleted {}'.format(img_name))

Dropped and deleted n01664674_3751
Dropped and deleted n01664492_10783
Dropped and deleted n01664492_1155
Dropped and deleted n01664674_1260


### Look at image resolutions

In [40]:
# Add empty placeholder columns for the image resolution.
for resolution_column in ('pixel_w', 'pixel_h'):
    image_level_statistics[resolution_column] = None

In [41]:
def get_resolution(sample, wnid, img_name):
    """Return the size of an extracted image as reported by PIL.

    Parameters
    ----------
    sample : str
        Sample folder the image was extracted to.
    wnid : str
        WnID folder of the image.
    img_name : str
        Image name without the '.JPEG' extension.

    Returns
    -------
    tuple
        The image's ``size`` attribute (width, height).
    """
    path = os.path.join('./data/images/', sample, wnid, img_name + '.JPEG')
    # Close the file right after reading the size; this function is called
    # once per image, so unclosed handles would pile up.
    with Image.open(path) as im:
        return im.size

In [42]:
# Look up the resolution of every image and store it in its row.
for _, record in image_level_statistics.reset_index().iterrows():
    resolution = get_resolution(record['sample'], record['WnID'], record['img_name'])
    image_level_statistics.loc[record['img_name'], ['pixel_w', 'pixel_h']] = resolution

In [98]:
# Cast the resolution columns to integers.
for resolution_column in ('pixel_w', 'pixel_h'):
    image_level_statistics[resolution_column] = (
        image_level_statistics[resolution_column].astype(int))

In [99]:
print('Descriptives width and height')
# Summary statistics of both resolution columns, truncated to integers.
resolution_columns = ['pixel_w', 'pixel_h']
image_level_statistics[resolution_columns].describe().astype(int)

Descriptives width and height


Unnamed: 0,pixel_w,pixel_h
count,21639,21639
mean,461,347
std,223,174
min,48,38
25%,375,286
50%,500,357
75%,500,375
max,4000,3658


Do the same by species.

In [87]:
# append species
image_level_statistics = pd.merge(
    image_level_statistics.reset_index(), 
    turtle_information.reset_index()[['WnID', 'Species']],
    how='left', on='WnID')

In [102]:
# Resolution summary statistics per species, truncated to integers.
resolution_by_species = image_level_statistics.groupby('Species')[['pixel_w', 'pixel_h']]
resolution_by_species.describe().astype(int)

Unnamed: 0_level_0,pixel_w,pixel_w,pixel_w,pixel_w,pixel_w,pixel_w,pixel_w,pixel_w,pixel_h,pixel_h,pixel_h,pixel_h,pixel_h,pixel_h,pixel_h,pixel_h
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max,count,mean,std,min,25%,50%,75%,max
Species,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2
Alligator Snapping Turtle,773,453,241,64,320,500,500,2560,773,345,184,45,242,350,375,1920
Atlantic Ridley,999,421,233,55,272,500,500,3264,999,319,209,42,200,375,376,3658
Box Turtle,1328,486,68,140,500,500,500,1342,1328,367,64,120,333,375,375,967
Common Snapping Turtle,1172,468,210,60,375,500,500,2272,1172,350,172,60,277,352,375,2048
Cooter,1213,437,279,60,225,500,500,3361,1213,322,202,38,165,333,386,2112
Desert Tortoise,1163,483,235,87,473,500,500,3072,1163,364,193,72,333,375,375,3072
European Tortoise,1079,430,293,55,209,400,524,3264,1079,331,232,55,170,301,432,2448
Giant Tortoise,1183,485,119,96,500,500,500,1920,1183,373,109,72,333,375,375,2560
Gopher Tortoise,1264,483,284,67,400,500,500,3264,1264,357,210,72,296,363,375,2448
Green Turtle,1199,465,213,87,400,500,500,2560,1199,356,154,61,320,375,377,1712


# Read the bounding box information from the xml files

In [138]:
import xml.etree.ElementTree as ET

def get_box_coordinates_from_xml(wnid, sample, img_name):
    """Read all bounding boxes of one image from its XML annotation file.

    Parameters
    ----------
    wnid : str
        WnID folder of the annotation.
    sample : str
        Sample folder the annotation was extracted to.
    img_name : str
        Image name without extension; the file read is ``img_name + '.xml'``.

    Returns
    -------
    list of list of str
        One inner list per ``<object>`` element that has a ``<bndbox>`` child,
        holding the text of the ``<bndbox>`` children in document order
        (presumably xmin, ymin, xmax, ymax - TODO confirm against the files).
        Objects without a ``<bndbox>`` are skipped.
    """
    xml_path = os.path.join('./data/bounding_boxes/', sample, wnid,
                            img_name + '.xml')
    tree = ET.parse(xml_path)
    boxes = []
    # 'obj' rather than 'object', which would shadow the builtin.
    for obj in tree.findall('object'):
        bbox = obj.find('bndbox')
        if bbox is None:
            # No box for this object; iterating None would raise a TypeError.
            continue
        boxes.append([str(child.text) for child in bbox])
    return boxes

In [150]:
# Read the boxes of every annotated image from its XML file.
annotated_rows = image_level_statistics.loc[image_level_statistics['bboxes']]
bounding_boxes_by_img = annotated_rows.apply(
    lambda row: get_box_coordinates_from_xml(row['WnID'], row['sample'], row['img_name']),
    axis=1,
).rename('box_coords')

In [151]:
print('We encounter the following number of boxes per image:')
# Distinct box counts across all annotated images.
boxes_per_image = bounding_boxes_by_img.apply(len)
print(boxes_per_image.unique())

We encounter the following number of boxes per image:
[1 2 3 4 6 5]


In [153]:
# Attach the box coordinates as a new column, aligned on the row index.
frames_to_join = [image_level_statistics, bounding_boxes_by_img]
image_level_statistics = pd.concat(frames_to_join, axis=1)

In [163]:
# Number of boxes per annotated image, attached as the column 'n_boxes'.
annotated_box_coords = image_level_statistics.loc[
    image_level_statistics['bboxes'], 'box_coords']
n_boxes = annotated_box_coords.apply(len).rename('n_boxes')
image_level_statistics = pd.concat([image_level_statistics, n_boxes], axis=1)

In [168]:
print('Let\'s look at the number of boxes by species')
# Distribution of the box count over the annotated images of each species.
annotated_images = image_level_statistics[image_level_statistics['bboxes']]
annotated_images.groupby('Species')['n_boxes'].describe()

Let's look at the number of boxes by species


Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
Species,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Box Turtle,478.0,1.008368,0.09119,1.0,1.0,1.0,1.0,2.0
Leatherback Turtle,415.0,1.031325,0.200198,1.0,1.0,1.0,1.0,3.0
Loggerhead,436.0,1.004587,0.067651,1.0,1.0,1.0,1.0,2.0
Mud Turtle,408.0,1.029412,0.219711,1.0,1.0,1.0,1.0,4.0
Terrapin,414.0,1.176329,0.590744,1.0,1.0,1.0,1.0,6.0


There are few images with more than one box: the 75th percentile for each species is still a single box!

In [172]:
print('Looking at this in more detail, we have the following number of images with more than one box.')
# Count, per species, the images that carry more than one box.
multi_box_images = image_level_statistics[image_level_statistics['n_boxes'] > 1]
multi_box_images.groupby('Species')['n_boxes'].count()

Looking at this in more detail, we have the following number of images with more than one box.


Species
Box Turtle             4
Leatherback Turtle    11
Loggerhead             2
Mud Turtle             9
Terrapin              49
Name: n_boxes, dtype: int64

In [187]:
# Total number of boxes over all annotated images.
total_boxes = int(image_level_statistics['n_boxes'].sum())
print('The total number of boxes available is {}'.format(total_boxes))

The total number of boxes available is 2255


In [170]:
# Persist the image-level table; the index is written as the first column.
statistics_csv_path = './results/data_analysis/image_level_statistics.csv'
image_level_statistics.to_csv(statistics_csv_path)

# Finally, we prepare a txt file that we use for training the F-RCNN
This matches the structure for the *simple-parser*: `filepath,x1,y1,x2,y2,class_name,imageset`.
The first stage trains merely for the distinction of turtletoise vs background. Thus, we only have a single class: *turtletoise*!

In [190]:
# Build one line per bounding box in the simple-parser layout:
# filepath, the box coordinates as stored in 'box_coords', class name, image set.
simple_parse = []
annotated_images = image_level_statistics[image_level_statistics['bboxes']]
for _, record in annotated_images.iterrows():
    sample = record['sample']
    wnid = record['WnID']
    img_file = record['img_name'] + '.JPEG'
    filepath = os.path.join('./data/images/', sample, wnid, img_file)
    simple_parse.extend(
        ','.join([filepath, ','.join(box), 'turtletoise', sample]) + '\n'
        for box in record['box_coords'])

In [191]:
# Write all annotation lines in one go; each line already ends in a newline.
with open('./f_rcnn_simple_parser_first_stage.txt', 'w') as f:
    f.write(''.join(simple_parse))

# Looking at a specific example for the train-validation-test split

In [1]:
import pandas as pd
# Reload the table saved earlier. The file was written with its index as the
# first column, so read that column back as the index instead of letting it
# show up as a spurious 'Unnamed: 0' data column.
image_level_statistics = pd.read_csv(
    './results/data_analysis/image_level_statistics.csv', index_col=0)

In [15]:
# Leatherback image counts: all images, and those with bounding boxes.
is_leatherback = image_level_statistics.Species == 'Leatherback Turtle'
leatherback_n_imgs = image_level_statistics.loc[is_leatherback, 'bboxes'].count()
leatherback_n_bbox_imgs = image_level_statistics.loc[
    is_leatherback & image_level_statistics.bboxes, 'bboxes'].count()

In [21]:
# Report both Leatherback counts in one sentence.
leatherback_message = 'For Leatherbacks, we have {} images total, {} of which also have bounding boxes'
print(leatherback_message.format(leatherback_n_imgs, leatherback_n_bbox_imgs))

For Leatherbacks, we have 969 images total, 415 of which also have bounding boxes


In [11]:
# Per sample: number of Leatherback images ('count') and how many of them
# have bounding boxes ('sum' of the boolean flag).
leatherback_images = image_level_statistics[
    image_level_statistics.Species == 'Leatherback Turtle']
leatherback_train_test_val = leatherback_images.groupby('sample')['bboxes'].agg(
    ['count', 'sum'])

In [19]:
# Display the per-sample image counts ('count') and annotated-image counts ('sum').
leatherback_train_test_val

Unnamed: 0_level_0,count,sum
sample,Unnamed: 1_level_1,Unnamed: 2_level_1
test,194,0.0
train,582,332.0
val,193,83.0


In [20]:
print('In percent')
# Share of all images, and share of all annotated images, per sample.
share_of_images = leatherback_train_test_val['count'] / leatherback_n_imgs
share_of_bbox_images = leatherback_train_test_val['sum'] / leatherback_n_bbox_imgs
pd.concat([share_of_images, share_of_bbox_images], axis=1) * 100

In percent


Unnamed: 0_level_0,count,sum
sample,Unnamed: 1_level_1,Unnamed: 2_level_1
test,20.02064,0.0
train,60.06192,80.0
val,19.917441,20.0


In [23]:
# Non-annotated Leatherback images left after removing the test images
# (194 is the 'test' count from the table above).
leatherback_n_imgs - leatherback_n_bbox_imgs - 194

360

In [24]:
# Training images without bounding boxes: the 582 training images minus the
# 332 of them that are annotated (both numbers from the table above).
582-332

250