# Split into train/dev/test

1. Split Coco `train2017` into train and validation sets
    1. Split into train and development using 80/20 from each of the disjoint categories.
    2. Select 20 images at random from each of the categories of interest, plus 20 from the images that contain none of them, for `train1`.
2. Use Coco `val2017` as test set

Save the image IDs of each split into `.txt` files (each file holds one JSON-encoded list).

## Imports

In [1]:
%matplotlib inline
from pycocotools.coco import COCO

import json
import numpy as np
import pandas as pd
import skimage.io as io
import matplotlib.pyplot as plt
import pylab
pylab.rcParams['figure.figsize'] = (8.0, 10.0)

## Load `strCatNms_to_imgIds` mapping from `train2017` to do (1.A)

In [2]:
# Dataset root and the COCO split whose instance annotations are loaded next.
dataDir = '..'
dataType = 'train2017'
annFile = f'{dataDir}/annotations/instances_{dataType}.json'

In [3]:
# Build the COCO API object from the instance-annotation file chosen above
# (loads the annotations into memory and creates the lookup index).
coco = COCO(annotation_file=annFile)

loading annotations into memory...
Done (t=7.20s)
creating index...
index created!


In [4]:
# Load the pre-computed mapping for the current `dataType`.
# Judging by the displayed counts, each key is a space-separated combination of
# category names ('' for images with none of them) and each value is a list of ids.
with open(f'../my_annotations/strCatNms_to_imgIds_{dataType}.json', 'r') as f:
    strCatNms_to_imgIds = json.load(f)

# Number of image ids stored under each key; displayed as the cell output.
strCatNms_to_ImgIdsNum = {
    combo: len(ids_for_combo)
    for combo, ids_for_combo in strCatNms_to_imgIds.items()
}
strCatNms_to_ImgIdsNum

{'': 46223,
 'bird': 2351,
 'bird cat': 63,
 'bird cat dog': 4,
 'bird cat dog person': 3,
 'bird cat person': 3,
 'bird dog': 24,
 'bird dog person': 36,
 'bird person': 753,
 'cat': 3199,
 'cat dog': 155,
 'cat dog person': 46,
 'cat person': 641,
 'dog': 2153,
 'dog person': 1964,
 'person': 60669}

In [5]:
# Stratified 80/20 split: every key of `strCatNms_to_imgIds` contributes the first
# 80% of its shuffled image ids to train and the remainder to dev.
train_ids = []
dev_ids = []

for key, ids_for_key in strCatNms_to_imgIds.items():
    # Shuffle a copy so the loaded mapping itself is left untouched.
    shuffled = list(ids_for_key)

    # The seed is reset before every shuffle, so each group's permutation is
    # reproducible independently of the other groups.
    np.random.seed(42)
    np.random.shuffle(shuffled)

    split_at = int(len(shuffled) * 0.8)
    train_part, dev_part = shuffled[:split_at], shuffled[split_at:]

    # Progress report: group name, sizes of both parts, and a peek at the first ids.
    print(key, len(train_part), len(dev_part))
    print('\t', train_part[:5], dev_part[:5])

    train_ids += train_part
    dev_ids += dev_part

 36978 9245
	 [125616, 249356, 137123, 15684, 149598] [564789, 507381, 181122, 364470, 241528]
bird 1880 471
	 [351852, 575758, 64300, 412355, 97195] [499516, 76310, 427679, 4283, 504169]
bird cat 50 13
	 [293757, 569975, 318594, 379620, 236941] [365619, 536175, 62231, 84783, 196393]
bird cat dog 3 1
	 [456438, 99645, 87456] [108923]
bird cat dog person 2 1
	 [345434, 392035] [257909]
bird cat person 2 1
	 [244933, 321861] [173814]
bird dog 19 5
	 [374564, 367699, 451976, 373346, 105918] [315427, 38440, 100689, 298468, 234785]
bird dog person 28 8
	 [178431, 39081, 457442, 64233, 16957] [495695, 333951, 565087, 250516, 428746]
bird person 602 151
	 [135045, 519899, 248793, 303318, 449071] [75719, 538444, 65307, 212672, 429741]
cat 2559 640
	 [235700, 207282, 386619, 342244, 39171] [5469, 319865, 338882, 174527, 575923]
cat dog 124 31
	 [173825, 143824, 316008, 70754, 117108] [117946, 542060, 299163, 181079, 374922]
cat dog person 36 10
	 [307423, 481212, 427965, 530811, 124122] [1319, 

In [6]:
# Sanity check: sizes of the train and dev id lists produced by the split above.
len(train_ids), len(dev_ids)

(94623, 23664)

In [7]:
# Sanity check: total number of ids across train and dev.
len(train_ids) + len(dev_ids)

118287

In [8]:
# Sanity check: achieved train fraction. It is slightly below 0.8 because the
# cutoff of every group is truncated with int().
len(train_ids) / (len(train_ids) + len(dev_ids))

0.7999442035050344

## For each of the generic categories of interest choose 20 images and put into `train1_ids`

## CHOOSE: `CATEGORIES_OF_INTEREST`

In [9]:
# Category names that get their own slice of `train1`; kept in alphabetical order.
CATEGORIES_OF_INTEREST = ['person', 'dog', 'cat', 'bird']
CATEGORIES_OF_INTEREST.sort()
CATEGORIES_OF_INTEREST

['bird', 'cat', 'dog', 'person']

In [10]:
# Number of images drawn per category of interest (and for the "rest" group)
# when building `train1_ids`.
IMGS_PER_CLASS = 20

In [11]:
# Build `train1_ids`: IMGS_PER_CLASS images per category of interest plus
# IMGS_PER_CLASS images from the "rest" group (key '' of `strCatNms_to_imgIds`),
# all taken from the train split only.
#
# Sampling is done WITHOUT replacement and ids already chosen are excluded from
# later draws, so the result holds no duplicate image ids. If a candidate pool is
# smaller than IMGS_PER_CLASS, np.random.choice raises ValueError instead of
# silently repeating images.
train1_ids = []

train_id_set = set(train_ids)  # built once instead of once per category
already_chosen = set()         # ids picked so far, across all groups

for catNm in CATEGORIES_OF_INTEREST:
    # Wrap the name in a list so it is matched as a whole category name
    # rather than being treated as a sequence of characters.
    catIds = coco.getCatIds(catNms=[catNm])
    imgIds = coco.getImgIds(catIds=catIds)
    train_ids_in_this_cat = set(imgIds).intersection(train_id_set)
    print(catNm, catIds, len(imgIds), len(train_ids_in_this_cat))

    # Sort the candidates so the seeded draw does not depend on set iteration order.
    candidates = sorted(train_ids_in_this_cat - already_chosen)
    np.random.seed(42)
    # .tolist() yields native Python ints, which json can serialise later on.
    random_ids = np.random.choice(candidates, IMGS_PER_CLASS, replace=False).tolist()

    train1_ids.extend(random_ids)
    already_chosen.update(random_ids)

print(len(train1_ids))


# add IMGS_PER_CLASS images from the rest (images stored under the '' key)
rest_ids_all = strCatNms_to_imgIds['']
train_ids_rest = set(rest_ids_all).intersection(train_id_set)

candidates_rest = sorted(train_ids_rest - already_chosen)
np.random.seed(42)
random_ids_rest = np.random.choice(candidates_rest, IMGS_PER_CLASS, replace=False).tolist()
print('rest', '[no specific id]', len(rest_ids_all), len(train_ids_rest))
train1_ids.extend(random_ids_rest)

# Every selected image must appear exactly once.
assert len(set(train1_ids)) == len(train1_ids), 'duplicate image ids in train1_ids'

print(len(train1_ids))

bird [16] 3237 2586
cat [17] 4114 3288
dog [18] 4385 3505
person [1] 64115 51288
80
rest [no specific id] 46223 36978
100


In [12]:
%ls ../my_splits/

dev_ids.txt     test_ids.txt    train1_ids.txt  train_ids.txt


In [13]:
# Confirm both lists hold native Python ints (numpy integers cannot be dumped with json).
type(train1_ids[0]), type(train_ids[0])

(int, int)

### Get ids from `val2017` and save them as `test_ids`

In [14]:
# Switch to the val2017 split. NOTE: this rebinds `dataType` and `annFile`,
# so earlier cells that read them will see the val2017 values if re-run.
dataDir = '..'
dataType = 'val2017'
annFile = f'{dataDir}/annotations/instances_{dataType}.json'

In [15]:
# Rebuild the COCO API object for the val2017 instance annotations.
# This replaces the train2017 object previously bound to `coco`.
coco = COCO(annotation_file=annFile)

loading annotations into memory...
Done (t=0.23s)
creating index...
index created!


In [16]:
# Number of image ids in the currently loaded (val2017) annotations.
len(coco.getImgIds())

5000

In [17]:
# All image ids of the currently loaded (val2017) annotations form the test set.
test_ids = coco.getImgIds()

### Save `train`, `dev`, `train1`, and `test` ids

In [18]:
# Write every split to its own file as a JSON-encoded list of image ids.
splits_to_save = [
    ('../my_splits/train_ids.txt', train_ids),
    ('../my_splits/dev_ids.txt', dev_ids),
    ('../my_splits/train1_ids.txt', train1_ids),
    ('../my_splits/test_ids.txt', test_ids),
]

for split_path, split_ids in splits_to_save:
    with open(split_path, 'w') as f:
        json.dump(split_ids, f)

In [19]:
# Read the saved files back, rebinding each split name to the list parsed from disk.
with open('../my_splits/train_ids.txt', 'r') as f:
    train_ids = json.load(f)

with open('../my_splits/dev_ids.txt', 'r') as f:
    dev_ids = json.load(f)

with open('../my_splits/train1_ids.txt', 'r') as f:
    train1_ids = json.load(f)

with open('../my_splits/test_ids.txt', 'r') as f:
    test_ids = json.load(f)

**Note**, all sets have their annotation files in `../my_annotations/`
* train and development sets in files with `*_train2017*`
* test set in files with `*_val2017*`