This notebook contains some simple code to generate the text files used by the batch generators (see [`kerasImgGenerator.ipynb`](http://ec2-54-153-84-85.us-west-1.compute.amazonaws.com:8890/notebooks/coco/Notebooks/dataHandling/kerasImgGenerator.ipynb)).

In [1]:
import os, sys, re
import os.path as osp

# ---- module-level configuration ----

# Directory layout, resolved from the current working directory.
# When this code is moved into a module, use instead:
# CURRDIR = osp.dirname(__file__)
CURRDIR = osp.abspath('.')
DATADIR = osp.abspath(osp.join(CURRDIR, '..', '..', 'data'))
TRAINDATA, VALDATA, TESTDATA = [
    osp.join(DATADIR, subdir)
    for subdir in ('train2014', 'val2014', 'test2015')
]
ANNDIR = osp.join(DATADIR, 'annotations')

# Annotation file paths: one keypoints JSON file per mode.
filenameTP = "person_keypoints_{}.json"
modes = ['train', 'val', 'test']
mode_date = ['train2014', 'val2014', 'test2015']

filenames = [osp.join(ANNDIR, filenameTP.format(tag)) for tag in mode_date]
annFiles = dict(zip(modes, filenames))

# Annotation objects, keyed by mode; every entry starts out as None
# (nothing loaded yet).
anns = dict.fromkeys(modes)

In [2]:
from pycocotools.coco import COCO
from warnings import warn

def load_anns(mode, unload=False):
    '''
        Load the annotations of `mode` into the module-level `anns` dict,
        or reset its entry to None when `unload` is true.

        mode is either 'train', 'val', or 'test'. Asking to load a mode
        whose entry is already set only emits a warning.
    '''
    if unload:
        anns[mode] = None
        return

    if anns[mode]:
        warn('{} annotations are already loaded.'.format(mode))
        return

    anns[mode] = COCO(annFiles[mode])

In [3]:
def getDirList(mode):
    '''
        Return the names of the files located directly inside the image
        directory of `mode` (sub-directories are not explored).

        mode is either 'train', 'val', or 'test'; any other value raises
        a ValueError. An empty list is returned when os.walk yields nothing
        for the directory (e.g. it does not exist).
    '''
    if mode == 'train':
        directory = TRAINDATA
    elif mode == 'val':
        directory = VALDATA
    elif mode == 'test':
        directory = TESTDATA
    else:
        # Fail with a clear message instead of an UnboundLocalError below.
        raise ValueError(
            "Unknown mode {!r}: expected 'train', 'val' or 'test'.".format(mode))

    # Only the first triple produced by os.walk (the top directory itself)
    # is of interest, so return as soon as it is available.
    for _dirpath, _dirnames, filenames in os.walk(directory):
        return list(filenames)

    return []

def generateDataList(mode, bbox=False):
    '''
        Write a text file listing the images of `mode`, one entry per line.

        Without bbox, the file is '<mode>.txt' and each line reads
        '<img_id> <file name> '. With bbox=True, the file is
        '<mode>_bbox.txt' and contains one line
        '<ann_id> <img_id> <file name> ' for every annotation id that the
        annotation object of `mode` returns for the image (the annotations
        are loaded first if needed).

        The image id is the number right before the '.jpg' extension, with
        leading zeros dropped. File names that do not end that way are
        reported on stdout and skipped. If the output file already exists
        in the current directory, a warning is issued and nothing is
        written. Always returns None.
    '''
    dirList = getDirList(mode)
    filename = str(mode)

    if bbox:
        if not anns[mode]:
            load_anns(mode)

        annFile = anns[mode]
        catIds = annFile.getCatIds()
        filename += '_bbox'

    filename += '.txt'

    if os.path.isfile('./' + filename):
        warn('{} already exists. '.format(filename) +
             'Erase it before generating it again.')
        return None

    # Trailing number of the file name, without its leading zeros. The dot
    # is escaped so that only a real '.jpg' extension is accepted.
    img_id_re = re.compile(r'([1-9][0-9]*)\.jpg$')

    # The context manager closes the file even if an annotation lookup fails.
    with open(filename, 'w') as f:
        for path in dirList:
            match = img_id_re.search(path)
            if match is None:
                print("{} was ignored".format(path))
                continue

            img_id = match.group(1)

            if bbox:
                # One line per annotation attached to this image.
                annIds = annFile.getAnnIds(imgIds=int(img_id),
                                           catIds=catIds)
                for annId in annIds:
                    f.write(' '.join([str(annId), img_id, path, '\n']))
            else:
                f.write(' '.join([img_id, path, '\n']))

    print('Succesfully saved {}'.format(filename))
    return None

In [7]:
# Build both the plain image list and the per-annotation (bbox) list
# for the train and val sets.
for mode in ('train', 'val'):
    for with_bbox in (False, True):
        generateDataList(mode, bbox=with_bbox)

Succesfully saved train.txt
Succesfully saved train_bbox.txt
Succesfully saved val.txt
Succesfully saved val_bbox.txt


In [4]:
# Load the train and val annotations (in that order).
for split in ('train', 'val'):
    load_anns(split)

loading annotations into memory...
Done (t=9.93s)
creating index...
index created!
loading annotations into memory...
Done (t=4.40s)
creating index...
index created!


In [16]:
import random

# Bounding boxes are kept only when width * height is strictly above this
# threshold (32 * 32).
MIN_BBOX_AREA = 32 * 32


def _filter_small_bboxes(mode, src_path, dst_path, min_area=MIN_BBOX_AREA):
    '''
        Copy to dst_path the lines of src_path whose annotation has a
        bounding box with width * height strictly greater than min_area.

        Every non-blank line must start with an annotation id followed by
        a space; that id is looked up in anns[mode], which has to be loaded
        beforehand. Blank lines are skipped.
    '''
    coco = anns[mode]
    with open(src_path, 'r') as src, open(dst_path, 'w') as dst:
        for line in src:
            if not line.strip():
                continue

            ann_id = line.split(' ')[0]
            annDict = coco.loadAnns(int(ann_id))[0]
            w, h = annDict['bbox'][2:4]
            if w * h > min_area:
                # The line already ends with a newline: write it unchanged
                # so that no blank line is inserted between entries.
                dst.write(line)


# Step 1 : remove train bboxes with area under 32 * 32

_filter_small_bboxes('train', 'train_bbox.txt', 'train_bbox_noS.txt')

# Step 2 : do the same for val data

_filter_small_bboxes('val', 'val_bbox.txt', 'val_bbox_noS.txt')

# Step 3 : shuffle the previous file for use in training

with open('val_bbox_noS.txt', 'r') as val_noS:
    data = val_noS.readlines()

random.shuffle(data)

with open('val_bbox_noS_shuffled.txt', 'w') as newVal_s:
    newVal_s.writelines(data)

In [13]:
# Look at one val annotation by id; as the last expression of the cell,
# the returned list is what gets displayed.
valanns = anns['val']
example_ann_id = 188229
valanns.loadAnns(example_ann_id)

[{u'area': 253.1772,
  u'bbox': [138.4, 269.56, 15.71, 35.22],
  u'category_id': 1,
  u'id': 188229,
  u'image_id': 280293,
  u'iscrowd': 0,
  u'keypoints': [0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0],
  u'num_keypoints': 0,
  u'segmentation': [[149.35,
    303.35,
    148.24,
    292.57,
    150.14,
    286.22,
    152.68,
    290.82,
    154.11,
    290.19,
    150.78,
    282.09,
    149.51,
    276.06,
    146.97,
    274.32,
    146.18,
    270.83,
    144.59,
    269.56,
    141.73,
    271.62,
    142.21,
    276.86,
    139.67,
    278.13,
    139.99,
    283.84,
    138.4,
    289.55,
    139.67,
    292.09,
    140.78,
    292.25,
    140.78,
    288.28,
    141.73,
    284.47,
    143.16,
    292.41,
    144.27,
    297