In [1]:
import csv
import subprocess
import os
from tqdm import tqdm
import multiprocessing
from multiprocessing import Pool as thread_pool

In [4]:
# --- Paths -------------------------------------------------------------------
# Base directory that every other path in this notebook is derived from.
ROOT_DIR = '/plate-detector/'
# Built with os.path.join so that ROOT_DIR's trailing '/' does not produce a
# doubled separator. The final '' component keeps a trailing separator because
# later cells build file names by plain string concatenation onto DATA_DIR.
DATA_DIR = os.path.join(ROOT_DIR, 'data', 'OPEN_IMAGES_DATASET', '')

# --- Parallelism -------------------------------------------------------------
cpu_count = multiprocessing.cpu_count()
# Twice the CPU count: the workers spend their time waiting on downloads.
nthreads = cpu_count*2

# --- Annotation filters (1 = keep, 0 = skip annotations with that attribute) --
occluded = 1 # Include occluded images
truncated = 1 # Include truncated images
groupOf = 1 # Include groupOf images
depiction = 1 # Include depiction images
inside = 1 # Include inside images

# --- Dataset selection -------------------------------------------------------
mode = 'validation' # Dataset category - train, validation or test
# Output directory for this split; also the prefix of its annotations CSV name.
run_mode = DATA_DIR + mode
classes = ['Vehicle_registration_plate'] # Names of object classes to be downloaded

In [5]:
# Build the lookup from class display name to class identifier: every row of the
# descriptions CSV is read as (identifier, display name).
with open(DATA_DIR + 'class-descriptions-boxable.csv', mode='r', newline='', encoding='utf-8') as infile:
    reader = csv.reader(infile)
    # Rows with fewer than two fields (e.g. a trailing blank line) are ignored.
    dict_list = {rows[1]: rows[0] for rows in reader if len(rows) >= 2}

# Start from an empty output directory so files from a previous run never mix in.
subprocess.run(['rm', '-rf', run_mode], check=True)
os.makedirs(run_mode, exist_ok=True)

annotations_path = run_mode + "-annotations-bbox.csv"

# (include flag, annotation column) pairs, in the column order
# IsOccluded, IsTruncated, IsGroupOf, IsDepiction, IsInside.
# A flag of 0 means "drop annotations whose attribute value is > 0".
attribute_filters = (
    (occluded, 8),
    (truncated, 9),
    (groupOf, 10),
    (depiction, 11),
    (inside, 12),
)

commands = []
cnt = 0

for ind, class_name in enumerate(classes):

    print("Class "+str(ind) + " : " + class_name)

    class_dir = os.path.join(run_mode, class_name)
    os.makedirs(class_dir, exist_ok=True)

    # Class names are configured with underscores; the descriptions CSV uses spaces.
    display_name = class_name.replace('_', ' ')
    if display_name not in dict_list:
        raise KeyError("Class '%s' not found in class-descriptions-boxable.csv" % display_name)
    label_id = dict_list[display_name]

    # grep is used as a fast pre-filter over the (potentially very large) CSV.
    # Passing an argument list avoids breaking on paths containing spaces, and
    # -F makes the label a fixed string rather than a regular expression.
    grep_result = subprocess.run(
        ['grep', '-F', '--', label_id, annotations_path],
        stdout=subprocess.PIPE,
    )
    # grep exits with 1 when nothing matched (not an error) and >1 on real
    # failures such as a missing annotations file.
    if grep_result.returncode > 1:
        raise RuntimeError("grep failed (exit code %d) on %s" % (grep_result.returncode, annotations_path))
    class_annotations = grep_result.stdout.decode('utf-8').splitlines()

    for line in class_annotations:

        line_parts = line.split(',')

        # grep matches the label anywhere in the line, so confirm that the label
        # column (index 2, LabelName in the Open Images bbox CSV) is an exact
        # match; otherwise a longer label sharing this prefix would slip through.
        if len(line_parts) < 13 or line_parts[2] != label_id:
            continue

        image_id = line_parts[0]

        if any(include == 0 and int(line_parts[column]) > 0 for include, column in attribute_filters):
            print("Skipped %s" % image_id)
            continue

        cnt = cnt + 1

        # Stored as a tuple of arguments (no shell involved); tuples are hashable,
        # which lets duplicate downloads be removed below.
        commands.append((
            'aws', 's3', '--no-sign-request', '--only-show-errors', 'cp',
            's3://open-images-dataset/' + mode + '/' + image_id + '.jpg',
            os.path.join(class_dir, image_id + '.jpg'),
        ))

        # Append mode: one image can carry several boxes, one output line per box.
        # Columns 4-7 are written in file order right after the class name.
        with open(os.path.join(class_dir, image_id + '.txt'), 'a') as f:
            f.write(','.join([class_name, line_parts[4], line_parts[5], line_parts[6], line_parts[7]])+'\n')

print("Annotation Count : "+str(cnt))
# An image with several annotations yields the same command repeatedly;
# dict.fromkeys drops the duplicates while keeping first-seen order.
commands = list(dict.fromkeys(commands))
print("Number of images to be downloaded : "+str(len(commands)))

# The pool is created only once there is work for it, and the with-block
# guarantees the workers are released even if a download raises.
# subprocess.call returns each command's exit code, so failures can be reported.
with thread_pool(nthreads) as pool:
    exit_codes = list(tqdm(pool.imap(subprocess.call, commands), total = len(commands) ))

failed_downloads = sum(1 for code in exit_codes if code != 0)
if failed_downloads:
    print("%d downloads failed" % failed_downloads)

# Count the image files themselves instead of halving the directory size, which
# is wrong as soon as a single download fails (label file without an image).
downloaded_images = sum(
    1
    for class_name in classes
    for file_name in os.listdir(os.path.join(run_mode, class_name))
    if file_name.endswith('.jpg')
)
print(downloaded_images, 'images were downloaded')

Class 0 : Vehicle_registration_plate


  0%|          | 0/386 [00:00<?, ?it/s]

Annotation Count : 512
Number of images to be downloaded : 386


100%|██████████| 386/386 [08:37<00:00,  1.34s/it]


386.0 images were downloaded
