In [24]:
#Import libraries
import pandas as pd
from xml.dom import minidom
import os
import sys

In [23]:
#Global variables
ILSVRC_ROOT_PATH = '/var/ifs/data/ILSVRC'
ANNOTATIONS_PATH = ILSVRC_ROOT_PATH + '/Annotations/CLS-LOC'
ANNOTATIONS_TRAIN_PATH = ANNOTATIONS_PATH + '/train'
ANNOTATIONS_VAL_PATH = ANNOTATIONS_PATH + '/val'
IMAGES_PATH = ILSVRC_ROOT_PATH + '/Data/CLS-LOC'
IMAGES_TRAIN_PATH = IMAGES_PATH + '/train'

In [61]:
#Parsing Training Annotations
'''
Esempio struttura
Annotations/CLS-LOC/train
   n02787622 (root_id_dir)
      n02787622_1.xml
      n02787622_2.xml
      . . .

/Data/CLS-LOC/train
   n02787622
      n02787622_1.jpg
      n02787622_2.jpg
'''
def extract_image_channels(xml_path):
    #Check if annotated depth is 3
    xml_obj = minidom.parse(xml_path)
    depth_element = xml_obj.getElementsByTagName('depth')
    depth_str = depth_element[0].firstChild.data
    try:
        return int(depth_str)
    except:
        return 0

def extract_image_dims(xml_path):
    xml_obj = minidom.parse(xml_path)
    width_element = xml_obj.getElementsByTagName('width')
    height_element = xml_obj.getElementsByTagName('height')
    
    try:
        im_width = int(width_element[0].firstChild.data)
        im_height = int(height_element[0].firstChild.data)
        return  im_width * im_height
    except:
        return 0

pandas_df_list = []
removed_images = 0
for idx, root_id_dir in enumerate(os.listdir(ANNOTATIONS_TRAIN_PATH)):
    current_dir_path = ANNOTATIONS_TRAIN_PATH + '/' + root_id_dir
    for img_idx, img_xml in enumerate(os.listdir(current_dir_path)):
        #Parse XML
        full_xml_path = current_dir_path + '/' + img_xml
        #Check if image is depth 3
        im_depth = extract_image_channels(full_xml_path)
        if im_depth == 3:
            #Image OK
            current_image_path = '/ILSVRC/Data/CLS-LOC/train/' + root_id_dir + '/' + img_xml.split('.')[0] + '.JPEG'
            current_label = root_id_dir
            pandas_df_list.append((current_image_path, current_label, extract_image_dims(full_xml_path)))
        else:
            #Ignore image
            removed_images += 1
        if (img_idx * (idx + 1)) % 20 == 0:
            sys.stdout.write("\rProcessed folders: {0} Processed images {1}".format(idx+1, img_idx+1))
            sys.stdout.flush()

print("Process completed")
print("Overall images {0}, removed {1} images".format(len(pandas_df_list), removed_images))

Processed folders: 1000 Processed images 494Process completed
Overall images 544546, removed 0 images


In [62]:
images_df = pd.DataFrame(pandas_df_list, columns=['path','label', 'im_dim'])
len(images_df)

544546

In [63]:
images_df.head(5)

Unnamed: 0,path,label,im_dim
0,/ILSVRC/Data/CLS-LOC/train/n02823750/n02823750...,n02823750,166500
1,/ILSVRC/Data/CLS-LOC/train/n02823750/n02823750...,n02823750,167000
2,/ILSVRC/Data/CLS-LOC/train/n02823750/n02823750...,n02823750,187500
3,/ILSVRC/Data/CLS-LOC/train/n02823750/n02823750...,n02823750,187500
4,/ILSVRC/Data/CLS-LOC/train/n02823750/n02823750...,n02823750,166500


In [64]:
images_df.to_csv(path_or_buf='images_df.csv', sep=';', header=True, index=False)