# Image Net Preprocessing
Notebook di processamento delle immagini di Image Net. Obiettivo è realizzare un batch input che, sfruttando il meccasnismo a code descritto in <a href=https://www.tensorflow.org/programmers_guide/reading_data>Tensorflow</a>, fornisca batch della dimensione desiderata per il numero di epoche desiderato.

Viene inoltre sfruttanto l'algoritmo di <a href=https://github.com/tensorflow/models/blob/master/slim/preprocessing/inception_preprocessing.py>Inception preprocessing</a> per fornire in input immagini della dimensione corretta con le correzioni preaddestramento fornite da Tensorflow

In [21]:
#Import
import pandas as pd
import numpy as np
import os
import tensorflow as tf
import random
from PIL import Image
#Inception preprocessing code from https://github.com/tensorflow/models/blob/master/slim/preprocessing/inception_preprocessing.py
#useful to maintain training dimension
from utils import inception_preprocessing
import sys

In [2]:
#Global Variables
IMAGE_NET_ROOT_PATH = '/var/ifs/data/tiny-imagenet-200/'
#IMAGE_NET_ROOT_PATH = '/data/lgrazioli/'
IMAGE_NET_LABELS_PATH = IMAGE_NET_ROOT_PATH + 'words.txt'
IMAGE_NET_TRAIN_PATH = IMAGE_NET_ROOT_PATH + 'train/'

### Lettura file words di ImageNet
Lettura del file words di ImageNet come PandaDF. A ogni id (cartella che contiene immagini per le classi fornite) vengono assegnati i label

In [13]:
#Reading label file as Panda dataframe
labels_df = pd.read_csv(IMAGE_NET_LABELS_PATH, sep='\\t', header=None, names=['id','labels'])
labels_df.head(5)

  from ipykernel import kernelapp as app


Unnamed: 0,id,labels
0,n00001740,entity
1,n00001930,physical entity
2,n00002137,"abstraction, abstract entity"
3,n00002452,thing
4,n00002684,"object, physical object"


In [14]:
labels_df.count()

id        82115
labels    82114
dtype: int64

Aggiunta colonna di lunghezza del label (quante classi contiene ogni label).

In [5]:
#new_labels = []
labels_lengths = []
for idx, row in labels_df.iterrows():
    #Convertire a stringa perchè alcuni sono float
    current_labels = tuple(str(row['labels']).split(','))
    #new_labels.append(current_labels)
    labels_lengths.append(len(current_labels))

In [6]:
labels_df['labels_length'] = labels_lengths
labels_indices = [idx for idx, _ in labels_df.iterrows()]
labels_df['indices'] = labels_indices

In [7]:
labels_df.head(20)

Unnamed: 0,id,labels,labels_length,indices
0,n00001740,entity,1,0
1,n00001930,physical entity,1,1
2,n00002137,"abstraction, abstract entity",2,2
3,n00002452,thing,1,3
4,n00002684,"object, physical object",2,4
5,n00003553,"whole, unit",2,5
6,n00003993,congener,1,6
7,n00004258,"living thing, animate thing",2,7
8,n00004475,"organism, being",2,8
9,n00005787,benthos,1,9


### Train DF
Panda Dataframe che contiene i path di tutte le immagini, la relativa classe (per fare poi lookup sul label indicizzato) e id della classe. 
<b>Può richiedere del tempo. Per lanciare su un campione si può bloccare a un determinato valore di idx</b>

In [18]:
train_paths = []
for idx, label_dir in enumerate(os.listdir(IMAGE_NET_TRAIN_PATH)):
    image_dir_path = IMAGE_NET_TRAIN_PATH + label_dir + '/images/'
    print("Processing label {0}".format(label_dir))
    for image in os.listdir(image_dir_path):
        train_paths.append((image_dir_path + image, 
                            image.split('.')[0].split('_')[0],
                            image.split('.')[0].split('_')[1]))
train_df = pd.DataFrame(train_paths, columns=['im_path','class', 'im_class_id'])
print(train_df.count())
train_df.head()

Processing label n07747607
Processing label n02917067
Processing label n03400231
Processing label n04179913
Processing label n03837869
Processing label n02074367
Processing label n03100240
Processing label n03617480
Processing label n04507155
Processing label n02481823
Processing label n01910747
Processing label n01784675
Processing label n01774750
Processing label n01855672
Processing label n03970156
Processing label n02948072
Processing label n03733131
Processing label n02909870
Processing label n04596742
Processing label n03089624
Processing label n04398044
Processing label n07720875
Processing label n03814639
Processing label n01774384
Processing label n03447447
Processing label n02669723
Processing label n02085620
Processing label n02814533
Processing label n02125311
Processing label n02791270
Processing label n04560804
Processing label n02843684
Processing label n01950731
Processing label n02113799
Processing label n09256479
Processing label n04487081
Processing label n02321529
P

Unnamed: 0,im_path,class,im_class_id
0,/var/ifs/data/tiny-imagenet-200/train/n0774760...,n07747607,290
1,/var/ifs/data/tiny-imagenet-200/train/n0774760...,n07747607,427
2,/var/ifs/data/tiny-imagenet-200/train/n0774760...,n07747607,339
3,/var/ifs/data/tiny-imagenet-200/train/n0774760...,n07747607,11
4,/var/ifs/data/tiny-imagenet-200/train/n0774760...,n07747607,400


Pulizia delle immagini che non sono nel formato desiderato da inception_preprocessing (3 canali). 
<b>Operazione lunga!</b>

In [23]:
#Remove black and white images
uncorrect_images = 0
#Salvataggio indici di immagini da eliminare
to_remove_indexes = []
for idx, record in train_df.iterrows():
    #Leggo immagine come np.array
    im_array = np.array(Image.open(record['im_path']))
    #Se non ha 3 canali la aggiungo a quelle da eliminare
    if im_array.shape[-1] != 3:
        uncorrect_images += 1
        to_remove_indexes.append(idx)
    if idx % 20 == 0:
        sys.stdout.write("\rProcessed {0} images".format(idx))
        sys.stdout.flush()

#Rimozione righe identificate
train_df = train_df.drop(train_df.index[to_remove_indexes])

print("New size: {0}".format(len(train_df)))
print("Removed {0} images".format(uncorrect_images))

Processed 99980 imagesNew size: 98179
Removed 1821 images


In [24]:
#Eventuale campionamento da passare al generatore input
example_file_list = list(train_df.im_path)
len(example_file_list)

98179

### Input pipeline
Definizione della input pipeline al modello TF

<b>NB: La memoria della GPU non va MAI oltre i 100MB!</b>

In [30]:
EPOCHS = 5
BATCH_SIZE = 128
#Serve per capire quando il generatore è passato a batch appartenenti a una nuova epoca 
BATCH_PER_EPOCH = np.ceil(len(example_file_list) / BATCH_SIZE)

def parse_single_image(filename_queue):
    #Dequeue a file name from the file name queue
    filename = filename_queue.dequeue()
    #Read image
    raw = tf.read_file(filename)
    #convert in jpg (in GPU!)
    jpeg_image = tf.image.decode_jpeg(raw)
    #Preprocessing with inception preprocessing
    jpeg_image = inception_preprocessing.preprocess_image(jpeg_image, 300, 300, is_training=True)
    return jpeg_image
#jpeg_image = parse_single_image(filename_queue)

def get_batch(filenames, batch_size, num_epochs=None):
    #Coda lettura file
    filename_queue = tf.train.input_producer(example_file_list)
    
    #Lettura singolo record
    jpeg_image = parse_single_image(filename_queue)
    
    # min_after_dequeue defines how big a buffer we will randomly sample
    #   from -- bigger means better shuffling but slower start up and more
    #   memory used.
    # capacity must be larger than min_after_dequeue and the amount larger
    #   determines the maximum we will prefetch.  Recommendation:
    #   min_after_dequeue + (num_threads + a small safety margin) * batch_size
    min_after_dequeue = 10
    capacity = min_after_dequeue + 3 * batch_size
    
    #tensors è la lista dei tensori delle single feature e immagini. Esegue batch_size volte i tensori example e label per ottenere il batch
    #num_threads incrementa effettivamente l'utilizzo della CPU (confermato dal throughput visisible sul cloudera manager,
    #resta comunque un throughput lento ....
    example_batch = tf.train.shuffle_batch(
        tensors=[jpeg_image], batch_size=batch_size, capacity=capacity,
        min_after_dequeue=min_after_dequeue, allow_smaller_final_batch=True, num_threads=8)
    
    return example_batch

#TF Graph, per ora recupera solamente un batch
x = get_batch(example_file_list, batch_size=BATCH_SIZE)


In [None]:
#GPU config
config = tf.ConfigProto(log_device_placement=True)
config.gpu_options.allow_growth = True
with tf.Session(config=config) as sess:
    #Start populating the filename queue.
    coord = tf.train.Coordinator()
    #Senza questa chiamata non partono i thread per popolare la coda che permette di eseguire la read
    threads = tf.train.start_queue_runners(coord=coord)
    #Current epoch and step servono a capire quando cambiare epoca e quando fermarsi
    current_epoch = 0
    current_step = 0
    while current_epoch < EPOCHS: 
        x_batch = sess.run(x)
        #print(x_batch.shape)
        if current_step % 10 == 0:
            #print("Batch shape {}".format(x_batch.shape))
            print("Current step: {0}".format(current_step))
        #Cambiare epoca, raggiunto il massimo per l'epoca corrente
        if current_step == (BATCH_PER_EPOCH - 1):
            current_epoch += 1
            current_step = 0
            print("EPOCH {0}".format(current_epoch))
        #Epoche terminate -> chiudere
        if current_epoch >= EPOCHS:
            break
        current_step +=  1
    #for i in range(10):
        #converted_im = sess.run(jpeg_image)
        #print(converted_im.shape)
        
    #Chiusura del coordinator (chiudi i thread di lettura)
    coord.request_stop()
    coord.join(threads)
    sess.close()

Current step: 0
Current step: 10
Current step: 20
Current step: 30
Current step: 40
Current step: 50
Current step: 60
Current step: 70
Current step: 80
Current step: 90
Current step: 100
Current step: 110
Current step: 120
Current step: 130
Current step: 140
Current step: 150
Current step: 160
Current step: 170
Current step: 180
Current step: 190
Current step: 200
Current step: 210
Current step: 220
Current step: 230
Current step: 240
Current step: 250
Current step: 260
Current step: 270
Current step: 280
Current step: 290
Current step: 300
Current step: 310
Current step: 320
Current step: 330
Current step: 340
Current step: 350
Current step: 360
Current step: 370
Current step: 380
Current step: 390
Current step: 400
Current step: 410
Current step: 420
Current step: 430
Current step: 440
Current step: 450
Current step: 460
Current step: 470
Current step: 480
Current step: 490
Current step: 500
Current step: 510
Current step: 520
Current step: 530
Current step: 540
Current step: 550
Cur

## TO Be Defined

In [12]:
#Count distinct delle directory immagini
distinct_df = labels_df.groupby('id').labels.nunique()
for idx, d in enumerate(distinct_df):
    if d > 1:
        print("Id {0}, counter {1}".format(idx, d))
        
#Nessuna cartella viene utilizzata per più classi!

### Label one hot encoding
Ogni sequenza di classi è una classe stessa

In [9]:
num_classes = len(set(labels_df['labels']))

In [10]:
num_classes

76003

In [12]:
labels_indices = [idx for idx, _ in labels_df.iterrows()]

In [113]:
def process_image(im_tensor):
    return inception_preprocessing.preprocess_image(im_tensor, 300, 300, is_training=True)
    

def get_batches(images_df, labels, batch_size):
    for i in range(0, len(images_df), batch_size):
        #Batch sul Dataframe dei riferimenti al training set
        x_images_df = images_df[i:i+batch_size]
        x = []
        y = []
        #Ciclo su tutto il batch
        for idx, x_image in x_images_df.iterrows():
            #Appendo il numpy array dell'immagine all'array del batch già normalizzato tra 0 e 1
            x.append(np.array(Image.open(str(x_image['im_path']))) / 255.0)
            #Recupero il label corrispondente nel dataset
            corresponding_label = labels_df[labels_df['id'] == x_image['class']]
            #Appendo l'indice
            y.append(int(corresponding_label['indices']))
        
        yield x,y

#batches = get_batches(None, labels_indices, 128)
#for y in batches:
    #print(len(y))

In [99]:
labels_df['indices'] = labels_indices

In [17]:
smpl_label = [ labels_indices[i] for i in sorted(random.sample(range(len(labels_indices)), 128)) ]

In [19]:
len(smpl_label)

128

In [30]:
for i in range(0, 10, 2):
    print(i)

0
2
4
6
8


In [107]:
for idx, record in train_df[0:1].iterrows():
    print(np.array(Image.open(record['im_path'])))
    corresponding_label = labels_df[labels_df['id'] == record['class']]
    print(int(corresponding_label['indices']))

[[[184 127  40]
  [186 133  57]
  [180 134  74]
  ..., 
  [155 137  97]
  [163 135  96]
  [165 131  93]]

 [[199 147  63]
  [182 134  60]
  [171 129  71]
  ..., 
  [158 117  55]
  [188 136  76]
  [171 112  52]]

 [[177 134  55]
  [172 132  62]
  [177 142  88]
  ..., 
  [190 116  27]
  [195 111  21]
  [203 114  24]]

 ..., 
 [[240 153  58]
  [246 157  63]
  [249 158  67]
  ..., 
  [233 228 186]
  [228 230 191]
  [225 231 195]]

 [[243 153  57]
  [254 161  68]
  [255 158  67]
  ..., 
  [231 226 188]
  [226 227 193]
  [222 229 195]]

 [[248 153  59]
  [255 163  72]
  [255 157  67]
  ..., 
  [230 225 187]
  [225 226 192]
  [221 228 195]]]
42217


In [115]:
y_class = tf.placeholder(tf.int32)
labels = tf.one_hot(y_class, num_classes)
im_tensor = tf.placeholder(tf.float32, (None, None, 3))
input_train_tensor = inception_preprocessing.preprocess_image(im_tensor, 300, 300, is_training=True)

In [118]:
sample_epochs = 5
config = tf.ConfigProto(log_device_placement=True)
config.gpu_options.allow_growth = True
with tf.Session(config=config) as sess:
    for ep in range(sample_epochs):
        print()
        batches = get_batches(train_df, labels_indices, 128)
        for x, y in batches:
            one_hot = sess.run(labels, feed_dict={y_class:y})
            tensor_x_list = []
            for record_x in x:
                record_train_x = sess.run(input_train_tensor, feed_dict={im_tensor: record_x})
                tensor_x_list.append(record_train_x)
            print(tensor_x_list)
            print(one_hot.shape)
            print('----------')


[array([[[ 0.26116574,  0.30951869, -0.09488744],
        [ 0.26116574,  0.30951869, -0.09488744],
        [ 0.26116574,  0.30951869, -0.09488744],
        ..., 
        [ 0.67668211,  0.55366063,  0.1908673 ],
        [ 0.65780628,  0.53255773,  0.17533231],
        [ 0.63893044,  0.51145458,  0.15979719]],

       [[ 0.26127648,  0.30860364, -0.09528959],
        [ 0.26127648,  0.30860364, -0.09528959],
        [ 0.26127648,  0.30860364, -0.09528959],
        ..., 
        [ 0.66873968,  0.54430962,  0.18536592],
        [ 0.64742982,  0.52070761,  0.16720188],
        [ 0.62611997,  0.49710572,  0.14903796]],

       [[ 0.26138711,  0.30768859, -0.0956918 ],
        [ 0.26138711,  0.30768859, -0.0956918 ],
        [ 0.26138711,  0.30768859, -0.0956918 ],
        ..., 
        [ 0.66079712,  0.53495848,  0.17986453],
        [ 0.63705337,  0.50885761,  0.1590718 ],
        [ 0.6133095 ,  0.48275673,  0.13827896]],

       ..., 
       [[ 0.81932259,  0.54239237,  0.13359046],
      

[array([[[ 0.53785086,  0.12461066, -0.90848964],
        [ 0.53785086,  0.12461066, -0.90848964],
        [ 0.53785086,  0.12461066, -0.90848964],
        ..., 
        [ 0.44373322, -0.02021807, -0.84761673],
        [ 0.44373322, -0.02164656, -0.84475976],
        [ 0.44373322, -0.02307498, -0.84190279]],

       [[ 0.53785086,  0.1258862 , -0.91359133],
        [ 0.53785086,  0.1258862 , -0.91359133],
        [ 0.53785086,  0.1258862 , -0.91359133],
        ..., 
        [ 0.44061339, -0.0225386 , -0.85328746],
        [ 0.43890536, -0.02543724, -0.85213852],
        [ 0.43719721, -0.02833551, -0.85098964]],

       [[ 0.53785086,  0.12716162, -0.91869307],
        [ 0.53785086,  0.12716162, -0.91869307],
        [ 0.53785086,  0.12716162, -0.91869307],
        ..., 
        [ 0.43749356, -0.02485895, -0.85895807],
        [ 0.4340775 , -0.02922761, -0.85951728],
        [ 0.43066132, -0.0335961 , -0.86007643]],

       ..., 
       [[ 0.4202038 ,  0.27480459, -0.51341271],
       

[array([[[-1.        , -0.70934159, -1.        ],
        [-1.        , -0.70934159, -1.        ],
        [-1.        , -0.70934159, -1.        ],
        ..., 
        [-1.        , -0.99807364, -1.        ],
        [-1.        , -0.98311937, -1.        ],
        [-1.        , -0.96816516, -1.        ]],

       [[-1.        , -0.7239821 , -1.        ],
        [-1.        , -0.7239821 , -1.        ],
        [-1.        , -0.7239821 , -1.        ],
        ..., 
        [-1.        , -0.94964141, -1.        ],
        [-1.        , -0.93092942, -1.        ],
        [-1.        , -0.91221744, -1.        ]],

       [[-1.        , -0.73862261, -1.        ],
        [-1.        , -0.73862261, -1.        ],
        [-1.        , -0.73862261, -1.        ],
        ..., 
        [-1.        , -0.90120912, -1.        ],
        [-1.        , -0.87873942, -1.        ],
        [-1.        , -0.85626972, -1.        ]],

       ..., 
       [[-0.82963139,  0.20830548, -0.71430498],
       

[array([[[ 0.32245398, -0.18100554, -0.79634529],
        [ 0.27578735, -0.23528016, -0.8525219 ],
        [ 0.22912061, -0.28955466, -0.90869832],
        ..., 
        [-0.62656569, -0.91525346, -1.        ],
        [-0.62656569, -0.91525346, -1.        ],
        [-0.62656569, -0.91525346, -1.        ]],

       [[ 0.41385269, -0.03299594, -0.66549033],
        [ 0.35716832, -0.10953659, -0.74305832],
        [ 0.3004837 , -0.18607748, -0.8206262 ],
        ..., 
        [-0.60131079, -0.9108696 , -1.        ],
        [-0.60131079, -0.9108696 , -1.        ],
        [-0.60131079, -0.9108696 , -1.        ]],

       [[ 0.50525141,  0.1150142 , -0.53463531],
        [ 0.43854916,  0.01620698, -0.63359475],
        [ 0.37184691, -0.08260036, -0.73255426],
        ..., 
        [-0.57605582, -0.90687776, -1.        ],
        [-0.57605582, -0.90687776, -1.        ],
        [-0.57605582, -0.90687776, -1.        ]],

       ..., 
       [[ 0.51068926, -0.0263344 , -0.87662196],
       

[array([[[-0.33173281, -0.35494572, -0.42458439],
        [-0.33173281, -0.35494572, -0.42458439],
        [-0.33173281, -0.35494572, -0.42458439],
        ..., 
        [-0.20320356, -0.0324598 , -0.01100075],
        [-0.24256724, -0.06434387, -0.04587662],
        [-0.28193086, -0.09622777, -0.08075243]],

       [[-0.31270015, -0.3373574 , -0.40699607],
        [-0.31270015, -0.3373574 , -0.40699607],
        [-0.31270015, -0.3373574 , -0.40699607],
        ..., 
        [-0.19876218, -0.03067607, -0.01010281],
        [-0.23672575, -0.06199765, -0.04469562],
        [-0.27468938, -0.0933193 , -0.07928836]],

       [[-0.29366744, -0.31976908, -0.38940775],
        [-0.29366744, -0.31976908, -0.38940775],
        [-0.29366744, -0.31976908, -0.38940775],
        ..., 
        [-0.19432074, -0.02889216, -0.00920486],
        [-0.23088437, -0.05965155, -0.04351461],
        [-0.26744795, -0.09041089, -0.07782435]],

       ..., 
       [[-0.74741912, -0.75515676, -0.7861073 ],
       

ValueError: Cannot feed value of shape (64, 64) for Tensor 'Placeholder_7:0', which has shape '(?, ?, 3)'

### Reading images 

In [65]:
for label_dir in os.listdir(IMAGE_NET_TRAIN_PATH):
    image_dir_path = IMAGE_NET_TRAIN_PATH + label_dir + '/images/'
    for image in os.listdir(image_dir_path):
        print(image_dir_path + image)
        im_array = np.array(Image.open(image_dir_path + image))
        im_tensor = preprocess_image(im_array, 300, 300, is_training=True)
        print(preprocess_image(im_tensor.shape))

/var/ifs/data/tiny-imagenet-200/train/n07747607/images/n07747607_290.JPEG


TypeError: data type not understood

In [9]:
from utils import inception_preprocessing

In [58]:
im_tensor = tf.placeholder(tf.float32, (None, None, 3))
input_train_tensor = inception_preprocessing.preprocess_image(im_tensor, 300, 300, is_training=True)

In [73]:
IMAGE_PATH = '/data/lgrazioli/042_hardtest.jpeg'
from PIL import Image

im = Image.open(IMAGE_PATH)
#im = im.resize((300,300), Image.ANTIALIAS)


im_array = np.array((np.array(im) / 255.0).astype(np.float32))
im_array = np.expand_dims(im_array, axis=0)

In [74]:
with tf.Session(config=config) as sess:
    pre_trained_im = sess.run(input_train_tensor, feed_dict={im_tensor: im_array[0]})
    print(pre_trained_im.shape)

(300, 300, 3)


In [14]:
100 / 16

6.25

In [133]:
np.array(Image.open(example_file_list[0])).shape

(64, 64, 3)