In [6]:
from tensorflow.keras.applications.inception_v3 import preprocess_input
from tensorflow.keras.preprocessing import image

import numpy as np
import gc
import time
import glob
import re
import os
from pathlib import Path

In [10]:
# The notebook is expected to run from a sub-folder of the project, so the
# project root is the parent of the current working directory.
path = Path.cwd()
root = path.parent.absolute()

# Shared inputs live in `fetching_path`; the preprocessed image batches are
# written to a sub-folder of it.
fetching_path = root / 'Shared Preprocessed Objects'
preprocessed_image_path = fetching_path / 'Preprocessed Images for Inception'

In [11]:
# Load the previously saved image path arrays.
# NOTE(review): `allow_pickle=True` unpickles the file contents, which can run
# arbitrary code -- only load files produced by a trusted step of this project.
train_image_paths = np.load(
    fetching_path.joinpath("train_image_paths.npy"),
    allow_pickle=True,
)
test_image_paths = np.load(
    fetching_path.joinpath("test_image_paths.npy"),
    allow_pickle=True,
)

In [12]:
# Sanity check: number of train and test image paths that were loaded.
print(f"{len(train_image_paths)} {len(test_image_paths)}")

100 50


In [13]:
# Deprecated: this version took an extremely long time to run and caused
# out-of-memory (OOM) issues. It collects every preprocessed image in a single
# in-memory dict (`features`) before returning it.
# The code is kept only for reference inside a string literal, so the function
# below is never actually defined or executed.
'''def preprocess_images_inceptionV3(image_paths, batch_size = 32, show_progress = True):
    features = {}
    index = 0
    
    while index + batch_size < len(image_paths):
        
        if show_progress:
            print('extracted ' + str(index) + ' image features out of ' + str(len(image_paths)))
            
        batch = image_paths[index:index+batch_size]
        img_batch = preprocess_image_batch(batch)
        #encoded_batch = model.predict(img_batch, batch_size = batch_size)
        
        for num, path in enumerate(batch):
            imageId = path[path.index('ROCO_'):-4]
            features[imageId] = img_batch[num]
        
        index += batch_size
    
    batch = image_paths[index:]
    img_batch = preprocess_image_batch(batch)
    for num, path in enumerate(batch):
        imageId = path[path.index('ROCO_'):-4]
        features[imageId] = img_batch[num]
    
    return features'''

In [14]:
# Preprocess image batch so that it can be input into InceptionV3.
def preprocess_image_batch(images_path, target_size = (299, 299)):
    """Load images from disk and run InceptionV3's `preprocess_input` on them.

    Args:
        images_path: Sequence of image file paths (anything accepted by
            `image.load_img`).
        target_size: `(height, width)` every image is resized to while loading.
            Defaults to `(299, 299)`, the size this notebook uses for InceptionV3.

    Returns:
        A NumPy array of shape `(len(images_path), height, width, 3)` holding the
        result of `preprocess_input` for every image, in input order. An empty
        input yields an empty array with the same trailing dimensions.
    """
    height, width = target_size

    preprocessed = []
    for image_path in images_path:
        img = image.load_img(image_path, target_size = (height, width))
        # `preprocess_input` accepts a single 3D image, so no batch axis has to
        # be added and removed again for each image.
        preprocessed.append(preprocess_input(image.img_to_array(img)))

    # The explicit reshape keeps the 4D layout even when `images_path` is empty.
    return np.array(preprocessed).reshape((len(images_path), height, width, 3))

In [31]:
def preprocess_image_batch_inceptionV3(image_paths, file_prefix, batch_size = 32, show_progress = True):
    """Preprocess images batch by batch, saving every batch to its own `.npy` file.

    Each batch is turned into a dict that maps an image id to its preprocessed
    array and is written to
    `preprocessed_image_path / (file_prefix + <batch number> + '.npy')`.
    A full batch whose file already exists is skipped, so an interrupted run
    (e.g. a Colab timeout) can be resumed without redoing finished batches.
    The trailing batch is always recomputed because its features are the
    return value.

    Args:
        image_paths: Sequence of image path strings. Each path must contain
            'ROCO_'; the id is the slice from there up to (excluding) the last
            four characters -- presumably a 3-letter extension such as '.jpg'.
        file_prefix: File name prefix for the saved batches.
        batch_size: Number of images per batch; must be at least 1.
        show_progress: When True, print progress and timing after each
            processed batch.

    Returns:
        The `{image_id: preprocessed_array}` dict of the final batch only.
        The earlier batches are available solely through the saved files
        (dicts saved with `np.save` need `allow_pickle=True` to be loaded).

    Raises:
        ValueError: If `batch_size` is smaller than 1 (the loop would never
            advance), or if a path does not contain 'ROCO_'.
    """
    if batch_size < 1:
        raise ValueError('batch_size must be at least 1, got ' + str(batch_size))

    def _batch_file(batch_number):
        # `np.save` appends '.npy' by itself; spelling it out here lets the same
        # path be used for the existence check and for saving.
        return preprocessed_image_path / (file_prefix + str(batch_number) + '.npy')

    def _extract_features(batch):
        # Map every image id in the batch to its preprocessed array.
        img_batch = preprocess_image_batch(batch)
        return {
            path[path.index('ROCO_'):-4]: img_batch[num]
            for num, path in enumerate(batch)
        }

    # Saving fails if the output folder is missing, so make sure it exists.
    preprocessed_image_path.mkdir(parents = True, exist_ok = True)

    index = 0
    batch_count = 0
    while index + batch_size < len(image_paths):
        batch_file = _batch_file(batch_count)

        # Preprocessing is slow, so a batch that was already saved is not redone.
        # The check is an exact path lookup: 'batch_1' must not match 'batch_10'.
        if batch_file.exists():
            print('skipped file ' + str(batch_file) + ' because it already exists')
        else:
            gc.collect()
            start = time.time()

            features = _extract_features(image_paths[index:index+batch_size])
            np.save(batch_file, features)

            if show_progress:
                print('Preprocessed ' + str(index + batch_size) + ' images out of ' + str(len(image_paths)))
                print('Took ' + str(time.time() - start) + ' seconds to process this batch')

        index += batch_size
        batch_count += 1

    # Remaining images: between 1 and `batch_size` of them (none only when
    # `image_paths` itself is empty).
    features = _extract_features(image_paths[index:])
    np.save(_batch_file(batch_count), features)
    return features

In [30]:
# Heads-up: this is slow on a full dataset -- the original author reported
# around 3 hours on their personal setup.
print('--- Preprocessing Train Images ---')
sample_train_images_preprocessed = preprocess_image_batch_inceptionV3(
    train_image_paths,
    file_prefix = 'train_images_preprocessed_batch_',
    batch_size = 10,
)

print('--- Preprocessing Test Images ---')
sample_test_images_preprocessed = preprocess_image_batch_inceptionV3(
    test_image_paths,
    file_prefix = 'test_images_preprocessed_batch_',
    batch_size = 10,
)


--- Preprocessing Train Images ---
Preprocessed 10 images out of 100
Took 0.09227919578552246 seconds to process this batch
Preprocessed 20 images out of 100
Took 0.08899879455566406 seconds to process this batch
Preprocessed 30 images out of 100
Took 0.11802840232849121 seconds to process this batch
Preprocessed 40 images out of 100
Took 0.0854032039642334 seconds to process this batch
Preprocessed 50 images out of 100
Took 0.09400343894958496 seconds to process this batch
Preprocessed 60 images out of 100
Took 0.10695528984069824 seconds to process this batch
Preprocessed 70 images out of 100
Took 0.09799742698669434 seconds to process this batch
Preprocessed 80 images out of 100
Took 0.09999752044677734 seconds to process this batch
Preprocessed 90 images out of 100
Took 0.07800102233886719 seconds to process this batch
--- Preprocessing Test Images ---
Preprocessed 10 images out of 50
Took 0.10400581359863281 seconds to process this batch
Preprocessed 20 images out of 50
Took 0.092