In [0]:
### Author : Narayan Parthasarathy & Venkat Mohanram

In [0]:
## IMPORT NECESSARY PACKAGES

import os
from os import listdir
from pickle import dump
from keras.preprocessing.image import load_img
from keras.preprocessing.image import img_to_array

from keras.applications.vgg16 import VGG16
from keras.applications.vgg16 import preprocess_input as vgg16_ppi

from keras.applications.inception_v3 import InceptionV3
from keras.applications.inception_v3 import preprocess_input as inception_ppi

from keras.applications.xception import Xception
from keras.applications.xception import preprocess_input as xception_ppi

import pandas as pd
import numpy as np
from keras.models import Model
from tqdm import tqdm

Using TensorFlow backend.


In [0]:
!pip install rarfile
import rarfile



In [0]:
# Mount drive
from google.colab import drive
drive.mount('/content/drive/')

Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).


In [0]:
# SET THE PATHS FOR FILES
## SET FOLDER PATHS --> ## Change these paths to suit your local / google drive 
project_path  = '/content/drive/My Drive/CAPSTONE/'
base_image_folder = '/content/drive/My Drive/CAPSTONE/Datasets/Master/Master/'
image_folder = '/opt/bin/CAPSTONE/Datasets/Master/Master/'
features_path = '/content/drive/My Drive/CAPSTONE/Features/'
feature_file_vgg16 = features_path + 'features_vgg16_master_1.pkl'
feature_file_inception = features_path + 'features_incep_master_1.pkl'
feature_file_xception = features_path + 'features_xcep_master_1.pkl'

In [0]:
os.chdir(base_image_folder)
#rar = rarfile.RarFile('Master.rar')
#rar.extractall()
os.chdir(project_path)

In [0]:
# extract features from each image using Xception model. Return is a features object. Same can be dumped to file in a separate call.
# The target size of VGG16 is 224X224
# The target size of Xception is 299X299
# The target size of Xception is 299X299

def extract_features(directory,model_name):
    # load the model
    if model_name == 'VGG16':
        model = VGG16()
    elif model_name == 'INCEPTIONV3' :
        model = InceptionV3()
    elif model_name == 'XCEPTION' :
        model = Xception()
    
    # re-structure the model
    model.layers.pop()
    model = Model(inputs=model.inputs, outputs=model.layers[-1].output)
    # summarize
    print(model.summary())
    # extract features from each photo
    features = dict()
    for name in listdir(directory):
        # load an image from file
        filename = directory + name
        if model_name == 'VGG16':
            image = load_img(filename, target_size=(224, 224))
        elif model_name == 'INCEPTIONV3' :
            image = load_img(filename, target_size=(299, 299))
        elif model_name == 'XCEPTION' :
            image = load_img(filename, target_size=(299, 299))
        
        # convert the image pixels to a numpy array
        image = img_to_array(image)
        # reshape data for the model
        image = image.reshape((1, image.shape[0], image.shape[1], image.shape[2]))
        # prepare the image for the VGG model
        if model_name == 'VGG16':
            image = vgg16_ppi(image)
        elif model_name == 'INCEPTIONV3' :
            image = inception_ppi(image)
        elif model_name == 'XCEPTION' :
            image = xception_ppi(image)
            
        # get features
        feature = model.predict(image, verbose=0)
        # get image id
        image_id = name.split('.')[0]
        # store feature
        features[image_id] = feature
        #print('>%s' % name)
    return features

In [0]:
# extract features from each image using Xception model. Return is a features object. Same can be dumped to file in a separate call.
# The target size of VGG16 is 224X224
# The target size of Xception is 299X299
# The target size of Xception is 299X299

def extract_features_1(directory,model_name):
    # load the model
    if model_name == 'VGG16':
        model = VGG16()
    elif model_name == 'INCEPTIONV3' :
        model = InceptionV3()
    elif model_name == 'XCEPTION' :
        model = Xception()
    
    # re-structure the model
    model.layers.pop()
    model = Model(inputs=model.inputs, outputs=model.layers[-1].output)
    # summarize
    print(model.summary())
    # extract features from each photo
    features = dict()
    for name in tqdm(listdir(directory)):
        # load an image from file
        filename = directory + name
        try:
          if model_name == 'VGG16':
            image = load_img(filename, target_size=(224, 224))
          elif model_name == 'INCEPTIONV3' :
            image = load_img(filename, target_size=(299, 299))
          elif model_name == 'XCEPTION' :
            image = load_img(filename, target_size=(299, 299))
        
          # convert the image pixels to a numpy array
          image = img_to_array(image)
          # reshape data for the model
          image = image.reshape((1, image.shape[0], image.shape[1], image.shape[2]))
          # prepare the image for the VGG model
          if model_name == 'VGG16':
            image = vgg16_ppi(image)
          elif model_name == 'INCEPTIONV3' :
            image = inception_ppi(image)
          elif model_name == 'XCEPTION' :
            image = xception_ppi(image)
            
          # get features
          feature = model.predict(image, verbose=0)
          # get image id
          image_id = name.split('.')[0]
          # store feature
          features[image_id] = feature
          #print('>%s' % name)
        except OSError as ose:
          notanimg = open(project_path + 'tmp/' + model_name + '_not_an_image.txt', 'a')
          notanimg.write(name+'\n')
    return features

In [0]:
# extract features from each image using Xception model. Return is a features object. Same can be dumped to file in a separate call.
# The target size of VGG16 is 224X224
# The target size of Xception is 299X299
# The target size of Xception is 299X299

def extract_features_2(directory,model_name):
    os.chdir(directory)
    # load the model
    if model_name == 'VGG16':
        model = VGG16()
    elif model_name == 'INCEPTIONV3' :
        model = InceptionV3()
    elif model_name == 'XCEPTION' :
        model = Xception()
    
    # re-structure the model
    model.layers.pop()
    model = Model(inputs=model.inputs, outputs=model.layers[-1].output)
    # summarize
    print(model.summary())
    # calculation for faster processing
    allfiles = listdir(directory)
    # specify the number of parts to split for multiple loops
    parts = 5
    totalfiles = len(allfiles)
    fsplit = round(totalfiles/parts)
    slist = []
    for i in range(parts):
      s=fsplit*(i+1)
      slist.append(s)
    start = 0
    fs = list()
    for end in slist:
      fs.append((start,end))
      start = end
    # extract features from each photo
    features = dict()
    # running parallel loop for sub feature extraction
    sub_features_1 = extract_sub_features(allfiles[fs[0][0]:fs[0][1]],model_name,model)
    sub_features_2 = extract_sub_features(allfiles[fs[1][0]:fs[1][1]],model_name,model)
    sub_features_3 = extract_sub_features(allfiles[fs[2][0]:fs[2][1]],model_name,model)
    sub_features_4 = extract_sub_features(allfiles[fs[3][0]:fs[3][1]],model_name,model)
    sub_features_5 = extract_sub_features(allfiles[fs[4][0]:fs[4][1]],model_name,model)
    features.update(sub_features_1)
    features.update(sub_features_2)
    features.update(sub_features_3)
    features.update(sub_features_4)
    features.update(sub_features_5)
    return features

In [0]:
def extract_sub_features(files,model_name,model):
  features = dict()
  for name in tqdm(files):
    # load an image from file
    #filename = name
    try:
      if model_name == 'VGG16':
        image = load_img(name, target_size=(224, 224))
      elif model_name == 'INCEPTIONV3' :
        image = load_img(name, target_size=(299, 299))
      elif model_name == 'XCEPTION' :
        image = load_img(name, target_size=(299, 299))
      # convert the image pixels to a numpy array
      image = img_to_array(image)
      # reshape data for the model
      image = image.reshape((1, image.shape[0], image.shape[1], image.shape[2]))
      # prepare the image for the VGG model
      if model_name == 'VGG16':
        image = vgg16_ppi(image)
      elif model_name == 'INCEPTIONV3' :
        image = inception_ppi(image)
      elif model_name == 'XCEPTION' :
        image = xception_ppi(image)
            
      # get features
      feature = model.predict(image, verbose=0)
      # get image id
      image_id = name.split('.')[0]
      # store feature
      features[image_id] = feature
      #print('>%s' % name)
    except OSError as ose:
      notanimg = open(project_path + 'tmp/' + model_name + '_not_an_image.txt', 'a')
      notanimg.write(name+'\n')
  return features

In [0]:
#listdir(image_folder)
!df -PTh

Filesystem     Type        Size  Used Avail Use% Mounted on
overlay        overlay      69G   36G   30G  55% /
tmpfs          tmpfs        64M     0   64M   0% /dev
tmpfs          tmpfs       6.4G     0  6.4G   0% /sys/fs/cgroup
tmpfs          tmpfs       6.4G   12K  6.4G   1% /var/colab
/dev/sda1      ext4         75G   45G   31G  60% /opt/bin
shm            tmpfs       5.9G  4.0K  5.9G   1% /dev/shm
tmpfs          tmpfs       6.4G     0  6.4G   0% /proc/acpi
tmpfs          tmpfs       6.4G     0  6.4G   0% /proc/scsi
tmpfs          tmpfs       6.4G     0  6.4G   0% /sys/firmware
drive          fuse.drive   15G  7.8G  7.3G  52% /content/drive


In [0]:
!cd /opt/bin ; mkdir -p CAPSTONE/Datasets/Master/Master/ 

In [0]:
!rsync -avzh /content/drive/'My Drive'/CAPSTONE/Datasets/Master/Master /opt/bin/CAPSTONE/Datasets/Master/.

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Master/fkr8k412056525_191724b058.jpg
Master/fkr8k412082368_371df946b3.jpg
Master/fkr8k412101267_7257e6d8c0.jpg
Master/fkr8k412203580_2c7278909c.jpg
Master/fkr8k413231421_43833a11f5.jpg
Master/fkr8k413737417_b0a8b445e9.jpg
Master/fkr8k414568315_5adcfc23c0.jpg
Master/fkr8k414773731_c3f5bf43d5.jpg
Master/fkr8k415118186_64defc96f3.jpg
Master/fkr8k415657941_454d370721.jpg
Master/fkr8k415793623_6c1225ae27.jpg
Master/fkr8k416106657_cab2a107a5.jpg
Master/fkr8k416650559_cd08d3cd96.jpg
Master/fkr8k416788726_5b4eb1466e.jpg
Master/fkr8k416960865_048fd3f294.jpg
Master/fkr8k416992999_654a99a903.jpg
Master/fkr8k417577408_eb571658c1.jpg
Master/fkr8k417966898_a04f9b5349.jpg
Master/fkr8k418357172_bdddf71d32.jpg
Master/fkr8k418616992_22090c6195.jpg
Master/fkr8k418667611_b9995000f4.jpg
Master/fkr8k418796494_bdb441de42.jpg
Master/fkr8k419116771_642800891d.jpg
Master/fkr8k41999070_838089137e.jpg
Master/fkr8k420355149_f2076770df.jpg
Master/fkr8

In [0]:
!df -PTh

Filesystem     Type        Size  Used Avail Use% Mounted on
overlay        overlay      69G   38G   28G  58% /
tmpfs          tmpfs        64M     0   64M   0% /dev
tmpfs          tmpfs       6.4G     0  6.4G   0% /sys/fs/cgroup
tmpfs          tmpfs       6.4G   12K  6.4G   1% /var/colab
/dev/sda1      ext4         75G   46G   29G  62% /opt/bin
shm            tmpfs       5.9G  4.0K  5.9G   1% /dev/shm
tmpfs          tmpfs       6.4G     0  6.4G   0% /proc/acpi
tmpfs          tmpfs       6.4G     0  6.4G   0% /proc/scsi
tmpfs          tmpfs       6.4G     0  6.4G   0% /sys/firmware
drive          fuse.drive   15G  8.7G  6.4G  58% /content/drive


In [0]:
len(listdir(image_folder))

28075

In [0]:
df = pd.DataFrame(listdir(image_folder))

In [0]:
df.count

<bound method DataFrame.count of                                     0
0      fkr8k3459156091_c1879ebe28.jpg
1                fkr30k2570026554.jpg
2              mscoco000000222500.jpg
3             GCC2001291047028341.jpg
4             GCC2001300707374009.jpg
...                               ...
28070   fkr8k281419391_522557ce27.jpg
28071            fkr30k2420516613.jpg
28072            fkr30k2268801262.jpg
28073            fkr30k2354405491.jpg
28074          mscoco000000203345.jpg

[28075 rows x 1 columns]>

In [0]:
# extract features from all images
features_vgg16 = extract_features_1(image_folder,'VGG16')
print('Extracted Features: %d' % len(features_vgg16))
# save to file
dump(features_vgg16, open(feature_file_vgg16, 'wb'))

  0%|          | 0/28075 [00:00<?, ?it/s]

Model: "model_4"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_4 (InputLayer)         (None, 224, 224, 3)       0         
_________________________________________________________________
block1_conv1 (Conv2D)        (None, 224, 224, 64)      1792      
_________________________________________________________________
block1_conv2 (Conv2D)        (None, 224, 224, 64)      36928     
_________________________________________________________________
block1_pool (MaxPooling2D)   (None, 112, 112, 64)      0         
_________________________________________________________________
block2_conv1 (Conv2D)        (None, 112, 112, 128)     73856     
_________________________________________________________________
block2_conv2 (Conv2D)        (None, 112, 112, 128)     147584    
_________________________________________________________________
block2_pool (MaxPooling2D)   (None, 56, 56, 128)       0   

100%|██████████| 28075/28075 [11:02<00:00, 42.35it/s]


Extracted Features: 28075


In [0]:
# extract features from all images
features_inceptionv3 = extract_features_1(image_folder,'INCEPTIONV3')
print('Extracted Features: %d' % len(features_inceptionv3))
# save to file
dump(features_inceptionv3, open(feature_file_inception, 'wb'))

  0%|          | 0/28075 [00:00<?, ?it/s]

Model: "model_5"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_5 (InputLayer)            (None, 299, 299, 3)  0                                            
__________________________________________________________________________________________________
conv2d_99 (Conv2D)              (None, 149, 149, 32) 864         input_5[0][0]                    
__________________________________________________________________________________________________
batch_normalization_99 (BatchNo (None, 149, 149, 32) 96          conv2d_99[0][0]                  
__________________________________________________________________________________________________
activation_95 (Activation)      (None, 149, 149, 32) 0           batch_normalization_99[0][0]     
____________________________________________________________________________________________

100%|██████████| 28075/28075 [15:23<00:00, 30.41it/s]


Extracted Features: 28075


In [0]:
# extract features from all images
features_xception = extract_features_1(image_folder,'XCEPTION')
print('Extracted Features: %d' % len(features_xception))
# save to file
dump(features_xception, open(feature_file_xception, 'wb'))

  0%|          | 0/28075 [00:00<?, ?it/s]

Model: "model_6"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_6 (InputLayer)            (None, 299, 299, 3)  0                                            
__________________________________________________________________________________________________
block1_conv1 (Conv2D)           (None, 149, 149, 32) 864         input_6[0][0]                    
__________________________________________________________________________________________________
block1_conv1_bn (BatchNormaliza (None, 149, 149, 32) 128         block1_conv1[0][0]               
__________________________________________________________________________________________________
block1_conv1_act (Activation)   (None, 149, 149, 32) 0           block1_conv1_bn[0][0]            
____________________________________________________________________________________________

100%|██████████| 28075/28075 [12:20<00:00, 39.15it/s]


Extracted Features: 28075
