This notebook preprocesses the images and then applies the necessary feature extraction techniques.
After that, the clustering models are applied.

In [2]:
import numpy as np
import os
import pandas as pd
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.preprocessing import image
from tensorflow.keras.applications.vgg16 import VGG16
from tensorflow.keras.applications.vgg19 import VGG19
from tensorflow.keras.applications.resnet50 import ResNet50
from tensorflow.keras.applications.xception import Xception
from tensorflow.keras.applications.inception_v3 import InceptionV3
from tensorflow.keras.applications.inception_resnet_v2 import InceptionResNetV2
from tensorflow.keras.applications.densenet import DenseNet201
from tensorflow.keras.models import Model

from tensorflow.keras.applications.vgg16 import preprocess_input

from sklearn.cluster import KMeans,AgglomerativeClustering,DBSCAN,OPTICS,MeanShift
import os, shutil, glob, os.path
from PIL import Image as pil_image
from matplotlib import pyplot as plt
from pylab import *

import sklearn
from sklearn.manifold import TSNE
import time

from sklearn.decomposition import PCA 

from sklearn.metrics import pairwise_distances
from sklearn import metrics
from sklearn.metrics import pairwise_distances
from sklearn.metrics import davies_bouldin_score


import warnings
warnings.filterwarnings('ignore')

In [2]:

%%capture
from tqdm import tqdm_notebook as tqdm
tqdm().pandas()

Transfer learning is adaptation rather than creation: instead of building a model from scratch, a pre-trained model is adapted to a new problem. When the dataset is too small to train a deep-learning model from scratch, transfer learning is the option for extracting features automatically: we take advantage of the feature maps that a large model has already learned on a large dataset, without having to repeat that training. In general, the features can be extracted in the following two ways:

# CASE-1


Extract the features from each image with the pretrained models by applying a pooling operation to the output of the
last convolution layer. See `model_vgg_16.summary()`: the `(None, None, 512)` output is pooled.

In [None]:


# Case 1: ImageNet-pretrained networks without their classifier head
# (include_top=False), with average pooling applied to the final output.
_pooled_base_kwargs = dict(weights='imagenet', include_top=False, pooling='avg')

model_vgg_16 = VGG16(**_pooled_base_kwargs)
model_vgg_19 = VGG19(**_pooled_base_kwargs)
model_resnet50 = ResNet50(**_pooled_base_kwargs)
model_inceptionv3 = InceptionV3(**_pooled_base_kwargs)

In [9]:
 # Print the layer table of the pooled VGG16 base built for CASE-1.
 model_vgg_16.summary()

Model: "vgg16"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_4 (InputLayer)         [(None, None, None, 3)]   0         
_________________________________________________________________
block1_conv1 (Conv2D)        (None, None, None, 64)    1792      
_________________________________________________________________
block1_conv2 (Conv2D)        (None, None, None, 64)    36928     
_________________________________________________________________
block1_pool (MaxPooling2D)   (None, None, None, 64)    0         
_________________________________________________________________
block2_conv1 (Conv2D)        (None, None, None, 128)   73856     
_________________________________________________________________
block2_conv2 (Conv2D)        (None, None, None, 128)   147584    
_________________________________________________________________
block2_pool (MaxPooling2D)   (None, None, None, 128)   0     

# CASE-2

The last few layers of a model such as VGG16 are fully connected layers that precede the output layer. These layers provide a complex set of features describing a given input image and may be useful input when training a new model for image classification or a related computer-vision task.

In [None]:
def _penultimate_layer_extractor(architecture):
    """Build ``architecture()`` with its defaults and cut it before its last layer.

    The returned model shares the full network's inputs and outputs the
    activations of ``layers[-2]``, i.e. the layer immediately preceding the
    final (classification) layer.

    The earlier ``model.layers.pop()`` calls were dropped: in tf.keras,
    ``Model.layers`` is a property that returns a newly built list on every
    access, so popping from it never changed the model. The layer that is
    actually used has always been selected by the ``layers[-2]`` lookup below.
    """
    full_model = architecture()
    return Model(inputs=full_model.inputs, outputs=full_model.layers[-2].output)


# Case 2: features taken from the layer just before the classifier output.
# NOTE(review): for the VGG models this is a fully connected layer; for
# ResNet50 / InceptionV3 it is presumably the global pooling layer - confirm
# with .summary().
model_vgg_16_fcn = _penultimate_layer_extractor(VGG16)
model_vgg_19_fcn = _penultimate_layer_extractor(VGG19)
model_resnet50_fcn = _penultimate_layer_extractor(ResNet50)
model_inceptionV3_fcn = _penultimate_layer_extractor(InceptionV3)




In [33]:
# Print the layer table of the truncated VGG19 feature extractor built for CASE-2.
model_vgg_19_fcn.summary()

Model: "model_9"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_16 (InputLayer)        [(None, 224, 224, 3)]     0         
_________________________________________________________________
block1_conv1 (Conv2D)        (None, 224, 224, 64)      1792      
_________________________________________________________________
block1_conv2 (Conv2D)        (None, 224, 224, 64)      36928     
_________________________________________________________________
block1_pool (MaxPooling2D)   (None, 112, 112, 64)      0         
_________________________________________________________________
block2_conv1 (Conv2D)        (None, 112, 112, 128)     73856     
_________________________________________________________________
block2_conv2 (Conv2D)        (None, 112, 112, 128)     147584    
_________________________________________________________________
block2_pool (MaxPooling2D)   (None, 56, 56, 128)       0   

Loading the filepaths here

In [34]:
def loadFilePaths(image_directory):
    """Return the paths of the regular files directly inside ``image_directory``.

    Parameters
    ----------
    image_directory : str
        Directory to scan (not recursive).

    Returns
    -------
    list of str
        ``os.path.join(image_directory, name)`` for every regular file, in
        sorted name order. ``os.listdir`` gives no ordering guarantee, so the
        sort makes the result reproducible between runs. Sub-directories are
        skipped because they cannot be opened as images.
    """
    candidates = (
        os.path.join(image_directory, entry)
        for entry in sorted(os.listdir(image_directory))
    )
    return [path for path in candidates if os.path.isfile(path)]


In [35]:
def visualizeDataset2(files_path):
    """Draw every image in ``files_path`` on a three-column grid.

    Parameters
    ----------
    files_path : list of str
        Paths of the images to display, drawn in list order.

    Axes are hidden and the vertical spacing between subplot rows is set to
    zero. The grid has just enough rows to hold all images; the previous
    version asked for ``len(files_path)`` rows, which left most of the figure
    empty.
    """
    number_of_columns = 3
    # Ceiling division without importing math.
    number_of_rows = -(-len(files_path) // number_of_columns)

    plt.subplots_adjust(hspace=0.000)
    for position, path in enumerate(files_path, start=1):
        picture = pil_image.open(path)
        ax1 = plt.subplot(number_of_rows, number_of_columns, position)
        ax1.axis('off')
        ax1.imshow(picture, cmap="gray", aspect="auto")

In [36]:
def visualizeDataset1(file_paths, max_images=12, rows=2):
    """Preview the first ``max_images`` images on a grid with ``rows`` rows.

    Parameters
    ----------
    file_paths : list of str
        Image paths; only the first ``max_images`` entries are shown.
    max_images : int, optional
        Maximum number of images to draw (default 12).
    rows : int, optional
        Number of grid rows (default 2, giving the original 2 x 6 layout).
    """
    # Ceiling division so that the grid always has room for max_images cells.
    columns = -(-max_images // rows)
    # tqdm wraps the list itself (not the enumerate object) so that the bar
    # knows how many items to expect.
    for position, path in enumerate(tqdm(file_paths[:max_images]), start=1):
        img = pil_image.open(path)
        plt.subplot(rows, columns, position)
        plt.axis('off')
        plt.imshow(img)

Extract the features with a pretrained model; this is done for each pretrained model.

In [93]:
 # the function is used to calculate the features from the image 
def getFeatures(filelist, model, target_size=(224, 224), preprocess=None):
    """Run every image in ``filelist`` through ``model`` and collect the outputs.

    Parameters
    ----------
    filelist : list of str
        Image paths. The list is sorted IN PLACE before processing; this side
        effect is kept from the original so that the caller's list stays
        aligned index-for-index with the returned features.
    model : object
        Anything exposing ``predict(batch)``; it is called once per image with
        a batch of size one.
    target_size : tuple of int, optional
        Size passed to ``image.load_img``. Defaults to ``(224, 224)``.
    preprocess : callable, optional
        Applied to the image batch before prediction. When ``None`` the
        notebook-level ``preprocess_input`` is used, which the imports cell
        takes from ``tensorflow.keras.applications.vgg16``. Pass the matching
        function when extracting with a different architecture.

    Returns
    -------
    list of numpy.ndarray
        One flattened prediction per image, in sorted path order.
    """
    if preprocess is None:
        preprocess = preprocess_input

    filelist.sort()
    total = len(filelist)
    featurelist = []
    for position, imagepath in enumerate(filelist, start=1):
        # 1-based counter so that the last status line reads "total / total".
        print(" Status: %s / %s" % (position, total), end="\r")
        img = image.load_img(imagepath, target_size=target_size)
        img_data = image.img_to_array(img)
        img_data = np.expand_dims(img_data, axis=0)  # add the batch axis
        img_data = preprocess(img_data)
        features = np.array(model.predict(img_data))
        featurelist.append(features.flatten())
    if total:
        # Finish the carriage-return status line so that later output starts
        # on a fresh line instead of overwriting it.
        print()
    return featurelist



In [6]:
def saveFeatures(features, modelname, filename, output_dir='Image_features_pca'):
    """Save ``features`` as ``<output_dir>/<filename>_<modelname>.npy``.

    Parameters
    ----------
    features : array-like
        Data handed to ``np.save``.
    modelname : str
        Second part of the file name.
    filename : str
        First part of the file name.
    output_dir : str, optional
        Destination directory. Defaults to ``'Image_features_pca'`` (the
        previously hard-coded value). It is created when missing, so the call
        no longer fails on a fresh checkout.
    """
    os.makedirs(output_dir, exist_ok=True)
    saved_filename = os.path.join(output_dir, filename + '_' + modelname + '.npy')

    print("saving", saved_filename)
    np.save(saved_filename, features)

In [95]:
# Gather the image paths of the two datasets (ideology first, then muslim).
ideology_files_path, muslim_files_path = (
    loadFilePaths(dataset_directory)
    for dataset_directory in ('ideology_image_dataset/', 'muslim_image_dataset/')
)


In [102]:

# Extract features for both datasets with the truncated InceptionV3 model.
# NOTE(review): getFeatures() loads images at 224x224 and applies the
# `preprocess_input` imported from the VGG16 module. InceptionV3 has its own
# `preprocess_input` and a 299x299 default input size, so these features are
# presumably computed on inputs the network was not trained for - confirm
# whether this is intended.
ideology_features=getFeatures(ideology_files_path,model_inceptionV3_fcn)
muslim_features=getFeatures(muslim_files_path,model_inceptionV3_fcn)


 Status: 241 / 24242

In [104]:
# Persist the raw (pre-PCA) features under Image_features/. That is the
# directory the dimensionality-reduction loop further down reads from, and the
# location shown in this cell's recorded output. saveFeatures() writes to
# Image_features_pca/ by default, so routing these arrays through it would
# leave Image_features/ empty after a fresh Restart & Run All.
os.makedirs('Image_features', exist_ok=True)
for _dataset_name, _dataset_features in (('ideology', ideology_features),
                                         ('muslim', muslim_features)):
    _raw_path = os.path.join('Image_features', _dataset_name + '_model_inceptionV3_fcn.npy')
    print("saving", _raw_path)
    np.save(_raw_path, _dataset_features)

saving Image_features/ideology_model_inceptionV3_fcn.npy
saving Image_features/muslim_model_inceptionV3_fcn.npy


# Dimensionality Reduction

PCA is used for dimensionality reduction; clustering generally performs better on lower-dimensional data.

In [4]:

def pca_transform(train_data, n_components=100):
    """Project ``train_data`` onto its leading principal components.

    Parameters
    ----------
    train_data : array-like, 2-D
        One row per sample, one column per feature.
    n_components : int, optional
        Requested number of components (default 100, the previously
        hard-coded value). It is capped at ``min(n_samples, n_features)``,
        because PCA cannot produce more components than that and would raise
        on a small dataset.

    Returns
    -------
    numpy.ndarray
        The output of ``PCA.fit_transform`` (fixed ``random_state=728``).
    """
    data = np.asarray(train_data)
    n_samples, n_features = data.shape
    # np.min is used rather than the builtin: this notebook runs
    # `from pylab import *`, which may rebind `min` depending on the
    # matplotlib version.
    n_components = int(np.min([n_components, n_samples, n_features]))
    pca = PCA(n_components=n_components, random_state=728)
    return pca.fit_transform(data)


In [8]:
# Reduce every saved raw feature matrix with PCA and store the result through
# saveFeatures(). Entries are visited in sorted order so that the printed log
# is reproducible between runs.
for file in sorted(os.listdir('Image_features/')):
    if not file.endswith('.npy'):
        # Only saved arrays can be loaded; skip anything else in the folder.
        continue
    print(file)
    features=np.load(os.path.join('Image_features',file))
    features_pca=pca_transform(features)
    # Model tag: file name up to its first dot plus '_pca'.
    # Dataset tag: file name up to its first underscore.
    saveFeatures(features_pca,file.split('.')[0]+'_pca',file.split('_')[0])