## Clustering YouTube Thumbnails

Source: https://towardsdatascience.com/how-to-cluster-images-based-on-visual-similarity-cd6e7209fe34

In [1]:
# for loading/processing the images  
from keras.preprocessing.image import load_img 
from keras.preprocessing.image import img_to_array 
from keras.applications.vgg16 import preprocess_input 

# models 
from keras.applications.vgg16 import VGG16 
from keras.models import Model

# clustering and dimension reduction
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA

# for everything else
import os
import numpy as np
import matplotlib.pyplot as plt
from random import randint
import pandas as pd
import pickle

### Loading Data

In [17]:
# directory holding the thumbnails; it is also made the working directory
# because the images are later loaded by bare filename
path = '/Users/vidya/MRDVGroupProjectCSPB4502/src/Vidya/images'
os.chdir(path)

# os.scandir yields entries in arbitrary order, so the names are sorted to keep
# the filename order (and the feature rows built from it) reproducible
with os.scandir(path) as files:
    # keep only the .jpg entries
    thumbnail_filenames = sorted(
        file.name for file in files if file.name.endswith('.jpg')
    )

### Preprocess images and extract features with the VGG16 model

In [18]:
# build the feature extractor once, so it can be passed to extract_features
vgg16_full = VGG16()
# expose the output of the second-to-last layer instead of the final layer
model = Model(inputs=vgg16_full.inputs, outputs=vgg16_full.layers[-2].output)

def extract_features(file, model):
    """Run a single image through ``model`` and return the prediction.

    Parameters
    ----------
    file
        Path of the image to load; passed straight to ``load_img``.
    model
        Object exposing ``predict``; called with a batch holding one image.

    Returns
    -------
    The value returned by ``model.predict`` for that one-image batch.
    """
    # load the image resized to 224x224
    img = load_img(file, target_size=(224, 224))
    # convert from 'PIL.Image.Image' to numpy array
    img = np.array(img)
    # add the leading batch axis: (num_of_samples, dim 1, dim 2, channels)
    reshaped_img = img.reshape(1, 224, 224, 3)
    # apply the VGG16 input preprocessing
    imgx = preprocess_input(reshaped_img)
    # `use_multiprocessing` is intentionally not passed: it only concerns
    # generator / Sequence inputs and newer Keras releases reject the keyword
    features = model.predict(imgx)
    return features

Downloading data from https://storage.googleapis.com/tensorflow/keras-applications/vgg16/vgg16_weights_tf_dim_ordering_tf_kernels.h5


### Run the model on the images

In [None]:
# maps image filename -> feature array returned by extract_features
data = {}
p = r'/Users/vidya/MRDVGroupProjectCSPB4502/src/Vidya/model/features.pkl'
# filenames whose extraction raised, so gaps in `data` are visible afterwards
failed = []

# loop through each image in the dataset
for i in thumbnail_filenames:
    # try to extract the features and update the dictionary
    try:
        feat = extract_features(i, model)
        data[i] = feat
    # catch Exception rather than using a bare except, so KeyboardInterrupt /
    # SystemExit still stop the loop instead of being swallowed
    except Exception as err:
        failed.append(i)
        print(f'skipping {i}: {err!r}')
        # checkpoint everything extracted so far
        # NOTE: this is the only place the features are written to disk; a run
        # with no failures leaves them in memory only
        with open(p, 'wb') as file:
            pickle.dump(data, file)

In [None]:
# array of the dictionary keys (image filenames), in insertion order
filenames = np.array(list(data))

In [None]:
# stack the per-image feature arrays (the dictionary values) into one array
feat = np.array([*data.values()])
feat.shape

In [None]:
# flatten to one row of 4096 values per image: (n_samples, 4096)
feat = np.reshape(feat, (-1, 4096))
feat.shape