In [1]:
# for loading/processing the images
# from keras.preprocessing.image import load_img
# from keras.preprocessing.image import img_to_array
from keras.applications.vgg16 import preprocess_input
from tensorflow.keras.utils  import load_img
from tensorflow.keras.utils import img_to_array

# models
from keras.applications.vgg16 import VGG16
from keras.models import Model

# clustering and dimension reduction
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA

# for everything else
import os
import numpy as np
import matplotlib.pyplot as plt
from random import randint
import pandas as pd
import pickle

2023-09-23 15:40:55.115419: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2023-09-23 15:40:55.152867: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2023-09-23 15:40:55.395830: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2023-09-23 15:40:55.397939: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [22]:
# Instantiate the full VGG16 network using the constructor's default arguments.
model = VGG16()
# Re-wrap the network so that its output is the activation of the
# second-to-last layer (layers[-2]); the truncated network replaces `model`
# and is what the feature-extraction helpers receive later on.
model = Model(inputs = model.inputs, outputs = model.layers[-2].output)

In [26]:
def get_labels_and_paths(input_folder):
    """Return ``(label, path)`` pairs for every ``.jpg`` file below ``input_folder``.

    The label of an image is the name of the first-level sub-directory of
    ``input_folder`` that contains it (directly or through nested folders),
    e.g. ``input/Basmati/x.jpg`` -> ``'Basmati'``.

    Parameters
    ----------
    input_folder : str
        Root directory to scan recursively.

    Returns
    -------
    list[tuple[str, str]]
        ``(label, file path)`` tuples in ``os.walk`` order. Paths are built by
        joining the walked directory and the file name, so they start with
        ``input_folder`` as given.

    Notes
    -----
    Files located directly in ``input_folder`` have no class directory and are
    skipped. Only the exact, lower-case ``.jpg`` extension is matched.
    """
    labelled_paths = []
    for dirpath, _dirnames, filenames in os.walk(input_folder):
        # Path of the current directory relative to the scanned root; its first
        # component is the class folder, whatever the root's own depth is.
        relative_dir = os.path.relpath(dirpath, input_folder)
        if relative_dir == os.curdir:
            # The root itself: files here do not belong to any class folder.
            continue
        label = relative_dir.split(os.sep)[0]
        for filename in filenames:
            if os.path.splitext(filename)[1] == '.jpg':
                labelled_paths.append((label, os.path.join(dirpath, filename)))
    return labelled_paths


def extract_features(file, model):
    """Compute the feature output of ``model`` for a single image file.

    Parameters
    ----------
    file : str
        Path of the image to load.
    model : object with a ``predict`` method
        Called with a preprocessed batch of shape ``(1, 224, 224, 3)``.

    Returns
    -------
    Whatever ``model.predict`` returns for that single-image batch.
    """
    # load the image resized to 224x224
    img = load_img(file, target_size=(224,224))
    # convert the image to a numpy array (load_img returns a 'PIL.Image.Image')
    img = np.array(img)
    # reshape to the layout the model needs: (num_of_samples, dim 1, dim 2, channels)
    reshaped_img = img.reshape(1,224,224,3)
    # prepare the batch for the model (Keras VGG16 preprocessing function)
    imgx = preprocess_input(reshaped_img)
    # extract the features; `use_multiprocessing` is intentionally not passed:
    # it only applies to generator / Sequence inputs and Keras 3's predict()
    # no longer accepts it.
    features = model.predict(imgx)
    return features
    
def preprocess(names_and_paths, model):
    """Extract features for every labelled image and index them by path.

    Parameters
    ----------
    names_and_paths : iterable of (label, path) pairs
        Images to process, in the order they should be handled.
    model : object
        Forwarded unchanged to ``extract_features``.

    Returns
    -------
    dict
        Maps each image path to ``{'label': <label>, 'features': <features>}``.
        A progress line is printed after each image.
    """
    records = {}
    for label, path in names_and_paths:
        vector = extract_features(path, model)
        records[path] = dict(label=label, features=vector)
        print(f"Extracción features de {path}")
    return records


# function that lets you view a cluster (based on identifier)
def view_cluster(gps, cluster, max_images=10):
    """Plot the images of one cluster side by side in a single row.

    Parameters
    ----------
    gps : sliceable sequence of str
        Paths of the image files that belong to the cluster.
    cluster : any
        Cluster identifier; only used in the subplot titles.
    max_images : int, optional
        Maximum number of images to draw, which is also the number of columns
        of the subplot row. Defaults to 10.

    Raises
    ------
    ValueError
        If ``max_images`` is smaller than 1.
    """
    if max_images < 1:
        raise ValueError("max_images must be at least 1")
    plt.figure(figsize=(25, 25))
    # the list of filenames for the cluster
    files = gps
    # only allow up to `max_images` images to be shown at a time
    if len(files) > max_images:
        print(f"Clipping cluster size from {len(files)} to {max_images}")
        # keep exactly `max_images` files so the number drawn matches the message
        files = files[:max_images]
    # plot each image in the cluster
    for index, file in enumerate(files):
        plt.subplot(1, max_images, index + 1)
        img = np.array(load_img(file))
        plt.imshow(img)
        plt.title('Cluster n:' + str(cluster))
        plt.axis('off')




In [5]:
# Gather the (label, image path) pairs for the .jpg files under the 'input' folder.
names_and_paths = get_labels_and_paths(input_folder='input')

In [27]:
# Run every listed image through the truncated VGG16 model. The result maps each
# image path to a dict holding its 'label' and its extracted 'features'.
data_dict = preprocess(names_and_paths=names_and_paths, model=model)

Extracción features de input/Basmati/basmati (383).jpg
Extracción features de input/Basmati/basmati (3937).jpg
Extracción features de input/Basmati/basmati (12949).jpg
Extracción features de input/Basmati/basmati (7857).jpg
Extracción features de input/Basmati/basmati (8806).jpg
Extracción features de input/Basmati/basmati (7474).jpg
Extracción features de input/Basmati/basmati (2413).jpg
Extracción features de input/Basmati/basmati (12136).jpg
Extracción features de input/Basmati/basmati (1485).jpg
Extracción features de input/Basmati/basmati (5993).jpg
Extracción features de input/Basmati/basmati (10158).jpg
Extracción features de input/Basmati/basmati (7797).jpg
Extracción features de input/Basmati/basmati (12765).jpg
Extracción features de input/Basmati/basmati (12470).jpg
Extracción features de input/Basmati/basmati (3643).jpg
Extracción features de input/Basmati/basmati (8945).jpg
Extracción features de input/Basmati/basmati (10433).jpg
Extracción features de input/Basmati/basmat

In [None]:
# Collect the label and the features of every processed image in the insertion
# order of data_dict, so labels[i] corresponds to features[i].
labels = [entry['label'] for entry in data_dict.values()]
# Each entry's 'features' is the predict output for a single-image batch. Stack
# them into one array and flatten everything but the first axis, giving a 2-D
# (n_images, n_features) matrix (a plain list has no .shape attribute).
features = np.array([entry['features'] for entry in data_dict.values()]).reshape(len(data_dict), -1)
features.shape

In [None]:
# reduce the amount of dimensions in the feature vector
# scikit-learn estimators need a 2-D (n_samples, n_features) array. The stored
# per-image features come from single-image predict calls, so stacking them
# as-is would give an extra axis; flatten everything but the sample axis first.
# (This is a no-op when `features` is already a 2-D array.)
feature_matrix = np.asarray(features).reshape(len(features), -1)
pca = PCA(n_components=100, random_state=22)
pca.fit(feature_matrix)
x = pca.transform(feature_matrix)