In [2]:
# for loading/processing the images
# from keras.preprocessing.image import load_img
# from keras.preprocessing.image import img_to_array
from keras.applications.vgg16 import preprocess_input
from tensorflow.keras.utils  import load_img
from tensorflow.keras.utils import img_to_array

# models
from keras.applications.vgg16 import VGG16
from keras.models import Model

# clustering and dimension reduction
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA

# for everything else
import os
import numpy as np
import matplotlib.pyplot as plt
from random import randint
import pandas as pd
import pickle

2023-09-24 18:55:35.866387: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2023-09-24 18:55:35.902832: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2023-09-24 18:55:36.163092: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2023-09-24 18:55:36.165325: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [3]:
model = VGG16()
model = Model(inputs = model.inputs, outputs = model.layers[-2].output)

2023-09-24 18:55:40.853109: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:981] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
2023-09-24 18:55:40.853473: W tensorflow/core/common_runtime/gpu/gpu_device.cc:1960] Cannot dlopen some GPU libraries. Please make sure the missing libraries mentioned above are installed properly if you would like to use GPU. Follow the guide at https://www.tensorflow.org/install/gpu for how to download and setup the required libraries for your platform.
Skipping registering GPU devices...


In [4]:
def get_labels_and_paths(input_folder):
   return sorted([(dp.split("/")[1],os.path.join(dp, f)) for dp, dn, filenames in os.walk('input') for f in filenames if os.path.splitext(f)[1] == '.jpg'])  


def extract_features(file, model):
    # levanta imagen como array 224x224
    img = load_img(file, target_size=(224,224))
    # convierte img a numpy array (originalmente es 'PIL.Image.Image')
    img = np.array(img)
    # reshape para tener formato necesario para el modelo (num_of_samples, dim 1, dim 2, channels)
    reshaped_img = img.reshape(1,224,224,3)
    # prepara imagen para modelo (función de keras)
    imgx = preprocess_input(reshaped_img)
    # extrae features
    features = model.predict(imgx, use_multiprocessing=True)
    return features
    
def preprocess(names_and_paths, model):
    preprocessed_data = {}
    for name, path in names_and_paths:
    #   print(path)
      featuress = extract_features(path, model)
      preprocessed_data[path] = {'label':name,
                    'features':featuress}
      print(f"Extracción features de {path}")
    return preprocessed_data


# function that lets you view a cluster (based on identifier)
def view_cluster(gps,cluster):
    plt.figure(figsize = (25,25));
    # gets the list of filenames for a cluster
    files = gps
    # only allow up to 30 images to be shown at a time
    if len(files) > 10:
        print(f"Clipping cluster size from {len(files)} to 10")
        files = files[:9]
    # plot each image in the cluster
    for index, file in enumerate(files):
        plt.subplot(1,10,index+1);
        img = load_img(file)
        img = np.array(img)
        plt.imshow(img)
        plt.title('Cluster n:' + str(cluster))
        plt.axis('off')

In [7]:
names_and_paths = get_labels_and_paths(input_folder='input')

import random
from collections import Counter
from itertools import groupby

# Contar las clases
class_counts = Counter(item[0] for item in names_and_paths)

# Calcular la cantidad mínima de elementos por clase
min_count = 100

# Crear una lista de elementos por clase
grouped_data = {key: [item for item in names_and_paths if item[0] == key] for key in class_counts.keys()}

# Tomar una muestra estratificada
stratified_sample = []
for key, group in grouped_data.items():
    random.shuffle(group)  # Mezcla los elementos para obtener una muestra aleatoria
    stratified_sample.extend(group[:min_count])  # Agrega la misma cantidad de elementos por clase


In [10]:
class_counts

Counter({'Arborio': 15000,
         'Basmati': 15000,
         'Ipsala': 15000,
         'Jasmine': 15000,
         'Karacadag': 15000})

In [5]:
data_dict = preprocess(names_and_paths=names_and_paths, model=model)

Extracción features de input/Arborio/Arborio (1).jpg
Extracción features de input/Arborio/Arborio (10).jpg
Extracción features de input/Arborio/Arborio (100).jpg
Extracción features de input/Arborio/Arborio (1000).jpg
Extracción features de input/Arborio/Arborio (10000).jpg
Extracción features de input/Arborio/Arborio (10001).jpg
Extracción features de input/Arborio/Arborio (10002).jpg
Extracción features de input/Arborio/Arborio (10003).jpg
Extracción features de input/Arborio/Arborio (10004).jpg
Extracción features de input/Arborio/Arborio (10005).jpg
Extracción features de input/Arborio/Arborio (10006).jpg
Extracción features de input/Arborio/Arborio (10007).jpg
Extracción features de input/Arborio/Arborio (10008).jpg
Extracción features de input/Arborio/Arborio (10009).jpg
Extracción features de input/Arborio/Arborio (1001).jpg
Extracción features de input/Arborio/Arborio (10010).jpg
Extracción features de input/Arborio/Arborio (10011).jpg
Extracción features de input/Arborio/Arbor

: 

: 

In [None]:
labels =  [data_dict[k]['label'] for k in data_dict.keys()]
features = [data_dict[k]['features'] for k in data_dict.keys()]
features = np.array(features)

In [None]:
# reduce the amount of dimensions in the feature vector
pca = PCA(n_components=100, random_state=22)
pca.fit(features)
x = pca.transform(features)