In [1]:
# for loading/processing the images
# from keras.preprocessing.image import load_img
# from keras.preprocessing.image import img_to_array
from keras.applications.vgg16 import preprocess_input
from tensorflow.keras.utils  import load_img
from tensorflow.keras.utils import img_to_array

# models
from keras.applications.vgg16 import VGG16
from keras.models import Model

# clustering and dimension reduction
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA

# for everything else
import os
import numpy as np
import matplotlib.pyplot as plt
from random import randint
import pandas as pd
import pickle
from zipfile import ZipFile
from google.colab import drive
from io import BytesIO

In [2]:
# Mount Google Drive so the dataset archive can be read from this runtime.
drive.mount('/content/drive')
ARCHIVO = '/content/drive/MyDrive/DMCT/Rice_Dataset.zip'

# The archive handle is kept open: it is handed to preprocess() further down,
# which reads image members from it.
fotos_zip = ZipFile(file=ARCHIVO)

# Member names of the archive, first as a list and then as a pandas Series.
lista_imágenes = fotos_zip.namelist()
serie_imagenes = pd.Series(data=lista_imágenes)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
# Build the stock VGG16 network, then wrap it in a new Model that stops at the
# second-to-last layer, so predictions return that layer's activations.
vgg16_full = VGG16()
model = Model(
    inputs=vgg16_full.inputs,
    outputs=vgg16_full.layers[-2].output,
)

In [4]:
def get_labels_and_paths(input_folder):
    """Collect ``(label, path)`` pairs for every ``.jpg`` file under a folder.

    The folder is walked recursively. The label is the second ``/``-separated
    component of the directory that holds the file, and the path is that
    directory joined with the file name. Only the exact lowercase extension
    ``.jpg`` is accepted.

    Args:
        input_folder: Root directory to walk.

    Returns:
        A sorted list of ``(label, path)`` tuples.
    """
    pairs = []
    for dir_path, _subdirs, file_names in os.walk(input_folder):
        for file_name in file_names:
            _stem, extension = os.path.splitext(file_name)
            if extension != '.jpg':
                continue
            label = dir_path.split("/")[1]
            pairs.append((label, os.path.join(dir_path, file_name)))
    pairs.sort()
    return pairs


def extract_features(file, model, zip):
    """Run one image stored inside a zip archive through ``model``.

    Args:
        file: Name of the archive member to read.
        model: Object exposing ``predict``; it receives a batch holding the
            single preprocessed image.
        zip: Open archive object exposing ``read(name)`` that returns the
            member's raw bytes (the name shadows the builtin, but is kept
            because callers pass it by keyword).

    Returns:
        Whatever ``model.predict`` returns for the one-image batch.
    """
    # Decode the image bytes and resize to 224x224.
    img = load_img(BytesIO(zip.read(file)), target_size=(224, 224))
    # Convert the PIL image to a numpy array.
    img = np.array(img)
    # Add the leading batch axis: (num_of_samples, dim 1, dim 2, channels).
    reshaped_img = img.reshape(1, 224, 224, 3)
    # Apply the VGG16 preprocessing function from Keras.
    imgx = preprocess_input(reshaped_img)
    # `use_multiprocessing` is intentionally not passed: it only ever applied
    # to generator/Sequence inputs and newer Keras releases reject it.
    features = model.predict(imgx, verbose=0)
    return features

from tqdm import tqdm
def preprocess(names_and_paths, model, zip):
    """Extract features for every ``(label, path)`` pair, with a progress bar.

    Args:
        names_and_paths: Iterable of ``(label, path)`` pairs.
        model: Forwarded to ``extract_features``.
        zip: Forwarded to ``extract_features``.

    Returns:
        A dict keyed by path; each value is a dict with the keys ``'label'``
        and ``'features'``. A path that appears more than once keeps the
        entry of its last occurrence.
    """
    return {
        path: {'label': name,
               'features': extract_features(path, model, zip)}
        for name, path in tqdm(names_and_paths)
    }


# function that lets you view a cluster (based on identifier)
def view_cluster(gps, cluster, max_images=10):
    """Plot the images of one cluster side by side in a single row.

    Args:
        gps: Sliceable sequence of image paths for the cluster. Each entry is
            passed straight to ``load_img``.
            NOTE(review): other cells of this notebook read images out of a
            zip archive; confirm callers pass paths ``load_img`` can open.
        cluster: Cluster identifier, used only in the subplot titles.
        max_images: Maximum number of images drawn, and the number of columns
            in the row. Defaults to 10.
    """
    plt.figure(figsize=(25, 25))
    files = gps
    # Cap the number of images shown. The cap now matches the printed message
    # (the previous slice kept one image fewer than announced).
    if len(files) > max_images:
        print(f"Clipping cluster size from {len(files)} to {max_images}")
        files = files[:max_images]
    # Draw each image in its own column of the row.
    for index, file in enumerate(files):
        plt.subplot(1, max_images, index + 1)
        plt.imshow(np.array(load_img(file)))
        plt.title('Cluster n:' + str(cluster))
        plt.axis('off')

In [5]:
def _select_by_prefix(paths, prefix):
    """Return the entries of ``paths`` whose string value starts with ``prefix``."""
    return paths[paths.str.startswith(prefix)]


# One series of archive member names per rice variety, selected by path prefix.
ser_arb = _select_by_prefix(serie_imagenes, 'Rice_Image_Dataset/Arborio/A')
ser_bas = _select_by_prefix(
    serie_imagenes,
    ('Rice_Image_Dataset/Basmati/b', 'Rice_Image_Dataset/Basmati/B'),
)
ser_ips = _select_by_prefix(serie_imagenes, 'Rice_Image_Dataset/Ipsala/I')
ser_jas = _select_by_prefix(serie_imagenes, 'Rice_Image_Dataset/Jasmine/J')
ser_kar = _select_by_prefix(serie_imagenes, 'Rice_Image_Dataset/Karacadag/K')

# The full listings are dropped here, so this cell cannot be re-run unless
# `lista_imágenes` and `serie_imagenes` are rebuilt first.
del lista_imágenes, serie_imagenes

In [6]:
def label_path(label, paths):
    """Pair every element of ``paths`` with the same ``label``.

    Args:
        label: Value placed first in every tuple.
        paths: Iterable of paths.

    Returns:
        A list of ``(label, path)`` tuples in the iteration order of ``paths``.
    """
    labelled = []
    for path in paths:
        labelled.append((label, path))
    return labelled

#names_and_paths = get_labels_and_paths(input_folder=ARCHIVO)
# Label each variety's paths with its class name and concatenate the results,
# keeping the class order Arborio, Basmati, Ipsala, Jasmine, Karacadag.
names_and_paths = [
    pair
    for class_name, class_paths in (
        ('Arborio', ser_arb),
        ('Basmati', ser_bas),
        ('Ipsala', ser_ips),
        ('Jasmine', ser_jas),
        ('Karacadag', ser_kar),
    )
    for pair in label_path(class_name, class_paths)
]


import random
from collections import Counter
from itertools import groupby

# Count how many (label, path) pairs each class contributes.
class_counts = Counter(item[0] for item in names_and_paths)

# Per-class sample size: at most MAX_PER_CLASS, and never more than the
# smallest class holds, so every class contributes the same number of items.
MAX_PER_CLASS = 10000
min_count = min([MAX_PER_CLASS, *class_counts.values()])

# Bucket the pairs by label in a single pass over the data. Keys follow the
# order in which classes first appear; each bucket keeps the input order.
grouped_data = {key: [] for key in class_counts}
for item in names_and_paths:
    grouped_data[item[0]].append(item)

# Draw the stratified sample with a dedicated, seeded generator so the same
# items are selected on every run given the same input order.
SAMPLE_SEED = 22
sample_rng = random.Random(SAMPLE_SEED)
stratified_sample = []
for key, group in grouped_data.items():
    sample_rng.shuffle(group)  # shuffles the bucket in place
    stratified_sample.extend(group[:min_count])  # same count for every class


In [7]:
# Display the per-class counts of the (label, path) pairs.
class_counts

Counter({'Arborio': 15000,
         'Basmati': 15000,
         'Ipsala': 15000,
         'Jasmine': 15000,
         'Karacadag': 15000})

In [None]:
# Extract features for every sampled image; the model is called once per
# image, so this is the slow step of the notebook.
data_dict = preprocess(
    names_and_paths=stratified_sample,
    model=model,
    zip=fotos_zip,
)

 16%|█▌        | 8068/50000 [17:10<3:09:35,  3.69it/s]

In [None]:
# Split the extraction results into two aligned collections: the class label
# of every entry, and the first row of every entry's feature output stacked
# into one array.
labels = [entry['label'] for entry in data_dict.values()]
features = np.array([entry['features'][0] for entry in data_dict.values()])

In [None]:
# Display the stacked feature array (one row per entry of data_dict).
features

In [None]:
# Persist the feature matrix to Drive as CSV, without the index column.
# Only the numeric features are written; labels and paths are not included.
FEATURES_CSV = '/content/drive/MyDrive/DMCT/features.csv'
df_features = pd.DataFrame(data=features)
df_features.to_csv(FEATURES_CSV, index=False)

In [None]:
# Project the feature vectors onto 100 principal components. The PCA is
# fitted on the full feature matrix and then applied to that same matrix.
pca = PCA(n_components=100, random_state=22).fit(features)
x = pca.transform(features)