Importación de datos

In [None]:
import kagglehub
import shutil
import os


# Descargar dataset
path = kagglehub.dataset_download("andrewmvd/lung-and-colon-cancer-histopathological-images")

# Obtener el directorio de trabajo actual
destination_path = os.getcwd()+'/dataset/'
if not os.path.exists(destination_path):
    # Copia la carpeta de forma recursiva
    shutil.copytree(path, destination_path)

Cargar imágenes y etiquetas

In [1]:
import json
import cv2
# Abre el archivo JSON
with open("setup.json") as archivo:
    setup = json.load(archivo)

# Accede a los datos
print(setup['X_resolution_size'])


512


In [2]:
import cv2
import os
from concurrent.futures import ThreadPoolExecutor
import pandas as pd
from tqdm import tqdm
import json


# Función para procesar y extraer características en lugar de almacenar imágenes completas
def process_file(file_path, label, resize=True, X_resolution_size= setup['X_resolution_size'], Y_resolution_size= setup['Y_resolution_size'], color_mode="IMREAD_GRAYSCALE"):
    """
    Procesa un archivo de imagen, lo redimensiona y extrae características.
    :param file_path: Ruta del archivo de imagen.
    :param label: Etiqueta de la clase.
    :param resize: Si se debe redimensionar la imagen.
    :param resizeResol: Resolución a la que redimensionar la imagen.
    :param color_mode: Modo de color para cargar la imagen (opciones: cv2.IMREAD_COLOR [RGB], IMREAD_GRAYSCALE [Grayscale]).
    :return: Un diccionario con características y etiqueta.
    """

    try:
        color = cv2.IMREAD_GRAYSCALE  if color_mode == "IMREAD_GRAYSCALE" else cv2.IMREAD_COLOR
        image = cv2.imread(file_path, color)  # Load in GRAY mode
        if resize:
            image = cv2.resize(image, (X_resolution_size, Y_resolution_size))  # Resize to X_resolution_size x Y_resolution_size
        if image is not None:
            return {'features': image.flatten(), 'label': label}  # Store features
        else:
            print(f"Error loading image: {file_path}")
    except Exception as e:
        print(f"Exception processing {file_path}: {e}")
    return None

# Generador para manejar archivos sin retener imágenes completas en memoria
def image_loader(dirs, labels, max_workers=4):
    for dir, label in zip(dirs, labels):
        if not os.path.exists(dir):
            print(f"Directorio no encontrado: {dir}")
            continue

        files = [os.path.join(dir, file) for file in os.listdir(dir)]
        with ThreadPoolExecutor(max_workers=max_workers) as executor:
            print(setup['X_resolution_size'])
            results = list(tqdm(executor.map(lambda f: process_file(f, label, True, setup['X_resolution_size'], setup['Y_resolution_size'] ), files), 
                                desc=f"Procesando {dir}", total=len(files)))
        for result in results:
            if result is not None:
                yield result

# Carpeta del dataset
dataset_path = 'dataset/lung_colon_image_set/'

# Definir las subcarpetas de las clases 
dirs = [
    os.path.join(dataset_path, 'lung_image_sets/lung_n'),
    os.path.join(dataset_path, 'colon_image_sets/colon_aca'),
    os.path.join(dataset_path, 'colon_image_sets/colon_n'),
    os.path.join(dataset_path, 'lung_image_sets/lung_aca'),
    os.path.join(dataset_path, 'lung_image_sets/lung_scc')
]

labels = [0, 1, 2, 3, 4]

# Usar el generador optimizado
data = []
for result in image_loader(dirs, labels):
    data.append(result)

# Convertir a un DataFrame más ligero
df = pd.DataFrame(data)


X = list(df['features'])
y = list(df['label'])

512


Procesando dataset/lung_colon_image_set/lung_image_sets/lung_n: 100%|██████████| 5000/5000 [00:03<00:00, 1482.74it/s]


512


Procesando dataset/lung_colon_image_set/colon_image_sets/colon_aca: 100%|██████████| 5000/5000 [00:04<00:00, 1151.81it/s]


512


Procesando dataset/lung_colon_image_set/colon_image_sets/colon_n: 100%|██████████| 5000/5000 [00:05<00:00, 934.59it/s] 


512


Procesando dataset/lung_colon_image_set/lung_image_sets/lung_aca: 100%|██████████| 5000/5000 [00:07<00:00, 701.92it/s]


512


Procesando dataset/lung_colon_image_set/lung_image_sets/lung_scc: 100%|██████████| 5000/5000 [00:09<00:00, 528.80it/s]


In [3]:
df.head()

Unnamed: 0,features,label
0,"[111, 80, 71, 72, 79, 84, 94, 119, 129, 129, 1...",0
1,"[191, 190, 190, 192, 195, 195, 198, 196, 190, ...",0
2,"[203, 203, 203, 203, 203, 204, 204, 204, 204, ...",0
3,"[204, 204, 204, 204, 204, 204, 204, 204, 204, ...",0
4,"[202, 200, 199, 199, 199, 201, 201, 201, 201, ...",0


Preparación de los datos

In [4]:
import numpy as np

# Convertir las etiquetas a categóricas (one-hot encoding) con numpy
train_df['label'] = np.eye(len(labels))[train_df['label']]
val_df['label'] = np.eye(len(labels))[val_df['label']]
test_df['label'] = np.eye(len(labels))[test_df['label']]

# Mostrar las dimensiones de los conjuntos
print("train_df shape:", train_df.shape)
print("val_df shape:", val_df.shape)
print("test_df shape:", test_df.shape)


NameError: name 'train_df' is not defined

Visualizar las imágenes

In [None]:
import matplotlib.pyplot as plt

def visualize_example(x, X_Resolution_size=512, Y_Resolution_size=512, color_mode=cv2.IMREAD_GRAYSCALE):
    """
    Visualiza un ejemplo de imagen a partir de las características extraídas.
    :param x: Características de la imagen.
    :param color_mode: Modo de color para visualizar (opciones: cv2.IMREAD_COLOR [RGB], IMREAD_GRAYSCALE [Grayscale]).
    """
    if color_mode == cv2.IMREAD_COLOR:
        # Reshape the flattened image back to its original dimensions (e.g., 512x512x3 for RGB)
        image = x.reshape(X_Resolution_size, Y_Resolution_size, 3)
        plt.imshow(cv2.cvtColor(image, cv2.COLOR_BGR2RGB))  # Convert BGR to RGB for correct visualization
    elif color_mode == cv2.IMREAD_GRAYSCALE:
        # Reshape the flattened image back to its original dimensions (e.g., 512x512 for Grayscale)
        image = x.reshape(X_Resolution_size, Y_Resolution_size)
        plt.imshow(image, cmap='gray')  # Use grayscale colormap
    else:
        raise ValueError("Unsupported color mode. Use cv2.IMREAD_COLOR or cv2.IMREAD_GRAYSCALE.")
    
    plt.grid(False)
    plt.show()


In [None]:
# Visualize the first 5 examples in X_test
for i, x in enumerate(test_df[:5]):
    visualize_example(x, X_Resolution_size=setup['X_resolution_size'], Y_Resolution_size=['Y_resolution_size'], color_mode=cv2.IMREAD_GRAYSCALE)
    print(f"Label: {y_test[i]}")  # Print the corresponding label for each image

AttributeError: 'str' object has no attribute 'reshape'

In [None]:
num_classes = len(labels)
# Adjust input shape based on color mode
color_mode = "RGB"  # Change to "Grayscale" for grayscale images
if color_mode == "RGB":
    input_shape = (IMG_SIZE, IMG_SIZE, 3)  # RGB images
else:
    input_shape = (IMG_SIZE, IMG_SIZE, 1)  # Grayscale images

## Modelo de Javi

In [None]:
from keras import layers, Sequential
from keras.regularizers import l2
from keras.optimizers import Adam

filters = 32  # Valor de ejemplo para filtros
kernel_size = (3, 3)  # Valor de ejemplo para el tamaño del kernel

# Ejemplo de una capa convolucional
conv_layer = layers.Conv2D(
    filters, 
    kernel_size, 
    strides=(1, 1), 
    padding='valid', 
    data_format=None, 
    dilation_rate=(1, 1), 
    activation=None, 
    use_bias=True, 
    kernel_initializer='glorot_uniform', 
    bias_initializer='zeros', 
    kernel_regularizer=None, 
    bias_regularizer=None, 
    activity_regularizer=None, 
    kernel_constraint=None, 
    bias_constraint=None    
)

# Construcción del modelo secuencial
IMG_SIZE = 128  # Define el tamaño de las imágenes de entrada
model = Sequential([
    layers.Conv2D(32, kernel_size=(3, 3), data_format="channels_last", activation='relu', input_shape=(IMG_SIZE, IMG_SIZE, 3), kernel_regularizer=l2(0.01)),
    layers.Conv2D(64, kernel_size=(3, 3), activation='relu', kernel_regularizer=l2(0.01)),
    layers.Conv2D(128, kernel_size=(3, 3), activation='relu', kernel_regularizer=l2(0.01)),
    layers.Flatten(),
    layers.Dense(256, activation='relu', kernel_regularizer=l2(0.01)),
    layers.Dropout(0.5),
    layers.Dense(6, activation='softmax')
])

# Resumen del modelo
model.summary()

# Compilación del modelo
model.compile(optimizer=Adam(learning_rate=0.001), loss='categorical_crossentropy', metrics=['accuracy'])

In [None]:
from keras.utils import to_categorical
from keras.layers import Dense


# Entrenamiento del modelo
history = model.fit(
    train_df['features'].tolist(),
    train_df['label'].tolist(),
    epochs=15,
    validation_data=(val_df['features'].tolist(), y_val['label'].tolist()),
    batch_size=32,  # Especificar el tamaño del batch (opcional)
    verbose=1       # Ajustar el nivel de verbosidad (opcional)
)


IndexError: only integers, slices (`:`), ellipsis (`...`), numpy.newaxis (`None`) and integer or boolean arrays are valid indices

## Modelo David

In [None]:
# Cargamos un modelo pre-entrenado con Imagenet
vgg_model = VGG19(include_top=False, weights="imagenet", input_shape=input_shape)

# Congelamos al modelo / que los parámetros no se actualicen
for layer in vgg_model.layers[:12]:
  layer.trainable = False

In [None]:
model = Sequential()
model.add(vgg_model)
model.add(Flatten())
model.add(Dense(units=500, activation="relu"))
model.add(Dense(units=num_classes,   activation="softmax"))
model.compile(optimizer='sgd', # Mejor optimizador
              loss="categorical_crossentropy",
              metrics=["acc"])
history=model.fit(X_train, y_train, epochs=epochs, validation_split = 0.2)

In [None]:
def plot_acc(history, title="Model Accuracy"):
  """Imprime una gráfica mostrando la accuracy por epoch obtenida en un entrenamiento"""
  epochs_range = np.arange(1, len(history.history['acc']) + 1)
  plt.plot(epochs_range,history.history['acc'])
  plt.plot(epochs_range,history.history['val_acc'])
  plt.title(title)
  plt.ylabel('Accuracy')
  plt.xlabel('Epoch')
  plt.legend(['Train', 'Val'], loc='upper left')
  plt.show()
def plot_loss(history, title="Model Loss"):
  """Imprime una gráfica mostrando la pérdida por epoch obtenida en un entrenamiento"""
  epochs_range = np.arange(1, len(history.history['acc']) + 1)
  plt.plot(epochs_range,history.history['loss'])
  plt.plot(epochs_range,history.history['val_loss'])
  plt.title(title)
  plt.ylabel('Loss')
  plt.xlabel('Epoch')
  plt.legend(['Train', 'Val'], loc='upper right')
  plt.show()

In [None]:
plot_acc(history)

In [None]:
plot_loss(history)

## Modelo Erik

In [None]:
# Crear un modelo simple de CNN para clasificación de imágenes
model = Sequential([
    Conv2D(32, (3, 3), activation='relu', input_shape=(64, 64, 3)),
    MaxPooling2D(2, 2),
    Conv2D(64, (3, 3), activation='relu'),
    MaxPooling2D(2, 2),
    Flatten(),
    Dense(128, activation='relu'),
    Dense(len(train_generator.class_indices), activation='softmax')  # Número de clases (6)
])

# Compilar el modelo
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy']
)

# Entrenar el modelo
history = model.fit(
    train_generator,
    epochs=10,
    validation_data=val_generator
)

# Extraer métricas del historial
train_loss = history.history['loss']
val_loss = history.history['val_loss']
train_acc = history.history['accuracy']
val_acc = history.history['val_accuracy']

# Número de épocas
epochs = range(1, len(train_loss) + 1)

# Crear las gráficas
plt.figure(figsize=(12, 5))

# Gráfica de pérdida (Loss)
plt.subplot(1, 2, 1)
plt.plot(epochs, train_loss, label='Train Loss', marker='o')
plt.plot(epochs, val_loss, label='Validation Loss', marker='o')
plt.title('Loss over Epochs')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.legend()
plt.grid()

# Gráfica de exactitud (Accuracy)
plt.subplot(1, 2, 2)
plt.plot(epochs, train_acc, label='Train Accuracy', marker='o')
plt.plot(epochs, val_acc, label='Validation Accuracy', marker='o')
plt.title('Accuracy over Epochs')
plt.xlabel('Epochs')
plt.ylabel('Accuracy')
plt.legend()
plt.grid()

plt.tight_layout()
plt.show()
