Código adaptado del tutorial: https://github.com/intel-analytics/BigDL-Tutorials/blob/master/notebooks/neural_networks/cnn.ipynb

In [6]:
import os
import numpy as np


# Lista archivos en el directorio de entrada
#input_files = os.listdir('/workspace/input')
#print("Archivos en /workspace/input:", input_files)

In [7]:
from __future__ import print_function
import numpy as np
from bigdl.nn.keras.topology import Sequential
from bigdl.nn.keras.layer import *

In [8]:
from bigdl.nn.criterion import *

In [14]:
import numpy as np
from pyspark import SparkConf, SparkContext
from bigdl.nn.layer import *
from bigdl.optim.optimizer import *
from bigdl.util.common import init_engine, Sample
import datetime as dt
from sklearn.model_selection import train_test_split

In [11]:
import matplotlib

import pandas
import datetime as dt

from bigdl.nn.layer import *
from bigdl.nn.criterion import *
from bigdl.optim.optimizer import *
from bigdl.util.common import *
from bigdl.dataset.transformer import *
from matplotlib.pyplot import imshow
import matplotlib.pyplot as plt
from pyspark import SparkContext
# Obtain (or reuse) the SparkContext. The configuration starts from BigDL's
# create_spark_conf() and then selects a local master with 4 worker threads
# and a 2 GB driver memory setting.
sc = SparkContext.getOrCreate(
    conf=(
        create_spark_conf()
        .setMaster("local[4]")
        .set("spark.driver.memory", "2g")
    )
)

# Initialise the BigDL engine.
init_engine()

## SESIÓN DE SPARK

In [2]:
import pyspark

# Show which PySpark release this kernel is running against.
pyspark_version = pyspark.__version__
print("PySpark version:", pyspark_version)

PySpark version: 2.4.3


In [22]:
# Locations of the image and label arrays (.npy files)
images_path = '/workspace/input/stage2_imagenes.npy'
masks_path = '/workspace/input/stage2_etiquetas.npy'

# Read both arrays into memory (images first, then labels)
train_images, train_masks = np.load(images_path), np.load(masks_path)

# Report the shapes of what was loaded
print(f"train_images shape: {train_images.shape}")
print(f"train_masks shape: {train_masks.shape}")

train_images shape: (5663, 28, 28, 3)
train_masks shape: (5663, 2)


In [23]:
# Turn the one-hot labels into class indices, then shift them into the range [1, 2]
class_indices = train_masks.argmax(axis=1)
train_masks = class_indices.astype('int64') + 1

# Check the result of the conversion
print(f"train_masks shape after conversion: {train_masks.shape}")  # (5663,)
print(f"train_masks unique values: {np.unique(train_masks)}")

train_masks shape after conversion: (5663,)
train_masks unique values: [1 2]


In [24]:
# b. Reorder the image axes from (N, H, W, C) to (N, C, H, W), cast to float32
# and divide by 255 (assumes 8-bit pixel values — TODO confirm).
# This cell rebinds train_images, so the conversion is only applied while the
# 3-channel axis is not yet in position 1. Without the guard, running the cell
# a second time would transpose and rescale the already-converted array again
# without raising any error.
if train_images.shape[1] != 3:
    train_images = train_images.transpose(0, 3, 1, 2).astype('float32') / 255.0

print(f"train_images transposed shape: {train_images.shape}")  # (5663, 3, 28, 28)

# c. Split into training and validation sets (20% held out, fixed seed so the
# split is the same on every run)
train_imgs, val_imgs, train_lbls, val_lbls = train_test_split(
    train_images, train_masks, test_size=0.2, random_state=42
)

print(f"train_imgs shape: {train_imgs.shape}")  # (4530, 3, 28, 28)
print(f"train_lbls shape: {train_lbls.shape}")  # (4530,)
print(f"val_imgs shape: {val_imgs.shape}")      # (1133, 3, 28, 28)
print(f"val_lbls shape: {val_lbls.shape}")      # (1133,)

# 3. Initialise Spark and BigDL.
# The configuration starts from create_spark_conf() instead of a bare
# SparkConf(), so the BigDL-specific settings are present if this cell is the
# one that actually creates the context.
# NOTE: SparkContext.getOrCreate() returns the already-running context when one
# exists; in that case the app name and master chosen here are not applied.
conf = create_spark_conf().setAppName("LenetTraining").setMaster("local[*]")  # Adjust to your environment
sc = SparkContext.getOrCreate(conf=conf)
init_engine()


train_images transposed shape: (5663, 3, 28, 28)
train_imgs shape: (4530, 3, 28, 28)
train_lbls shape: (4530,)
val_imgs shape: (1133, 3, 28, 28)
val_lbls shape: (1133,)


## MODELO CNN

In [16]:
def build_model(class_num):
    """Assemble the LeNet-style CNN for 3-channel 28x28 inputs.

    Args:
        class_num: Output size of the final ``Linear`` layer ('score'),
            i.e. the number of classes.

    Returns:
        The ``Sequential`` container with all layers added, ending in
        ``LogSoftMax``.
    """
    # The container is created before the layers, as in the original order.
    net = Sequential()

    stack = [
        # View the input as 3 channels of 28x28
        Reshape([3, 28, 28]),
        # First convolution takes the 3 input channels
        SpatialConvolution(3, 6, 5, 5).set_name('conv1'),
        Tanh(),
        SpatialMaxPooling(2, 2, 2, 2).set_name('pool1'),
        Tanh(),
        SpatialConvolution(6, 12, 5, 5).set_name('conv2'),
        SpatialMaxPooling(2, 2, 2, 2).set_name('pool2'),
        # Flatten to 12 * 4 * 4 values for the fully connected part
        # (presumably 12 feature maps of 4x4 after the two conv/pool stages — TODO confirm)
        Reshape([12 * 4 * 4]),
        Linear(12 * 4 * 4, 100).set_name('fc1'),
        Tanh(),
        # One score per output class
        Linear(100, class_num).set_name('score'),
        LogSoftMax(),
    ]

    # Append the layers in order
    for layer in stack:
        net.add(layer)

    return net

# Build the network with 2 output classes
lenet_model = build_model(class_num=2)


creating: createSequential
creating: createReshape
creating: createSpatialConvolution
creating: createTanh
creating: createSpatialMaxPooling
creating: createTanh
creating: createSpatialConvolution
creating: createSpatialMaxPooling
creating: createReshape
creating: createLinear
creating: createTanh
creating: createLinear
creating: createLogSoftMax


In [25]:
# 4. Crear RDDs de muestras

def to_sample(image, label):
    """Wrap one (image, label) pair in a BigDL ``Sample``.

    Args:
        image: Feature array for a single example.
        label: Label belonging to that example.

    Returns:
        The object produced by ``Sample.from_ndarray(image, label)``.
    """
    sample = Sample.from_ndarray(image, label)
    return sample

def _samples_rdd(images, labels):
    """Pair images with labels, distribute them and map each pair to a cached Sample RDD."""
    pairs = sc.parallelize(zip(images, labels))
    return pairs.map(lambda pair: to_sample(pair[0], pair[1])).cache()


train_rdd = _samples_rdd(train_imgs, train_lbls)
val_rdd = _samples_rdd(val_imgs, val_lbls)


In [26]:
# 6. Set up the Optimizer; training and validation use the same mini-batch size
BATCH_SIZE = 40

# Components are created in the same order as the keyword arguments below
nll_criterion = ClassNLLCriterion()
sgd_method = SGD(learningrate=0.01, learningrate_decay=0.0002)
stop_trigger = MaxEpoch(20)

optimizer = Optimizer(
    model=lenet_model,
    training_rdd=train_rdd,
    criterion=nll_criterion,
    optim_method=sgd_method,
    end_trigger=stop_trigger,
    batch_size=BATCH_SIZE,
)

# Validation: Top1Accuracy on val_rdd, triggered by EveryEpoch()
optimizer.set_validation(
    batch_size=BATCH_SIZE,
    val_rdd=val_rdd,
    trigger=EveryEpoch(),
    val_method=[Top1Accuracy()],
)

creating: createClassNLLCriterion
creating: createDefault
creating: createSGD
creating: createMaxEpoch
creating: createDistriOptimizer
creating: createEveryEpoch
creating: createTop1Accuracy


In [27]:
# Configure the training and validation summaries.
# The app name carries a timestamp so each run gets its own name.
run_stamp = dt.datetime.now().strftime("%Y%m%d-%H%M%S")
app_name = 'lenet-' + run_stamp
summary_dir = '/tmp/bigdl_summaries'

train_summary = TrainSummary(log_dir=summary_dir, app_name=app_name)
# Attach a SeveralIteration(50) trigger to the "Parameters" summary
train_summary.set_summary_trigger("Parameters", SeveralIteration(50))
val_summary = ValidationSummary(log_dir=summary_dir, app_name=app_name)

# Register both summaries with the optimizer
optimizer.set_train_summary(train_summary)
optimizer.set_val_summary(val_summary)

print("Saving logs to", app_name)

creating: createTrainSummary
creating: createSeveralIteration
creating: createValidationSummary
Saving logs to lenet-20241120-144539


In [28]:
# 7. Train the model: run the configured optimizer and keep the model it returns
trained_model = optimizer.optimize()

In [34]:
# 8. Save the trained model; the destination is built once and reused for the message
model_path = f"/workspace/output/{app_name}_lenet_model"
trained_model.save(model_path)
print(f"Modelo guardado en {model_path}")

Modelo guardado en /workspace/output/lenet-20241120-144539_lenet_model
