In [1]:
import os
import random
import zipfile
from pathlib import Path
from functools import partial

import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

print("TensorFlow:", tf.__version__)

TensorFlow: 2.19.0


In [2]:
MODEL_TYPE = "cnn"   # "cnn" or "vit"
BATCH_SIZE = 128
IMG_SIZE = 64        # Tiny ImageNet images are 64x64
EPOCHS = 12          # prueba corta; aumentar si tienes tiempo
AUTOTUNE = tf.data.AUTOTUNE
SEED = 42
np.random.seed(SEED)
tf.random.set_seed(SEED)

In [3]:
# Tiny ImageNet (stanford) - contiene train/val/test
DATA_DIR = Path("/content/tiny-imagenet-200")
if not DATA_DIR.exists():
    url = "http://cs231n.stanford.edu/tiny-imagenet-200.zip"
    zip_path = "/content/tiny-imagenet-200.zip"
    print("Descargando Tiny ImageNet (~240MB)...")
    !wget -q --show-progress -O {zip_path} {url}
    print("Extrayendo...")
    with zipfile.ZipFile(zip_path, 'r') as zf:
        zf.extractall("/content")
    print("Listo.")
else:
    print("Tiny ImageNet ya existe en", DATA_DIR)

Descargando Tiny ImageNet (~240MB)...
Extrayendo...
Listo.


In [4]:
# Para la tarea de rotación vamos a tomar las imágenes y generar 4 rotaciones por imagen.
import glob
from PIL import Image

def list_image_paths(base_dir):
    # listará imágenes de train (cada clase en subcarpeta) y val
    train_dir = base_dir / "train"
    val_dir = base_dir / "val"
    train_paths = glob.glob(str(train_dir / "*"/ "images" / "*.JPEG"))
    val_images = glob.glob(str(val_dir / "images" / "*.JPEG"))
    # También hay validation annotations para labels, pero para pretexto no necesitamos labels
    return train_paths, val_images

train_paths, val_paths = list_image_paths(DATA_DIR)
len(train_paths), len(val_paths)


(100000, 10000)

In [5]:
def preprocess_image(path, img_size=IMG_SIZE):
    img = tf.io.read_file(path)
    img = tf.image.decode_jpeg(img, channels=3)
    img = tf.image.resize(img, [img_size, img_size])
    img = tf.cast(img, tf.float32) / 255.0
    return img

# Augmentaciones inspiradas en el artículo (random crop, flip, color jitter, gaussian noise)
def random_augment(image):
    # Random crop + resize (scale + crop)
    image = tf.image.random_crop(tf.image.pad_to_bounding_box(image, 4, 4, IMG_SIZE+8, IMG_SIZE+8), size=[IMG_SIZE, IMG_SIZE, 3])
    image = tf.image.random_flip_left_right(image)
    # Color jitter: brightness, contrast, saturation, hue
    image = tf.image.random_brightness(image, max_delta=0.2)
    image = tf.image.random_contrast(image, 0.8, 1.2)
    image = tf.image.random_saturation(image, 0.8, 1.2)
    image = tf.image.random_hue(image, 0.05)
    # Clip
    image = tf.clip_by_value(image, 0.0, 1.0)
    # Optional gaussian noise
    noise = tf.random.normal(shape=tf.shape(image), mean=0.0, stddev=0.02)
    image = image + noise
    image = tf.clip_by_value(image, 0.0, 1.0)
    return image

def make_rotations(image):
    # Return 4 images and labels 0..3 corresponding to rotation * 90 degrees
    imgs = [
        image,
        tf.image.rot90(image, k=1),
        tf.image.rot90(image, k=2),
        tf.image.rot90(image, k=3),
    ]
    labels = [0,1,2,3]
    return imgs, labels

def dataset_from_paths(paths, batch_size=BATCH_SIZE, training=True):
    # Convert to string tensors (prevents float path errors)
    paths = tf.convert_to_tensor(paths, dtype=tf.string)

    ds = tf.data.Dataset.from_tensor_slices(paths)
    if training:
        ds = ds.shuffle(10000, reshuffle_each_iteration=True)
    ds = ds.map(lambda p: preprocess_image(p), num_parallel_calls=AUTOTUNE)
    if training:
        ds = ds.map(lambda x: random_augment(x), num_parallel_calls=AUTOTUNE)

    def expand_rotations(img):
        imgs, labs = make_rotations(img)
        return tf.data.Dataset.from_tensor_slices((tf.stack(imgs), tf.constant(labs, dtype=tf.int32)))

    ds = ds.flat_map(expand_rotations)
    ds = ds.batch(batch_size)
    ds = ds.prefetch(AUTOTUNE)
    return ds

train_ds = dataset_from_paths(train_paths, training=True)
val_ds = dataset_from_paths(val_paths, training=False)


In [6]:
# Modelos: CNN vanilla y ViT simple
# Encoder outputs a feature vector; final head predicts 4 rotations.

def build_cnn_encoder(input_shape=(IMG_SIZE,IMG_SIZE,3), embedding_dim=256):
    inputs = keras.Input(shape=input_shape)
    x = layers.Conv2D(64, 3, padding="same", activation="relu")(inputs)
    x = layers.BatchNormalization()(x)
    x = layers.MaxPool2D(2)(x)

    x = layers.Conv2D(128, 3, padding="same", activation="relu")(x)
    x = layers.BatchNormalization()(x)
    x = layers.MaxPool2D(2)(x)

    x = layers.Conv2D(256, 3, padding="same", activation="relu")(x)
    x = layers.BatchNormalization()(x)
    x = layers.MaxPool2D(2)(x)

    x = layers.GlobalAveragePooling2D()(x)
    x = layers.Dense(embedding_dim, activation="relu")(x)
    x = layers.BatchNormalization()(x)
    model = keras.Model(inputs, x, name="cnn_encoder")
    return model

# Simple Vision Transformer (patchify + transformer blocks)
def build_vit_encoder(input_shape=(IMG_SIZE,IMG_SIZE,3), patch_size=8, num_patches=None,
                      projection_dim=64, transformer_layers=4, num_heads=4, mlp_dim=128):
    if num_patches is None:
        num_patches = (input_shape[0] // patch_size) * (input_shape[1] // patch_size)

    inputs = keras.Input(shape=input_shape)
    # Create patches
    patches = layers.Conv2D(filters=projection_dim, kernel_size=patch_size, strides=patch_size, padding='valid')(inputs)
    # patches shape: (B, H/ps, W/ps, projection_dim) --> flatten to sequence
    shape = tf.shape(patches)
    x = layers.Reshape((num_patches, projection_dim))(patches)
    # Add positional embeddings
    positions = tf.range(start=0, limit=num_patches, delta=1)
    pos_emb = layers.Embedding(input_dim=num_patches, output_dim=projection_dim)(positions)
    x = x + pos_emb
    # Transformer blocks
    for _ in range(transformer_layers):
        # Layer norm + MultiHead
        x1 = layers.LayerNormalization(epsilon=1e-6)(x)
        attn = layers.MultiHeadAttention(num_heads=num_heads, key_dim=projection_dim)(x1, x1)
        x = layers.Add()([attn, x])
        # MLP
        x2 = layers.LayerNormalization(epsilon=1e-6)(x)
        mlp = layers.Dense(mlp_dim, activation='relu')(x2)
        mlp = layers.Dense(projection_dim)(mlp)
        x = layers.Add()([mlp, x])
    x = layers.LayerNormalization(epsilon=1e-6)(x)
    x = layers.GlobalAveragePooling1D()(x)
    x = layers.Dense(256, activation='relu')(x)
    model = keras.Model(inputs, x, name="vit_encoder")
    return model

# Build final model depending on MODEL_TYPE
if MODEL_TYPE == "cnn":
    encoder = build_cnn_encoder()
elif MODEL_TYPE == "vit":
    encoder = build_vit_encoder()
else:
    raise ValueError("MODEL_TYPE must be 'cnn' or 'vit'")

# Full model: encoder + rotation prediction head
inputs = keras.Input(shape=(IMG_SIZE,IMG_SIZE,3))
z = encoder(inputs)
z = layers.Dropout(0.3)(z)
outputs = layers.Dense(4, activation='softmax')(z)
model = keras.Model(inputs, outputs, name=f"{MODEL_TYPE}_rotation_model")
model.summary()


In [7]:
# Compilar y callbacks
lr = 1e-3
optimizer = keras.optimizers.Adam(learning_rate=lr)
model.compile(optimizer=optimizer,
              loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])

checkpoint_path = f"/content/{MODEL_TYPE}_rotation_checkpoint.h5"
callbacks = [
    keras.callbacks.ModelCheckpoint(checkpoint_path, save_best_only=True, monitor='val_accuracy'),
    keras.callbacks.ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=3, min_lr=1e-6),
    keras.callbacks.EarlyStopping(monitor='val_accuracy', patience=6, restore_best_weights=True)
]


In [8]:
# Entrenamiento
history = model.fit(
    train_ds,
    validation_data=val_ds,
    epochs=EPOCHS,
    callbacks=callbacks
)

Epoch 1/12
   3125/Unknown [1m162s[0m 49ms/step - accuracy: 0.4216 - loss: 1.2975



[1m3125/3125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m170s[0m 51ms/step - accuracy: 0.4216 - loss: 1.2975 - val_accuracy: 0.5268 - val_loss: 1.1572 - learning_rate: 0.0010
Epoch 2/12
[1m3125/3125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 49ms/step - accuracy: 0.5811 - loss: 1.0027



[1m3125/3125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m157s[0m 50ms/step - accuracy: 0.5811 - loss: 1.0027 - val_accuracy: 0.5885 - val_loss: 1.0259 - learning_rate: 0.0010
Epoch 3/12
[1m3124/3125[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 49ms/step - accuracy: 0.6304 - loss: 0.8977



[1m3125/3125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m157s[0m 50ms/step - accuracy: 0.6304 - loss: 0.8977 - val_accuracy: 0.5942 - val_loss: 1.0418 - learning_rate: 0.0010
Epoch 4/12
[1m3125/3125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m157s[0m 50ms/step - accuracy: 0.6601 - loss: 0.8326 - val_accuracy: 0.4892 - val_loss: 2.0035 - learning_rate: 0.0010
Epoch 5/12
[1m3124/3125[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 49ms/step - accuracy: 0.6796 - loss: 0.7862



[1m3125/3125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m197s[0m 63ms/step - accuracy: 0.6796 - loss: 0.7862 - val_accuracy: 0.6547 - val_loss: 0.8981 - learning_rate: 0.0010
Epoch 6/12
[1m3124/3125[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 49ms/step - accuracy: 0.6933 - loss: 0.7525



[1m3125/3125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m159s[0m 51ms/step - accuracy: 0.6933 - loss: 0.7525 - val_accuracy: 0.6881 - val_loss: 0.7815 - learning_rate: 0.0010
Epoch 7/12
[1m3125/3125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 49ms/step - accuracy: 0.7025 - loss: 0.7292



[1m3125/3125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m159s[0m 51ms/step - accuracy: 0.7025 - loss: 0.7292 - val_accuracy: 0.7091 - val_loss: 0.7065 - learning_rate: 0.0010
Epoch 8/12
[1m3125/3125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m158s[0m 51ms/step - accuracy: 0.7128 - loss: 0.7063 - val_accuracy: 0.7066 - val_loss: 0.7365 - learning_rate: 0.0010
Epoch 9/12
[1m3125/3125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m157s[0m 50ms/step - accuracy: 0.7212 - loss: 0.6880 - val_accuracy: 0.6715 - val_loss: 0.8577 - learning_rate: 0.0010
Epoch 10/12
[1m3125/3125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 49ms/step - accuracy: 0.7285 - loss: 0.6693



[1m3125/3125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m169s[0m 54ms/step - accuracy: 0.7285 - loss: 0.6693 - val_accuracy: 0.7425 - val_loss: 0.6236 - learning_rate: 0.0010
Epoch 11/12
[1m3125/3125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m162s[0m 52ms/step - accuracy: 0.7330 - loss: 0.6554 - val_accuracy: 0.7168 - val_loss: 0.7473 - learning_rate: 0.0010
Epoch 12/12
[1m3125/3125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 51ms/step - accuracy: 0.7390 - loss: 0.6440



[1m3125/3125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m164s[0m 52ms/step - accuracy: 0.7390 - loss: 0.6440 - val_accuracy: 0.7488 - val_loss: 0.6090 - learning_rate: 0.0010


In [15]:
# Evaluación rápida
val_acc = history.history.get("val_accuracy") or history.history.get("val_acc") or [0]
print("Mejor accuracy val:", max(val_acc))

# Guardar encoder para transfer learning
encoder.save(f"/content/{MODEL_TYPE}_encoder_saved.keras", include_optimizer=False)
print(f"✅ Encoder guardado en /content/{MODEL_TYPE}_encoder_saved")

# (Opcional) prueba de carga
loaded_encoder = tf.keras.models.load_model(f"/content/{MODEL_TYPE}_encoder_saved")
print("Encoder cargado correctamente:", isinstance(loaded_encoder, tf.keras.Model))



Mejor accuracy val: 0.7487999796867371
✅ Encoder guardado en /content/cnn_encoder_saved


ValueError: File format not supported: filepath=/content/cnn_encoder_saved. Keras 3 only supports V3 `.keras` files and legacy H5 format files (`.h5` extension). Note that the legacy SavedModel format is not supported by `load_model()` in Keras 3. In order to reload a TensorFlow SavedModel as an inference-only layer in Keras 3, use `keras.layers.TFSMLayer(/content/cnn_encoder_saved, call_endpoint='serving_default')` (note that your `call_endpoint` might have a different name).

In [16]:
# Ejemplo de uso del encoder para transfer learning
# Supongamos que ahora quieres usar encoder y agregar una cabeza para clasificación
num_new_classes = 200  # en Tiny ImageNet hay 200 clases; esto sería un ejemplo
inputs = keras.Input(shape=(IMG_SIZE,IMG_SIZE,3))
features = encoder(inputs)
x = layers.Dense(512, activation='relu')(features)
x = layers.Dropout(0.3)(x)
out = layers.Dense(num_new_classes, activation='softmax')(x)
transfer_model = keras.Model(inputs, out, name="transfer_model")

# Congelar encoder y compilar
encoder.trainable = False
transfer_model.compile(optimizer=keras.optimizers.Adam(1e-3),
                       loss='sparse_categorical_crossentropy',
                       metrics=['accuracy'])
transfer_model.summary()
# Para entrenar necesitarás etiquetas de Tiny ImageNet (esta celda es demo)
