In [1]:
import os

In [2]:
# TensorFlow runtime knobs, set through the process environment. This cell runs
# before the cell that imports tensorflow, so the values are already in place
# when the library loads.
#   TF_CPP_MIN_LOG_LEVEL="2"         -> intended to quiet TensorFlow's native INFO/WARNING logs
#   TF_FORCE_GPU_ALLOW_GROWTH="true" -> intended to make GPU memory grow on demand
os.environ.update(
    TF_CPP_MIN_LOG_LEVEL="2",
    TF_FORCE_GPU_ALLOW_GROWTH="true",
)

In [5]:
import wandb
from wandb.integration.keras import WandbMetricsLogger, WandbModelCheckpoint
import numpy as np
from tensorflow.keras.datasets import cifar10
from tensorflow.keras.utils import to_categorical
from tensorflow import keras as k

In [6]:
# Authenticate with Weights & Biases up front; the run itself is created in a later cell.
wandb.login()

[34m[1mwandb[0m: Currently logged in as: [33mshah-harsh8[0m ([33mshah-harsh8-northeastern-university[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


True

In [7]:
class LogLRCallback(k.callbacks.Callback):
    """Log the optimizer's learning rate to W&B at the end of every epoch.

    The row is written on wandb's implicit, auto-incrementing step rather than
    an explicit ``step=``. The other ``wandb.log`` calls in this file all rely
    on the implicit step; forcing the step to the optimizer iteration count
    makes that shared counter jump each epoch, and wandb discards a row whose
    explicit step is lower than its current step. The iteration count is kept
    as a regular field so it can still be used as a custom x-axis.
    """
    def on_epoch_end(self, epoch, logs=None):
        """Read the current learning rate and iteration count and log them."""
        optimizer = self.model.optimizer
        lr = float(k.backend.get_value(optimizer.learning_rate))
        # Cast to a plain int so the logged value is JSON-friendly.
        iterations = int(optimizer.iterations.numpy())
        wandb.log({"learning_rate": lr, "optimizer/iterations": iterations})

class LogSamplesCallback(k.callbacks.Callback):
    """Log a W&B table of sample images with true/predicted labels each epoch.

    Only the first ``max_rows`` entries of ``x`` and ``y`` are retained. Rows of
    ``y`` are decoded with ``argmax`` along axis 1, and ``labels`` maps the
    resulting class index to a display name.
    """
    def __init__(self, x, y, labels, max_rows=16):
        super().__init__()
        self.labels = labels
        self.x, self.y = x[:max_rows], y[:max_rows]

    def on_epoch_end(self, epoch, logs=None):
        """Predict on the retained samples and log one table row per image."""
        scores = self.model.predict(self.x, verbose=0)
        true_ids = np.argmax(self.y, axis=1)
        pred_ids = np.argmax(scores, axis=1)

        table = wandb.Table(columns=["image", "y_true", "y_pred", "correct", "p(y_pred)"])
        for row, image in enumerate(self.x):
            actual, guess = true_ids[row], pred_ids[row]
            table.add_data(
                wandb.Image(image),
                self.labels[actual],
                self.labels[guess],
                bool(actual == guess),
                float(np.max(scores[row])),  # highest score in the row, i.e. that of the predicted class
            )
        # Each epoch is logged under its own key (1-based epoch number).
        wandb.log({f"samples/epoch_{epoch+1}": table})

class ConfusionMatrixCallback(k.callbacks.Callback):
    """Log a W&B confusion-matrix plot over the supplied validation data each epoch.

    ``y_val`` rows are decoded with ``argmax`` along axis 1; ``labels`` supplies
    the class names shown on the plot.
    """
    def __init__(self, x_val, y_val, labels):
        super().__init__()
        self.x_val, self.y_val, self.labels = x_val, y_val, labels

    def on_epoch_end(self, epoch, logs=None):
        """Predict on the whole validation set and log the resulting matrix."""
        scores = self.model.predict(self.x_val, verbose=0)
        chart = wandb.plot.confusion_matrix(
            probs=None,
            y_true=np.argmax(self.y_val, axis=1),
            preds=np.argmax(scores, axis=1),
            class_names=self.labels,
        )
        wandb.log({"confusion_matrix": chart})


# --- Trainer -----------------------------------------------------------------

class CIFAR10Trainer:
    """Train a small CNN on CIFAR-10 and track the experiment in Weights & Biases.

    Constructing the trainer starts a W&B run and loads the dataset. ``train()``
    builds the model, fits it, logs final metrics and a model artifact, and
    always closes the run, including when training fails or is interrupted.

    Args:
        project_name: W&B project that receives the run.
        run_name: Display name of the run.
        config: Optional mapping of hyperparameter overrides merged on top of
            ``DEFAULT_CONFIG``. ``None`` keeps the defaults.
    """

    # Baseline hyperparameters; individual keys can be overridden via ``config``.
    DEFAULT_CONFIG = dict(
        dropout=0.3,
        conv1_filters=32,
        conv2_filters=64,
        dense_size=128,
        learn_rate=0.001,
        epochs=5,
        batch_size=64,
    )

    def __init__(self, project_name="Lab2-CIFAR10-Advanced", run_name="cnn_plus", config=None):
        self.cfg = {**self.DEFAULT_CONFIG, **(config or {})}
        self.labels = [
            "airplane", "automobile", "bird", "cat", "deer",
            "dog", "frog", "horse", "ship", "truck"
        ]

        self.run = wandb.init(
            project=project_name,
            name=run_name,
            config=self.cfg,
            settings=wandb.Settings(start_method="thread"),
        )
        self.config = wandb.config
        try:
            self._prepare_data()
        except BaseException:
            # Do not leave an orphaned, still-open run behind if loading fails.
            self.run.finish(exit_code=1)
            raise

    def _prepare_data(self):
        """Load CIFAR-10, scale pixels to [0, 1] and one-hot encode the labels."""
        (x_train, y_train), (x_test, y_test) = cifar10.load_data()
        x_train, x_test = x_train.astype("float32")/255.0, x_test.astype("float32")/255.0
        # Fix the one-hot width from the label list instead of inferring it
        # from the largest label present in each split.
        n_classes = len(self.labels)
        y_train = to_categorical(y_train, num_classes=n_classes)
        y_test = to_categorical(y_test, num_classes=n_classes)
        self.X_train, self.X_test = x_train, x_test
        self.y_train, self.y_test = y_train, y_test

    def _build_model(self):
        """Build and compile the CNN described by ``self.config``.

        Returns:
            A compiled Keras model: two same-padded 3x3 conv layers, max-pool,
            dropout, a dense hidden layer and a softmax head with one unit per
            entry in ``self.labels``.
        """
        inputs = k.Input(shape=(32, 32, 3))
        x = k.layers.Conv2D(self.config.conv1_filters, (3,3), activation="relu", padding="same")(inputs)
        x = k.layers.Conv2D(self.config.conv2_filters, (3,3), activation="relu", padding="same")(x)
        x = k.layers.MaxPooling2D((2,2))(x)
        x = k.layers.Dropout(self.config.dropout)(x)
        x = k.layers.Flatten()(x)
        x = k.layers.Dense(self.config.dense_size, activation="relu")(x)
        outputs = k.layers.Dense(len(self.labels), activation="softmax")(x)

        model = k.Model(inputs, outputs)

        opt = k.optimizers.Adam(learning_rate=self.config.learn_rate)
        model.compile(loss="categorical_crossentropy", optimizer=opt, metrics=["accuracy"])
        return model

    def _log_model_artifact(self, model):
        """Save ``model`` under ``artifacts/`` and upload it as a W&B model artifact."""
        os.makedirs("artifacts", exist_ok=True)
        model_path = "artifacts/cifar10_model.keras"
        model.save(model_path)
        art = wandb.Artifact("cifar10_model", type="model")
        art.add_file(model_path)
        self.run.log_artifact(art)

    def train(self):
        """Fit the model, log final metrics and the model artifact, then close the run.

        The W&B run is finished on every exit path. It is reported with a
        non-zero exit code unless the whole pipeline completes.
        """
        exit_code = 1  # flipped to 0 only once everything below has succeeded
        try:
            model = self._build_model()

            callbacks = [
                WandbMetricsLogger(log_freq=10),
                WandbModelCheckpoint("checkpoints/model-{epoch:02d}.keras", save_weights_only=False),
                LogLRCallback(),
                LogSamplesCallback(self.X_test, self.y_test, self.labels, max_rows=16),
                ConfusionMatrixCallback(self.X_test, self.y_test, self.labels),
                k.callbacks.ReduceLROnPlateau(factor=0.5, patience=2, verbose=1),
                k.callbacks.EarlyStopping(patience=3, restore_best_weights=True, verbose=1),
            ]

            # NOTE(review): the test split doubles as validation data here and is
            # also what the "final/*" metrics below are computed on, so those
            # numbers are not from held-out data. Consider carving a validation
            # split out of the training set.
            model.fit(
                self.X_train, self.y_train,
                validation_data=(self.X_test, self.y_test),
                epochs=self.config.epochs,
                batch_size=self.config.batch_size,
                callbacks=callbacks,
                verbose=1,
            )

            loss, acc = model.evaluate(self.X_test, self.y_test, verbose=0)
            wandb.log({"final/loss": loss, "final/accuracy": acc})
            self._log_model_artifact(model)
            exit_code = 0
        finally:
            self.run.finish(exit_code=exit_code)


# Run one full training session using the trainer's default project and run names.
CIFAR10Trainer().train()



Epoch 1/5
[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m86s[0m 107ms/step - accuracy: 0.4288 - loss: 1.5940 - val_accuracy: 0.6278 - val_loss: 1.0708 - learning_rate: 0.0010
Epoch 2/5
[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m93s[0m 119ms/step - accuracy: 0.6484 - loss: 0.9959 - val_accuracy: 0.6591 - val_loss: 0.9781 - learning_rate: 0.0010
Epoch 3/5
[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m82s[0m 104ms/step - accuracy: 0.7076 - loss: 0.8465 - val_accuracy: 0.6706 - val_loss: 0.9415 - learning_rate: 0.0010
Epoch 4/5
[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m76s[0m 97ms/step - accuracy: 0.7458 - loss: 0.7274 - val_accuracy: 0.6910 - val_loss: 0.9030 - learning_rate: 0.0010
Epoch 5/5
[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m72s[0m 92ms/step - accuracy: 0.7799 - loss: 0.6240 - val_accuracy: 0.7034 - val_loss: 0.8760 - learning_rate: 0.0010
Restoring model weights from the end of the best epoch: 5.


[34m[1mwandb[0m: [32m[41mERROR[0m The nbformat package was not found. It is required to save notebook history.


0,1
batch/accuracy,▁▄▅▅▅▅▅▅▅▇▇▇▇▇▇▇▇▇▇▇▇███████████████████
batch/batch_step,▁▁▁▂▂▂▂▂▃▃▄▄▄▄▄▄▄▄▄▄▅▅▅▅▅▆▆▆▆▆▆▆▆▇▇▇▇▇▇█
batch/learning_rate,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
batch/loss,█▆▆▆▅▅▅▅▅▄▄▄▃▃▃▃▃▃▃▃▃▃▂▂▂▂▂▂▂▂▂▂▂▂▁▂▂▂▁▁
epoch/accuracy,▁▅▆▇█
epoch/epoch,▁▃▅▆█
epoch/learning_rate,▁▁▁▁▁
epoch/loss,█▄▃▂▁
epoch/val_accuracy,▁▄▅▇█
epoch/val_loss,█▅▃▂▁

0,1
batch/accuracy,0.77485
batch/batch_step,3940
batch/learning_rate,0.001
batch/loss,0.6388
epoch/accuracy,0.7748
epoch/epoch,4
epoch/learning_rate,0.001
epoch/loss,0.63896
epoch/val_accuracy,0.7034
epoch/val_loss,0.87597
