# Optimization Dynamics Study in CNNs
## An Empirical Investigation of Optimizer Behavior on CIFAR-10

In this study, I analyze how optimization strategy influences:
- Convergence speed
- Stability
- Generalization gap
- Final performance

The architecture is kept constant while only optimization dynamics are varied.

# Imports and Reproducibility

In [1]:
# Core libraries
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import time
import random

from sklearn.metrics import confusion_matrix, classification_report

# Reproducibility: push one shared seed into TensorFlow, NumPy and Python's
# built-in `random` module, in that order.
SEED = 42
for seed_fn in (tf.random.set_seed, np.random.seed, random.seed):
    seed_fn(SEED)

print("TensorFlow version:", tf.__version__)

TensorFlow version: 2.19.0


## Dataset: CIFAR-10

- 60,000 RGB images (32×32)
- 10 object classes
- 50,000 training / 10,000 testing

Images are normalized to stabilize gradient updates.
Labels are one-hot encoded for multi-class classification.

# Load and Preprocess Data

In [None]:
# Fetch the CIFAR-10 train/test splits through the Keras dataset helper.
(x_train, y_train), (x_test, y_test) = keras.datasets.cifar10.load_data()

# Normalize pixel values: cast each split to float32 and divide by 255.
x_train, x_test = (
    images.astype("float32") / 255.0 for images in (x_train, x_test)
)

# One-hot encoding of the labels, one column per class.
num_classes = 10
y_train, y_test = (
    keras.utils.to_categorical(labels, num_classes)
    for labels in (y_train, y_test)
)

# Display names, used later as confusion-matrix tick labels and as the
# target names of the classification report.
class_names = [
    "airplane", "automobile", "bird", "cat", "deer",
    "dog", "frog", "horse", "ship", "truck",
]

print("Training shape:", x_train.shape)
print("Test shape:", x_test.shape)

## Deep CNN Architecture (Controlled Variable)

The architecture remains fixed across experiments.

Regularization techniques:
- Batch Normalization
- Dropout
- L2 kernel regularization (an L2 penalty on the convolutional layers' weights)

Only optimizer behavior will change.

# Model Builder

In [3]:
def build_model(l2_reg=0.0005, dropout_rate=0.5,
                input_shape=(32, 32, 3), num_classes=10):
    """Build the fixed CNN shared by every optimizer experiment.

    The network has two convolutional stages followed by a dense head.
    Each stage is Conv2D -> BatchNormalization -> Conv2D -> MaxPooling2D ->
    Dropout(0.25), with 32 filters in the first stage and 64 in the second.
    The head is Flatten -> Dense(512, relu) -> Dropout -> Dense(softmax).

    Args:
        l2_reg: L2 penalty factor applied to the kernels of the four Conv2D
            layers. The Dense layers carry no regularizer.
        dropout_rate: Dropout rate used after the 512-unit Dense layer. The
            two convolutional stages always use a rate of 0.25.
        input_shape: Shape of one input image, without the batch dimension.
            Defaults to the (32, 32, 3) shape used by the notebook.
        num_classes: Number of units in the softmax output layer.

    Returns:
        An uncompiled ``keras.Sequential`` model.
    """
    model = keras.Sequential([
        # Declare the input with an explicit Input object rather than an
        # `input_shape=` argument on the first layer, which Keras 3 warns
        # against for Sequential models.
        keras.Input(shape=input_shape),

        # Stage 1: 32 filters.
        layers.Conv2D(32, (3, 3), padding="same", activation="relu",
                      kernel_regularizer=keras.regularizers.l2(l2_reg)),
        layers.BatchNormalization(),
        layers.Conv2D(32, (3, 3), activation="relu",
                      kernel_regularizer=keras.regularizers.l2(l2_reg)),
        layers.MaxPooling2D((2, 2)),
        layers.Dropout(0.25),

        # Stage 2: 64 filters.
        layers.Conv2D(64, (3, 3), padding="same", activation="relu",
                      kernel_regularizer=keras.regularizers.l2(l2_reg)),
        layers.BatchNormalization(),
        layers.Conv2D(64, (3, 3), activation="relu",
                      kernel_regularizer=keras.regularizers.l2(l2_reg)),
        layers.MaxPooling2D((2, 2)),
        layers.Dropout(0.25),

        # Classification head.
        layers.Flatten(),
        layers.Dense(512, activation="relu"),
        layers.Dropout(dropout_rate),
        layers.Dense(num_classes, activation="softmax"),
    ])
    return model

## Unified Training Procedure

To ensure controlled comparison:
- Same batch size
- Same epoch limit
- Same validation split
- Early stopping enabled

Metrics tracked:
- Train accuracy
- Validation accuracy
- Test accuracy
- Generalization gap
- Training time

# Training Framework

In [4]:
# One summary dict per completed experiment; appended to by train_experiment.
results_log = []


def train_experiment(name, optimizer, lr, momentum=None, weight_decay=None):
    """Train a fresh model with ``optimizer`` and log its summary metrics.

    A new model from ``build_model()`` is compiled with categorical
    cross-entropy and fitted on the module-level ``x_train``/``y_train`` for
    up to 40 epochs (batch size 128, 20% validation split). Early stopping
    watches ``val_loss`` with a patience of 5 and restores the best weights,
    so the returned model and its test accuracy correspond to the best
    ``val_loss`` epoch. Train accuracy, validation accuracy and the
    generalization gap are therefore read at that same epoch rather than at
    the last epoch run, which can be up to ``patience`` epochs later.

    Args:
        name: Label for this run; stored in the log and printed.
        optimizer: Optimizer instance (or identifier) passed to
            ``model.compile``.
        lr: Learning rate to record in the log. It is NOT applied to the
            optimizer; the caller must configure ``optimizer`` to match.
        momentum: Momentum value to record in the log (bookkeeping only).
        weight_decay: Weight-decay value to record in the log (bookkeeping
            only).

    Returns:
        A ``(history, model)`` tuple: the object returned by ``model.fit``
        and the trained model.

    Side effects:
        Appends one dict to the module-level ``results_log`` and prints a
        short summary.
    """
    model = build_model()

    model.compile(
        optimizer=optimizer,
        loss="categorical_crossentropy",
        metrics=["accuracy"],
    )

    early_stop = keras.callbacks.EarlyStopping(
        monitor="val_loss",
        patience=5,
        restore_best_weights=True,
    )

    # perf_counter is monotonic, so the measured duration cannot be skewed
    # by wall-clock adjustments during a long run.
    start_time = time.perf_counter()

    history = model.fit(
        x_train, y_train,
        epochs=40,
        batch_size=128,
        validation_split=0.2,
        callbacks=[early_stop],
        verbose=1,
    )

    training_time = time.perf_counter() - start_time

    test_loss, test_acc = model.evaluate(x_test, y_test, verbose=0)

    # Locate the epoch with the lowest validation loss: that is the epoch
    # whose weights early stopping restored and the test set was scored on.
    # If every val_loss is NaN (diverged run), fall back to the first epoch.
    val_losses = np.asarray(history.history["val_loss"], dtype=float)
    if np.isnan(val_losses).all():
        best_epoch = 0
    else:
        best_epoch = int(np.nanargmin(val_losses))

    train_acc = history.history["accuracy"][best_epoch]
    val_acc = history.history["val_accuracy"][best_epoch]
    gap = train_acc - val_acc

    results_log.append({
        "Optimizer": name,
        "Learning Rate": lr,
        "Momentum": momentum,
        "Weight Decay": weight_decay,
        "Train Acc": train_acc,
        "Val Acc": val_acc,
        "Generalization Gap": gap,
        "Test Acc": test_acc,
        "Training Time (s)": training_time,
        "Best Epoch": best_epoch + 1,  # 1-based, for readability
    })

    print(f"\n{name}")
    print(f"Test Accuracy: {test_acc:.4f}")
    print(f"Generalization Gap: {gap:.4f}")
    print(f"Training Time: {training_time:.2f}s")

    return history, model

## Optimization Experiments

I evaluate:
- SGD (LR sensitivity)
- SGD + Momentum
- Adam
- AdamW

# SGD (LR Sensitivity)

In [None]:
# Plain SGD at learning rate 0.01; momentum is left at the optimizer default.
# `lr` is only recorded in the results log, so it must mirror the optimizer.
history_sgd_01, model_sgd_01 = train_experiment(
    name="SGD (lr=0.01)",
    optimizer=keras.optimizers.SGD(learning_rate=0.01),
    lr=0.01,
)


# SGD + Momentum

In [None]:
# SGD at the same learning rate as the baseline run, now with momentum 0.9.
# `lr` and `momentum` are only recorded in the results log.
history_momentum, model_momentum = train_experiment(
    name="SGD + Momentum",
    optimizer=keras.optimizers.SGD(learning_rate=0.01, momentum=0.9),
    lr=0.01,
    momentum=0.9,
)

# Adam

In [None]:
# Adam at learning rate 0.001; `lr` is only recorded in the results log.
history_adam, model_adam = train_experiment(
    name="Adam",
    optimizer=keras.optimizers.Adam(learning_rate=0.001),
    lr=0.001,
)

# AdamW

In [None]:
# AdamW at learning rate 0.001 with weight decay 1e-4.
# `lr` and `weight_decay` are only recorded in the results log.
history_adamw, model_adamw = train_experiment(
    name="AdamW",
    optimizer=keras.optimizers.AdamW(learning_rate=0.001, weight_decay=1e-4),
    lr=0.001,
    weight_decay=1e-4,
)

# Results Table

In [None]:
# One row per logged experiment. The sorted frame is this cell's displayed
# value; `results_df` itself keeps the order in which the runs were logged.
results_df = pd.DataFrame(data=results_log)
results_df.sort_values("Test Acc", ascending=False)

# Convergence Visualization

In [None]:
def plot_history(history, title):
    """Plot per-epoch training and validation accuracy for one run.

    Args:
        history: Object whose ``history`` dict holds "accuracy" and
            "val_accuracy" sequences, as returned by ``train_experiment``.
        title: Title shown above the figure.
    """
    curves = history.history
    series = (("accuracy", "Train"), ("val_accuracy", "Validation"))

    plt.figure()
    for metric_key, legend_label in series:
        plt.plot(curves[metric_key], label=legend_label)
    plt.title(title)
    plt.xlabel("Epoch")
    plt.ylabel("Accuracy")
    plt.legend()
    plt.show()

# Plot every trained run. The SGD + Momentum run was previously trained but
# never plotted, so its convergence could not be compared with the others.
plot_history(history_sgd_01, "SGD (0.01)")
plot_history(history_momentum, "SGD + Momentum")
plot_history(history_adam, "Adam")
plot_history(history_adamw, "AdamW")

# Confusion Matrix (Best Model)

In [None]:
best_model = model_adamw  # Change if another performs better

# Collapse the one-hot targets and the model's per-class outputs to integer
# class ids so they can be compared directly.
y_true = y_test.argmax(axis=1)
y_pred = best_model.predict(x_test)
y_pred_classes = y_pred.argmax(axis=1)

cm = confusion_matrix(y_true, y_pred_classes)

# Annotated heatmap of the counts: rows are true classes, columns predicted.
plt.figure(figsize=(10, 8))
sns.heatmap(
    cm,
    annot=True,
    fmt="d",
    xticklabels=class_names,
    yticklabels=class_names,
)
plt.xlabel("Predicted")
plt.ylabel("True")
plt.show()

# Classification Report

In [None]:
# Per-class precision/recall/F1 for the predictions computed in the
# confusion-matrix cell, labelled with the class names.
print(
    classification_report(
        y_true,
        y_pred_classes,
        target_names=class_names,
    )
)

## Final Observations

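- Learning rate strongly influences SGD stability and convergence behavior (note: only lr=0.01 is run in this notebook, so a learning-rate sweep is needed to demonstrate this here).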
- Momentum reduces oscillations and improves convergence stability.
- Adam accelerates early training but did not yield the best generalization in this setup.
- AdamW achieved the strongest test performance through decoupled weight decay.
- Optimization configuration (learning rate, momentum, weight decay) significantly impacts performance alongside architectural design.

### Conclusion

Effective generalization depends not only on model capacity but on disciplined optimization strategy and controlled experimentation.