In [None]:
# 22. Build a feedforward neural network  (NumPy only) for digit classification (MNIST).
import os
import gzip
import struct
import urllib.request
import numpy as np
from time import time

# ---------- HYPERPARAMS ----------
# Widths of the hidden layers, in order; passed to FeedForwardNN as `hidden_sizes`.
HIDDEN_SIZES = [256, 128]
# Step size handed to train() as `lr`.
LEARNING_RATE = 0.1
MOMENTUM = 0.9  # set 0.0 to disable
# Number of full passes over the training split.
EPOCHS = 10
# Samples per mini-batch.
BATCH_SIZE = 128
# Seed for NumPy's global RNG (weight init, shuffling, train/val split).
SEED = 42
WEIGHT_SCALE = None  # if None, use Xavier/Glorot init
# Fraction of the shuffled training set held out for validation.
VALIDATION_SPLIT = 0.1
# Default file the trained weights are written to by FeedForwardNN.save().
SAVE_PATH = 'mnist_model.npz'
# Directory where the gzip-compressed MNIST archives are cached.
DOWNLOAD_DIR = 'mnist_data'

# Seed the global NumPy RNG once, before any random draw below.
np.random.seed(SEED)

# ---------- MNIST download + parsing (IDX format) ----------
# Maps a logical dataset name to (local file name, download URL).
MNIST_FILES = {
    'train_images': ('train-images-idx3-ubyte.gz',
                     'https://ossci-datasets.s3.amazonaws.com/mnist/train-images-idx3-ubyte.gz'),
    'train_labels': ('train-labels-idx1-ubyte.gz',
                     'https://ossci-datasets.s3.amazonaws.com/mnist/train-labels-idx1-ubyte.gz'),
    'test_images': ('t10k-images-idx3-ubyte.gz',
                    'https://ossci-datasets.s3.amazonaws.com/mnist/t10k-images-idx3-ubyte.gz'),
    'test_labels': ('t10k-labels-idx1-ubyte.gz',
                    'https://ossci-datasets.s3.amazonaws.com/mnist/t10k-labels-idx1-ubyte.gz')
}


def ensure_mnist(download_dir=DOWNLOAD_DIR):
    """Make sure every archive listed in ``MNIST_FILES`` exists locally.

    Missing files are fetched with ``urllib.request.urlretrieve``. Each
    download is written to ``<name>.part`` and renamed into place only once
    it has completed, so an interrupted download is never mistaken for a
    finished archive on the next run.

    Args:
        download_dir: Directory that holds the ``.gz`` archives; created if
            it does not exist.
    """
    os.makedirs(download_dir, exist_ok=True)
    for fname, url in MNIST_FILES.values():
        path = os.path.join(download_dir, fname)
        if os.path.exists(path):
            continue
        print(f'Downloading {fname}...')
        partial_path = path + '.part'
        try:
            urllib.request.urlretrieve(url, partial_path)
            os.replace(partial_path, path)
        finally:
            # After a successful rename the partial file is gone; this only
            # cleans up when the download or the rename failed.
            if os.path.exists(partial_path):
                os.remove(partial_path)
        print('Done')
    print('All files present.')


def parse_idx_images(gz_path):
    """Read a gzip-compressed IDX3 image file into a float32 matrix.

    Args:
        gz_path: Path to the gzip archive containing the IDX3 image data.

    Returns:
        Array of shape ``(num, rows * cols)`` and dtype float32, with pixel
        bytes scaled from ``[0, 255]`` to ``[0, 1]``.

    Raises:
        ValueError: If the header is truncated, the magic number is not 2051
            (IDX3 unsigned-byte images), or the payload size does not match
            the dimensions announced in the header.
    """
    with gzip.open(gz_path, 'rb') as f:
        header = f.read(16)
        if len(header) != 16:
            raise ValueError(f'{gz_path}: truncated IDX image header')
        # Big-endian: magic, image count, rows, columns.
        magic, num, rows, cols = struct.unpack('>IIII', header)
        if magic != 2051:
            raise ValueError(
                f'{gz_path}: bad magic number {magic}, expected 2051 for IDX images')
        data = np.frombuffer(f.read(), dtype=np.uint8)

    if data.size != num * rows * cols:
        raise ValueError(
            f'{gz_path}: expected {num * rows * cols} pixel bytes, found {data.size}')
    return data.reshape(num, rows * cols).astype(np.float32) / 255.0


def parse_idx_labels(gz_path):
    """Read a gzip-compressed IDX1 label file into an int64 vector.

    Args:
        gz_path: Path to the gzip archive containing the IDX1 label data.

    Returns:
        1-D int64 array with one label per entry.

    Raises:
        ValueError: If the header is truncated, the magic number is not 2049
            (IDX1 unsigned-byte labels), or the number of label bytes differs
            from the count announced in the header.
    """
    with gzip.open(gz_path, 'rb') as f:
        header = f.read(8)
        if len(header) != 8:
            raise ValueError(f'{gz_path}: truncated IDX label header')
        # Big-endian: magic, label count.
        magic, num = struct.unpack('>II', header)
        if magic != 2049:
            raise ValueError(
                f'{gz_path}: bad magic number {magic}, expected 2049 for IDX labels')
        data = np.frombuffer(f.read(), dtype=np.uint8)

    if data.size != num:
        raise ValueError(f'{gz_path}: expected {num} labels, found {data.size}')
    # int64 so the labels can be used directly as NumPy indices.
    return data.astype(np.int64)


def load_mnist(download_dir=DOWNLOAD_DIR):
    """Download MNIST if needed and return it as NumPy arrays.

    Args:
        download_dir: Directory where the archives are cached.

    Returns:
        Tuple ``(train_images, train_labels, test_images, test_labels)`` as
        produced by ``parse_idx_images`` / ``parse_idx_labels``.
    """
    ensure_mnist(download_dir)

    def archive(key):
        # Local path of the archive registered under `key` in MNIST_FILES.
        return os.path.join(download_dir, MNIST_FILES[key][0])

    train_images = parse_idx_images(archive('train_images'))
    train_labels = parse_idx_labels(archive('train_labels'))
    test_images = parse_idx_images(archive('test_images'))
    test_labels = parse_idx_labels(archive('test_labels'))
    return train_images, train_labels, test_images, test_labels


# ---------- Utilities ----------

def one_hot(labels, num_classes=10):
    """Encode integer class labels as one-hot float32 rows.

    Args:
        labels: 1-D integer array of class indices.
        num_classes: Width of each encoded row.

    Returns:
        Array of shape ``(len(labels), num_classes)`` with a single 1.0 per row.
    """
    n_samples = labels.shape[0]
    encoded = np.zeros((n_samples, num_classes), dtype=np.float32)
    row_ids = np.arange(n_samples)
    encoded[row_ids, labels] = 1.0
    return encoded


def accuracy(pred_probs, labels):
    """Return the fraction of rows whose arg-max class equals the label.

    Args:
        pred_probs: Array of per-class scores, one row per sample.
        labels: Integer class labels, one per sample.
    """
    predicted = np.argmax(pred_probs, axis=1)
    is_correct = predicted == labels
    return np.mean(is_correct)


# Activation functions and derivatives

def relu(x):
    """Element-wise rectified linear unit: negative entries become 0."""
    rectified = np.maximum(0, x)
    return rectified


def relu_grad(x):
    """Derivative of ReLU at ``x``: 1 where ``x > 0``, else 0, in ``x``'s dtype."""
    is_positive = x > 0
    return is_positive.astype(x.dtype)


def softmax_stable(x):
    """Row-wise softmax of a ``(N, C)`` array of logits.

    The row maximum is subtracted before exponentiating so large logits do
    not overflow.
    """
    shifted = x - np.max(x, axis=1, keepdims=True)
    exponentials = np.exp(shifted)
    row_totals = np.sum(exponentials, axis=1, keepdims=True)
    return exponentials / row_totals


# Loss (cross-entropy) and gradient w.r.t. logits (for softmax+CE)

def cross_entropy_loss(probs, one_hot_targets):
    """Mean cross-entropy between predicted probabilities and one-hot targets.

    Args:
        probs: ``(N, C)`` array that has already been passed through softmax.
        one_hot_targets: ``(N, C)`` one-hot target array.

    Returns:
        Scalar loss averaged over the ``N`` rows.
    """
    n_rows = probs.shape[0]
    # Clip away exact zeros so the logarithm stays finite.
    safe_probs = np.clip(probs, 1e-12, 1.0)
    log_likelihood = np.sum(one_hot_targets * np.log(safe_probs))
    return -log_likelihood / n_rows


def grad_softmax_cross_entropy(probs, one_hot_targets):
    """Gradient of the mean softmax cross-entropy loss w.r.t. the logits.

    With ``probs = softmax(z)`` the gradient is ``(probs - targets) / N``,
    where ``N`` is the number of rows in ``probs``.
    """
    n_rows = probs.shape[0]
    residual = probs - one_hot_targets
    return residual / n_rows


# ---------- Neural network class (NumPy only) ----------

class FeedForwardNN:
    """Fully connected network with ReLU hidden layers and a linear logit output.

    Parameters live in two parallel lists, ``W`` (weights, ``(fan_in, fan_out)``)
    and ``b`` (biases, ``(1, fan_out)``); ``vW`` / ``vb`` hold the momentum
    velocities and ``num_layers`` is the number of weight matrices.
    """

    def __init__(self, input_dim, hidden_sizes, output_dim, weight_scale=None):
        """Create and initialise the layers.

        Args:
            input_dim: Number of input features.
            hidden_sizes: Sequence (list or tuple) of hidden layer widths.
            output_dim: Number of output logits.
            weight_scale: If ``None``, weights use Xavier/Glorot uniform
                initialisation; otherwise they are standard-normal draws
                multiplied by this value. Biases always start at zero.
        """
        # list(...) so a tuple of hidden sizes works as well as a list.
        sizes = [input_dim] + list(hidden_sizes) + [output_dim]
        self.num_layers = len(sizes) - 1
        self.W = []
        self.b = []
        for fan_in, fan_out in zip(sizes[:-1], sizes[1:]):
            if weight_scale is None:
                # Xavier/Glorot uniform
                limit = np.sqrt(6.0 / (fan_in + fan_out))
                W = np.random.uniform(-limit, limit, size=(fan_in, fan_out)).astype(np.float32)
            else:
                W = np.random.randn(fan_in, fan_out).astype(np.float32) * weight_scale
            self.W.append(W)
            self.b.append(np.zeros((1, fan_out), dtype=np.float32))

        self._reset_velocity()

    def _reset_velocity(self):
        """Zero the momentum buffers so they match the current parameters."""
        self.vW = [np.zeros_like(w) for w in self.W]
        self.vb = [np.zeros_like(b) for b in self.b]

    def forward(self, X):
        """Run a forward pass.

        Returns:
            ``(logits, activations, preacts)`` where ``activations[0]`` is
            ``X``, ``activations[i + 1]`` is the output of layer ``i`` (the
            last entry is the raw logits) and ``preacts[i]`` is the
            pre-activation of layer ``i``.
        """
        activations = [X]
        preacts = []
        a = X
        for i in range(self.num_layers - 1):  # hidden layers
            z = a.dot(self.W[i]) + self.b[i]
            preacts.append(z)
            a = relu(z)
            activations.append(a)
        # final layer (logits): no non-linearity here, softmax is applied by callers
        z = a.dot(self.W[-1]) + self.b[-1]
        preacts.append(z)
        activations.append(z)
        return z, activations, preacts

    def predict_proba(self, X):
        """Return softmax class probabilities for ``X``."""
        logits, _, _ = self.forward(X)
        return softmax_stable(logits)

    def save(self, path=SAVE_PATH):
        """Write all weights, then all biases, to ``path`` with ``np.savez``.

        Arrays are stored positionally (``arr_0`` ... ``arr_{2L-1}``): the
        ``L`` weight matrices first, followed by the ``L`` bias rows.
        """
        np.savez(path, *self.W, *self.b)
        print(f'Saved model to {path}')

    def load(self, path):
        """Replace the parameters with those stored by :meth:`save`.

        The layer count is taken from the file, so a checkpoint with a
        different depth than this instance loads correctly. Momentum buffers
        are reset to zero.

        Raises:
            ValueError: If the archive is empty or holds an odd number of
                arrays (it must contain one bias per weight matrix).
        """
        with np.load(path) as data:
            names = list(data.files)
            positional = [f'arr_{i}' for i in range(len(names))]
            # Files written by save() use positional keys; read them in
            # numeric order rather than relying on archive iteration order.
            if set(names) == set(positional):
                names = positional
            arrs = [data[name] for name in names]

        if not arrs or len(arrs) % 2 != 0:
            raise ValueError(
                f'{path}: expected an even, non-zero number of arrays, found {len(arrs)}')

        half = len(arrs) // 2
        self.W = arrs[:half]
        self.b = arrs[half:]
        # Keep forward/backward/update loops in sync with the loaded depth.
        self.num_layers = half
        self._reset_velocity()

    def update_params(self, grads_W, grads_b, lr, momentum):
        """Apply one SGD step, with classical momentum when ``momentum > 0``."""
        for i in range(self.num_layers):
            if momentum > 0:
                self.vW[i] = momentum * self.vW[i] - lr * grads_W[i]
                self.vb[i] = momentum * self.vb[i] - lr * grads_b[i]
                self.W[i] += self.vW[i]
                self.b[i] += self.vb[i]
            else:
                self.W[i] -= lr * grads_W[i]
                self.b[i] -= lr * grads_b[i]

    def backward(self, activations, preacts, logits, onehot_targets):
        """Back-propagate the mean softmax cross-entropy loss.

        Args:
            activations: Layer outputs as returned by :meth:`forward`.
            preacts: Pre-activations as returned by :meth:`forward`.
            logits: Output of the final layer.
            onehot_targets: One-hot targets matching ``logits`` in shape.

        Returns:
            ``(grads_W, grads_b)``, lists aligned with ``self.W`` / ``self.b``.
        """
        grads_W = [None] * self.num_layers
        grads_b = [None] * self.num_layers

        # gradient at the logits, already divided by the batch size
        probs = softmax_stable(logits)
        delta = grad_softmax_cross_entropy(probs, onehot_targets)  # shape (N, C)

        # last layer: its input is the output of the last hidden layer
        a_prev = activations[-2]
        grads_W[-1] = a_prev.T.dot(delta)
        grads_b[-1] = np.sum(delta, axis=0, keepdims=True)

        # walk back through the hidden layers
        delta_prev = delta
        for l in range(self.num_layers - 2, -1, -1):
            W_next = self.W[l + 1]
            delta = delta_prev.dot(W_next.T) * relu_grad(preacts[l])
            a_prev = activations[l]
            grads_W[l] = a_prev.T.dot(delta)
            grads_b[l] = np.sum(delta, axis=0, keepdims=True)
            delta_prev = delta

        return grads_W, grads_b


# ---------- Training loop ----------

def iterate_minibatches(X, y, batch_size, shuffle=True):
    """Yield aligned ``(X_batch, y_batch)`` slices covering the whole dataset.

    Args:
        X: Array whose first axis indexes samples.
        y: Array aligned with ``X`` along the first axis.
        batch_size: Maximum number of samples per batch; the last batch may
            be smaller.
        shuffle: When true, visit samples in a random order drawn from
            NumPy's global RNG.
    """
    n_samples = X.shape[0]
    order = np.arange(n_samples)
    if shuffle:
        np.random.shuffle(order)
    for lo in range(0, n_samples, batch_size):
        # Slicing past the end simply yields the shorter final batch.
        chunk = order[lo:lo + batch_size]
        yield X[chunk], y[chunk]


def train(model, X_train, y_train_onehot, y_train_labels, X_val, y_val_labels,
          epochs, batch_size, lr, momentum):
    """Train ``model`` with mini-batch SGD and print metrics after every epoch.

    Args:
        model: Object exposing ``forward``, ``backward``, ``update_params``
            and ``predict_proba`` (e.g. ``FeedForwardNN``).
        X_train: Training inputs, one row per sample.
        y_train_onehot: One-hot training targets used for the loss.
        y_train_labels: Integer training labels used for accuracy.
        X_val: Validation inputs.
        y_val_labels: Integer validation labels.
        epochs: Number of passes over the training data.
        batch_size: Mini-batch size.
        lr: Learning rate.
        momentum: Momentum coefficient forwarded to ``update_params``.
    """
    for epoch in range(1, epochs + 1):
        t0 = time()
        epoch_loss = 0.0
        batches = 0
        for X_batch, y_batch in iterate_minibatches(X_train, y_train_onehot, batch_size):
            logits, activations, preacts = model.forward(X_batch)
            loss = cross_entropy_loss(softmax_stable(logits), y_batch)
            grads_W, grads_b = model.backward(activations, preacts, logits, y_batch)
            # FeedForwardNN.backward starts from grad_softmax_cross_entropy,
            # which divides by the batch size, so these are gradients of the
            # mean batch loss and need no further rescaling.
            model.update_params(grads_W, grads_b, lr, momentum)

            epoch_loss += loss
            batches += 1

        # Timing covers the parameter updates only, not the evaluation below.
        t1 = time()
        train_acc = accuracy(model.predict_proba(X_train), y_train_labels)
        val_acc = accuracy(model.predict_proba(X_val), y_val_labels)

        # An empty training set yields no batches; report NaN instead of
        # dividing by zero.
        mean_loss = epoch_loss / batches if batches else float('nan')
        print(f'Epoch {epoch}/{epochs} - loss: {mean_loss:.4f} - '
              f'train acc: {train_acc*100:.2f}% - val acc: {val_acc*100:.2f}% - time: {t1-t0:.2f}s')


# ---------- Main ----------

if __name__ == '__main__':
    # End-to-end run: load data, hold out a validation split, train, evaluate, save.
    print('Loading MNIST...')
    X_train, y_train, X_test, y_test = load_mnist()

    # Shuffle once, then take the first VALIDATION_SPLIT fraction as validation data.
    perm = np.random.permutation(X_train.shape[0])
    X_train, y_train = X_train[perm], y_train[perm]

    val_size = int(len(X_train) * VALIDATION_SPLIT)
    X_val, X_train = X_train[:val_size], X_train[val_size:]
    y_val, y_train = y_train[:val_size], y_train[val_size:]

    print('Shapes:')
    print(' X_train', X_train.shape)
    print(' y_train', y_train.shape)
    print(' X_val', X_val.shape)
    print(' X_test', X_test.shape)
    print(' y_test', y_test.shape)

    # One-hot targets for every split (only the training one feeds the loss).
    y_train_onehot, y_val_onehot, y_test_onehot = (
        one_hot(split_labels, 10) for split_labels in (y_train, y_val, y_test)
    )

    model = FeedForwardNN(
        input_dim=784,
        hidden_sizes=HIDDEN_SIZES,
        output_dim=10,
        weight_scale=WEIGHT_SCALE,
    )

    print('Starting training...')
    train(
        model,
        X_train, y_train_onehot, y_train,
        X_val, y_val,
        epochs=EPOCHS,
        batch_size=BATCH_SIZE,
        lr=LEARNING_RATE,
        momentum=MOMENTUM,
    )

    # Held-out test accuracy of the final weights.
    test_probs = model.predict_proba(X_test)
    test_acc = accuracy(test_probs, y_test)
    print(f'Final test accuracy: {test_acc*100:.2f}%')

    # Persist the trained parameters.
    model.save(SAVE_PATH)

    print('Done.')


Loading MNIST...
Downloading train-images-idx3-ubyte.gz...
Done
Downloading train-labels-idx1-ubyte.gz...
Done
Downloading t10k-images-idx3-ubyte.gz...
Done
Downloading t10k-labels-idx1-ubyte.gz...
Done
All files present.
Shapes:
 X_train (54000, 784)
 y_train (54000,)
 X_val (6000, 784)
 X_test (10000, 784)
 y_test (10000,)
Starting training...
Epoch 1/10 - loss: 0.2684 - train acc: 96.82% - val acc: 96.17% - time: 4.17s
Epoch 2/10 - loss: 0.0969 - train acc: 97.42% - val acc: 96.45% - time: 1.65s
Epoch 3/10 - loss: 0.0685 - train acc: 98.51% - val acc: 97.48% - time: 2.38s
Epoch 4/10 - loss: 0.0502 - train acc: 98.97% - val acc: 97.65% - time: 1.86s
Epoch 5/10 - loss: 0.0390 - train acc: 99.25% - val acc: 98.00% - time: 3.88s
Epoch 6/10 - loss: 0.0278 - train acc: 98.60% - val acc: 97.33% - time: 1.76s
Epoch 7/10 - loss: 0.0220 - train acc: 99.63% - val acc: 98.13% - time: 1.88s
Epoch 8/10 - loss: 0.0187 - train acc: 99.58% - val acc: 98.12% - time: 1.78s
Epoch 9/10 - loss: 0.0137 - 

In [None]:
# 23. Implement a Convolutional Neural Network (CNN) using TensorFlow/Keras for CIFAR-10.
"""
CIFAR-10 CNN with TensorFlow / Keras

Usage:
    python cifar10_cnn_keras.py

Features:
 - Loads CIFAR-10 from tf.keras.datasets
 - Builds a moderately deep CNN with Conv-BN-ReLU blocks
 - Uses data augmentation (random flip, translation, rotation; a cutout helper is defined but not applied)
 - Uses callbacks: ModelCheckpoint, ReduceLROnPlateau, EarlyStopping
 - Saves the best model and prints test accuracy

Requirements:
 - TensorFlow 2.x (tested on 2.12+)

Tweak hyperparameters in the HYPERPARAMS block.
"""

import os
from datetime import datetime
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

# ---------- HYPERPARAMS ----------
# Samples per batch for training, validation and test evaluation.
BATCH_SIZE = 256
# Maximum number of training epochs (EarlyStopping may end training sooner).
EPOCHS = 20
NUM_CLASSES = 10
# Height, width, channels of one input image.
INPUT_SHAPE = (32, 32, 3)
# Initial Adam learning rate.
LEARNING_RATE = 1e-3
# Default for build_model's `weight_decay` argument, which build_model
# currently accepts but does not apply to any layer.
WEIGHT_DECAY = 1e-4
# Directory for the checkpoint and final model files.
MODEL_DIR = 'cifar10_cnn_model'
SEED = 42
# Toggles the random flip/translation/rotation layers defined below.
AUGMENTATION = True

# Seed both TensorFlow and NumPy before any random operation runs.
tf.random.set_seed(SEED)
np.random.seed(SEED)

os.makedirs(MODEL_DIR, exist_ok=True)

# ---------- Data loading and preprocessing ----------
(x_train, y_train), (x_test, y_test) = keras.datasets.cifar10.load_data()
# Normalize to [0,1]
x_train = x_train.astype('float32') / 255.0
x_test = x_test.astype('float32') / 255.0

# split off a validation set from train
val_fraction = 0.1
num_val = int(len(x_train) * val_fraction)
# Shuffle sample indices so the validation split is a random subset.
indices = np.arange(len(x_train))
np.random.shuffle(indices)
val_idx = indices[:num_val]
train_idx = indices[num_val:]

# Take the validation rows first: x_train / y_train are overwritten below.
x_val = x_train[val_idx]
y_val = y_train[val_idx]

x_train = x_train[train_idx]
y_train = y_train[train_idx]

print('Train / Val / Test shapes:', x_train.shape, x_val.shape, x_test.shape)

# One-hot encode labels
y_train_cat = keras.utils.to_categorical(y_train, NUM_CLASSES)
y_val_cat = keras.utils.to_categorical(y_val, NUM_CLASSES)
y_test_cat = keras.utils.to_categorical(y_test, NUM_CLASSES)

# ---------- Data augmentation pipeline (tf.keras.layers) ----------
# `data_augmentation` is always defined; with AUGMENTATION off it is an empty
# Sequential, so code that calls it does not need to branch.
if AUGMENTATION:
    data_augmentation = keras.Sequential([
        layers.RandomFlip('horizontal'),
        layers.RandomTranslation(0.1, 0.1),
        layers.RandomRotation(0.05),
    ], name='data_augmentation')
else:
    data_augmentation = keras.Sequential([], name='data_augmentation')

# Optional Cutout function
@tf.function
def random_cutout(images, mask_size=8):
    """Zero out one randomly placed rectangle in every image (Cutout).

    A point is drawn uniformly per image; the rectangle starts half a mask
    above/left of it (clipped to the image) and extends ``mask_size`` pixels
    down/right, clipped at the image border. The mask is built with
    broadcasting, so no per-sample loop is needed.

    Args:
        images: Batch of images laid out as ``[B, H, W, C]``.
        mask_size: Side length of the erased square in pixels.

    Returns:
        Tensor of the same shape and dtype as ``images`` with the selected
        region set to zero in all channels.
    """
    batch_size = tf.shape(images)[0]
    img_h = tf.shape(images)[1]
    img_w = tf.shape(images)[2]

    y = tf.random.uniform([batch_size], 0, img_h, dtype=tf.int32)
    x = tf.random.uniform([batch_size], 0, img_w, dtype=tf.int32)

    y1 = tf.clip_by_value(y - mask_size // 2, 0, img_h)
    x1 = tf.clip_by_value(x - mask_size // 2, 0, img_w)
    y2 = tf.clip_by_value(y1 + mask_size, 0, img_h)
    x2 = tf.clip_by_value(x1 + mask_size, 0, img_w)

    # Coordinate grids broadcast against the per-image bounds: [1, H, 1] / [1, 1, W].
    rows = tf.reshape(tf.range(img_h), [1, -1, 1])
    cols = tf.reshape(tf.range(img_w), [1, 1, -1])
    top = tf.reshape(y1, [-1, 1, 1])
    bottom = tf.reshape(y2, [-1, 1, 1])
    left = tf.reshape(x1, [-1, 1, 1])
    right = tf.reshape(x2, [-1, 1, 1])

    in_rows = tf.logical_and(rows >= top, rows < bottom)    # [B, H, 1]
    in_cols = tf.logical_and(cols >= left, cols < right)    # [B, 1, W]
    inside = tf.logical_and(in_rows, in_cols)               # [B, H, W]

    # 0 inside the rectangle, 1 elsewhere; trailing axis broadcasts over channels.
    masks = tf.cast(tf.logical_not(inside), images.dtype)[..., tf.newaxis]
    return images * masks

# ---------- Model definition ----------

def conv_block(x, filters, kernel_size=3, pool=True):
    """Apply two Conv-BatchNorm-ReLU stages, optionally followed by 2x2 max pooling.

    Args:
        x: Input Keras tensor.
        filters: Number of filters for both convolutions.
        kernel_size: Convolution kernel size.
        pool: When true, finish with ``MaxPooling2D(pool_size=2)``.

    Returns:
        The output Keras tensor.
    """
    for _ in range(2):
        # Bias is omitted because the following BatchNormalization adds its own offset.
        x = layers.Conv2D(filters, kernel_size, padding='same', use_bias=False)(x)
        x = layers.BatchNormalization()(x)
        x = layers.Activation('relu')(x)
    if not pool:
        return x
    return layers.MaxPooling2D(pool_size=2)(x)


def build_model(input_shape=INPUT_SHAPE, num_classes=NUM_CLASSES, weight_decay=WEIGHT_DECAY):
    """Assemble the CIFAR-10 CNN.

    Layout: ``data_augmentation`` -> Conv-BN-ReLU stem (64 filters) -> three
    pooled ``conv_block`` stages (64, 128, 256 filters) -> global average
    pooling -> Dense(256)-BN-ReLU -> Dropout(0.5) -> softmax classifier.

    Args:
        input_shape: Shape of one input image.
        num_classes: Number of softmax outputs.
        weight_decay: Accepted for API compatibility; it is not applied to
            any layer in this function.

    Returns:
        An uncompiled ``keras.Model`` named ``'cifar10_cnn'``.
    """
    inputs = keras.Input(shape=input_shape)

    # Augmentation is the first stage of the model itself.
    features = data_augmentation(inputs)

    # stem
    features = layers.Conv2D(64, 3, padding='same', use_bias=False)(features)
    features = layers.BatchNormalization()(features)
    features = layers.Activation('relu')(features)

    # conv blocks, each ending in 2x2 max pooling
    for width in (64, 128, 256):
        features = conv_block(features, width)

    # classification head
    features = layers.GlobalAveragePooling2D()(features)
    features = layers.Dense(256, use_bias=False)(features)
    features = layers.BatchNormalization()(features)
    features = layers.Activation('relu')(features)
    features = layers.Dropout(0.5)(features)

    outputs = layers.Dense(num_classes, activation='softmax')(features)

    return keras.Model(inputs=inputs, outputs=outputs, name='cifar10_cnn')

# Build model
model = build_model()
model.summary()

# ---------- Optimizer, loss, metrics ----------
optimizer = keras.optimizers.Adam(learning_rate=LEARNING_RATE)
# Labels are one-hot encoded, hence the non-sparse categorical loss.
loss = keras.losses.CategoricalCrossentropy()
metrics = ['accuracy']

model.compile(optimizer=optimizer, loss=loss, metrics=metrics)

# ---------- Callbacks ----------
# Timestamp keeps checkpoints from separate runs apart.
now = datetime.now().strftime('%Y%m%d-%H%M%S')
checkpoint_path = os.path.join(MODEL_DIR, f'best_model_{now}.h5')
callbacks = [
    keras.callbacks.ModelCheckpoint(checkpoint_path, monitor='val_accuracy', save_best_only=True, verbose=1),
    keras.callbacks.ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=5, verbose=1),
    keras.callbacks.EarlyStopping(monitor='val_loss', patience=12, verbose=1, restore_best_weights=True)
]

# ---------- Training ----------
# Use tf.data for performance
train_ds = tf.data.Dataset.from_tensor_slices((x_train, y_train_cat))
train_ds = train_ds.shuffle(10000, seed=SEED)
train_ds = train_ds.batch(BATCH_SIZE)
train_ds = train_ds.prefetch(tf.data.AUTOTUNE)

val_ds = tf.data.Dataset.from_tensor_slices((x_val, y_val_cat))
val_ds = val_ds.batch(BATCH_SIZE).prefetch(tf.data.AUTOTUNE)

# Augmentation is deliberately NOT mapped onto train_ds: build_model() already
# applies `data_augmentation` as the first stage of the model, and Keras random
# preprocessing layers are only active during training. Mapping it here as well
# would distort every training image twice.

history = model.fit(
    train_ds,
    epochs=EPOCHS,
    validation_data=val_ds,
    callbacks=callbacks,
)

# ---------- Evaluation ----------
print('\nEvaluating on test set...')
# load best model if checkpoint saved
if os.path.exists(checkpoint_path):
    print('Loading best model from checkpoint...')
    model = keras.models.load_model(checkpoint_path)

test_loss, test_acc = model.evaluate(x_test, y_test_cat, batch_size=BATCH_SIZE)
print(f'Test loss: {test_loss:.4f} - Test accuracy: {test_acc*100:.2f}%')

# Save final model
final_path = os.path.join(MODEL_DIR, f'final_model_{now}.h5')
model.save(final_path)
print(f'Final model saved to {final_path}')

print('Done.')


Downloading data from https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz
[1m170498071/170498071[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m104s[0m 1us/step
Train / Val / Test shapes: (45000, 32, 32, 3) (5000, 32, 32, 3) (10000, 32, 32, 3)


Epoch 1/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 124ms/step - accuracy: 0.3152 - loss: 1.9866
Epoch 1: val_accuracy improved from -inf to 0.14280, saving model to cifar10_cnn_model/best_model_20251117-135615.h5




[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m35s[0m 132ms/step - accuracy: 0.3156 - loss: 1.9852 - val_accuracy: 0.1428 - val_loss: 3.1388 - learning_rate: 0.0010
Epoch 2/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 119ms/step - accuracy: 0.5042 - loss: 1.3870
Epoch 2: val_accuracy improved from 0.14280 to 0.29240, saving model to cifar10_cnn_model/best_model_20251117-135615.h5




[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m22s[0m 123ms/step - accuracy: 0.5043 - loss: 1.3866 - val_accuracy: 0.2924 - val_loss: 2.3289 - learning_rate: 0.0010
Epoch 3/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 119ms/step - accuracy: 0.5830 - loss: 1.1699
Epoch 3: val_accuracy improved from 0.29240 to 0.37460, saving model to cifar10_cnn_model/best_model_20251117-135615.h5




[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m22s[0m 123ms/step - accuracy: 0.5831 - loss: 1.1697 - val_accuracy: 0.3746 - val_loss: 2.4442 - learning_rate: 0.0010
Epoch 4/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 120ms/step - accuracy: 0.6451 - loss: 1.0152
Epoch 4: val_accuracy improved from 0.37460 to 0.49460, saving model to cifar10_cnn_model/best_model_20251117-135615.h5




[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 124ms/step - accuracy: 0.6452 - loss: 1.0151 - val_accuracy: 0.4946 - val_loss: 1.7309 - learning_rate: 0.0010
Epoch 5/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 120ms/step - accuracy: 0.6745 - loss: 0.9272
Epoch 5: val_accuracy improved from 0.49460 to 0.60220, saving model to cifar10_cnn_model/best_model_20251117-135615.h5




[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m22s[0m 124ms/step - accuracy: 0.6746 - loss: 0.9271 - val_accuracy: 0.6022 - val_loss: 1.1504 - learning_rate: 0.0010
Epoch 6/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 120ms/step - accuracy: 0.7114 - loss: 0.8350
Epoch 6: val_accuracy did not improve from 0.60220
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m22s[0m 124ms/step - accuracy: 0.7114 - loss: 0.8350 - val_accuracy: 0.5452 - val_loss: 1.5890 - learning_rate: 0.0010
Epoch 7/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 120ms/step - accuracy: 0.7275 - loss: 0.7866
Epoch 7: val_accuracy did not improve from 0.60220
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m22s[0m 124ms/step - accuracy: 0.7275 - loss: 0.7866 - val_accuracy: 0.5544 - val_loss: 1.7819 - learning_rate: 0.0010
Epoch 8/20
[1m176/176[0m [32m━━━━━━━━━━━



[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m22s[0m 126ms/step - accuracy: 0.7624 - loss: 0.6926 - val_accuracy: 0.6292 - val_loss: 1.2730 - learning_rate: 0.0010
Epoch 10/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 122ms/step - accuracy: 0.7667 - loss: 0.6770
Epoch 10: val_accuracy improved from 0.62920 to 0.74540, saving model to cifar10_cnn_model/best_model_20251117-135615.h5




[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m22s[0m 126ms/step - accuracy: 0.7667 - loss: 0.6769 - val_accuracy: 0.7454 - val_loss: 0.7500 - learning_rate: 0.0010
Epoch 11/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 123ms/step - accuracy: 0.7841 - loss: 0.6333
Epoch 11: val_accuracy did not improve from 0.74540
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m22s[0m 126ms/step - accuracy: 0.7841 - loss: 0.6333 - val_accuracy: 0.6510 - val_loss: 1.0869 - learning_rate: 0.0010
Epoch 12/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 123ms/step - accuracy: 0.7887 - loss: 0.6061
Epoch 12: val_accuracy did not improve from 0.74540
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m22s[0m 126ms/step - accuracy: 0.7887 - loss: 0.6060 - val_accuracy: 0.7274 - val_loss: 0.7609 - learning_rate: 0.0010
Epoch 13/20
[1m176/176[0m [32m━━━━━━



[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m23s[0m 127ms/step - accuracy: 0.8164 - loss: 0.5405 - val_accuracy: 0.7498 - val_loss: 0.7480 - learning_rate: 0.0010
Epoch 16/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 123ms/step - accuracy: 0.8212 - loss: 0.5201
Epoch 16: val_accuracy did not improve from 0.74980
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m22s[0m 126ms/step - accuracy: 0.8212 - loss: 0.5202 - val_accuracy: 0.6910 - val_loss: 0.9481 - learning_rate: 0.0010
Epoch 17/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 123ms/step - accuracy: 0.8245 - loss: 0.5005
Epoch 17: val_accuracy did not improve from 0.74980
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m22s[0m 127ms/step - accuracy: 0.8245 - loss: 0.5005 - val_accuracy: 0.7456 - val_loss: 0.7360 - learning_rate: 0.0010
Epoch 18/20
[1m176/176[0m [32m━━━━━━



[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m23s[0m 127ms/step - accuracy: 0.8364 - loss: 0.4736 - val_accuracy: 0.7550 - val_loss: 0.7332 - learning_rate: 0.0010
Epoch 20/20
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 123ms/step - accuracy: 0.8426 - loss: 0.4525
Epoch 20: val_accuracy did not improve from 0.75500
[1m176/176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m22s[0m 126ms/step - accuracy: 0.8426 - loss: 0.4526 - val_accuracy: 0.6826 - val_loss: 1.0697 - learning_rate: 0.0010
Restoring model weights from the end of the best epoch: 19.





Evaluating on test set...
Loading best model from checkpoint...
[1m40/40[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 34ms/step - accuracy: 0.7569 - loss: 0.7572




Test loss: 0.7499 - Test accuracy: 75.93%
Final model saved to cifar10_cnn_model/final_model_20251117-135615.h5
Done.


### Why is CNN training taking so long?

Training deep Convolutional Neural Networks (CNNs) like the one in the previous cell is computationally intensive. Here are the primary reasons and how to address them:

1.  **Computational Power**: Deep learning models require significant processing power. If you are running on a CPU runtime, it will be much slower than on a GPU.
    *   **Solution**: Ensure you are using a **GPU runtime** in Colab. Go to `Runtime` > `Change runtime type` and select `GPU` as the hardware accelerator.

2.  **Number of Epochs**: The current configuration trains for `20` epochs (`EPOCHS = 20`). Each epoch is one full pass over the training set, with a forward and a backward pass for every batch.
    *   **Solution**: For quicker experimentation, you can reduce the `EPOCHS` hyperparameter (e.g., to `5` or `10`) in the `HYPERPARAMS` section of the code.

3.  **Data Augmentation**: Data augmentation techniques (like `RandomFlip`, `RandomTranslation`, `RandomRotation`) are applied to each image during training. While beneficial for model generalization, they add processing time per batch.

By switching to a GPU runtime, you should see a drastic improvement in training speed. Reducing the number of epochs is also a good temporary measure for faster feedback.

In [None]:
# 24. Train an RNN (LSTM) for next-word prediction on a text dataset.
"""
Next-word prediction with LSTM (TensorFlow / Keras)

- Downloads a small text dataset (Shakespeare) and trains a word-level LSTM that
  predicts the next word given a short context (seq_len words -> next word).
- Saves the trained model and the tokenizer for later inference.
- Includes a `generate_text` helper that samples next words with temperature.

Run:
    python nextword_lstm_keras.py

Requirements:
    tensorflow (2.x), numpy

You can change DATA_URL to point to your own .txt file.
"""

import os
import re
import json
import pickle
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

# ----------------- HYPERPARAMS -----------------
# Plain-text corpus to download; fetched with keras.utils.get_file below.
DATA_URL = 'https://storage.googleapis.com/download.tensorflow.org/data/shakespeare.txt'
SEQ_LEN = 6          # number of input words (context length)
BATCH_SIZE = 128
# Size of each word embedding vector.
EMBED_DIM = 128
# Number of units in the LSTM layer.
LSTM_UNITS = 256
EPOCHS = 15
# Shuffle buffer size for the training tf.data pipeline.
BUFFER_SIZE = 10000
# Initial Adam learning rate.
LEARNING_RATE = 1e-3
# Fraction of the (shuffled) examples held out for validation.
VALIDATION_SPLIT = 0.1
# Directory for the checkpoint, final model and pickled tokenizer.
MODEL_DIR = 'lstm_nextword_model'
TOKENIZER_PATH = os.path.join(MODEL_DIR, 'tokenizer.pkl')
SEED = 42

os.makedirs(MODEL_DIR, exist_ok=True)
# Seed NumPy and TensorFlow before any random operation runs.
np.random.seed(SEED)
tf.random.set_seed(SEED)

# ----------------- Download + read dataset -----------------
print('Downloading dataset...')
path = keras.utils.get_file('shakespeare.txt', DATA_URL)
with open(path, 'r', encoding='utf-8') as f:
    text = f.read()
print(f'Dataset length (chars): {len(text)}')

# ----------------- Preprocessing (word-level) -----------------
# Basic cleanup: lowercase and collapse whitespace runs into single spaces.
# Punctuation is NOT split off: tokens are whitespace-delimited, so "be," and
# "be" end up as two different vocabulary entries.
text = text.lower()
text = re.sub(r"\s+", ' ', text)

# Use Keras Tokenizer for word-level tokenization
print('Tokenizing (word-level)...')
from tensorflow.keras.preprocessing.text import Tokenizer

tokenizer = Tokenizer(filters='')  # empty filter: no characters are stripped
# fit on texts split by whitespace to ensure word-level tokens
tokens = text.split(' ')
# join with single spaces to keep consistent formatting for tokenizer
tokenizer.fit_on_texts([' '.join(tokens)])

# +1 because word indices start at 1; index 0 is left free for padding.
vocab_size = len(tokenizer.word_index) + 1
print('Vocab size:', vocab_size)

# Convert full text to sequence of integer tokens
seq = tokenizer.texts_to_sequences([' '.join(tokens)])[0]
seq = np.array(seq, dtype=np.int32)

# Build dataset of (seq_len -> next_word) pairs using a sliding window.
# Row i of `inputs` is seq[i:i+SEQ_LEN] and targets[i] is seq[i+SEQ_LEN].
if len(seq) > SEQ_LEN:
    windows = np.lib.stride_tricks.sliding_window_view(seq, SEQ_LEN)
    # The last window has no following word, so it is dropped. The view is
    # read-only and shares memory with `seq`; copy it into its own array.
    inputs = np.ascontiguousarray(windows[:-1])
    targets = seq[SEQ_LEN:].copy()
else:
    # Corpus too short to form a single (context, next word) pair.
    inputs = np.empty((0, SEQ_LEN), dtype=np.int32)
    targets = np.empty((0,), dtype=np.int32)
print('Total examples:', inputs.shape[0])

# Shuffle and split
# NOTE(review): consecutive windows overlap in SEQ_LEN - 1 tokens, so a random
# split puts near-identical contexts in both train and validation; validation
# metrics are therefore optimistic.
indices = np.arange(inputs.shape[0])
np.random.shuffle(indices)
inputs = inputs[indices]
targets = targets[indices]

val_size = int(inputs.shape[0] * VALIDATION_SPLIT)
X_val = inputs[:val_size]
y_val = targets[:val_size]
X_train = inputs[val_size:]
y_train = targets[val_size:]
print('Train / Val shapes:', X_train.shape, X_val.shape)

# ----------------- tf.data pipelines -----------------
train_ds = tf.data.Dataset.from_tensor_slices((X_train, y_train))
train_ds = train_ds.shuffle(BUFFER_SIZE, seed=SEED).batch(BATCH_SIZE).prefetch(tf.data.AUTOTUNE)
val_ds = tf.data.Dataset.from_tensor_slices((X_val, y_val)).batch(BATCH_SIZE).prefetch(tf.data.AUTOTUNE)

# ----------------- Model -----------------
print('Building model...')
inputs_layer = keras.Input(shape=(SEQ_LEN,), dtype='int32')
# embedding maps token ids -> vectors; the sequence length comes from the
# Input shape above, so the deprecated `input_length` argument is not passed
x = layers.Embedding(input_dim=vocab_size, output_dim=EMBED_DIM)(inputs_layer)
# you can stack LSTMs or use Bidirectional if desired
x = layers.LSTM(LSTM_UNITS, return_sequences=False)(x)
x = layers.Dropout(0.3)(x)
# one probability per vocabulary entry for the next word
outputs = layers.Dense(vocab_size, activation='softmax')(x)

model = keras.Model(inputs=inputs_layer, outputs=outputs)
model.summary()

optimizer = keras.optimizers.Adam(learning_rate=LEARNING_RATE)
# targets are integer word ids, hence the sparse loss
loss = keras.losses.SparseCategoricalCrossentropy()
model.compile(optimizer=optimizer, loss=loss, metrics=['accuracy'])

# ----------------- Callbacks -----------------
checkpoint_path = os.path.join(MODEL_DIR, 'best_lstm_nextword.h5')
callbacks = [
    keras.callbacks.ModelCheckpoint(checkpoint_path, save_best_only=True, monitor='val_loss', verbose=1),
    keras.callbacks.ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=3, verbose=1)
]

# ----------------- Train -----------------
print('Starting training...')
history = model.fit(train_ds, validation_data=val_ds, epochs=EPOCHS, callbacks=callbacks)

# Load best model (lowest val_loss) so inference and the final save use it
if os.path.exists(checkpoint_path):
    model = keras.models.load_model(checkpoint_path)

# Save tokenizer
with open(TOKENIZER_PATH, 'wb') as f:
    pickle.dump(tokenizer, f)
print('Saved tokenizer to', TOKENIZER_PATH)

# Save final model
final_model_path = os.path.join(MODEL_DIR, 'final_model.h5')
model.save(final_model_path)
print('Saved final model to', final_model_path)

# ----------------- Inference helper: generate text -----------------

def sample_from_probs(probs, temperature=1.0):
    """Pick an index from a probability vector, with temperature scaling.

    Args:
        probs: 1-D sequence of probabilities over the vocabulary.
        temperature: Values above 1 flatten the distribution, values between
            0 and 1 sharpen it; ``<= 0`` returns the arg-max deterministically.

    Returns:
        The selected index.
    """
    dist = np.asarray(probs).astype('float64')
    if temperature <= 0:
        return np.argmax(dist)
    # Rescale in log space; the small epsilon avoids log(0).
    scaled_logits = np.log(dist + 1e-12) / temperature
    weights = np.exp(scaled_logits - np.max(scaled_logits))
    dist = weights / np.sum(weights)
    return np.random.choice(len(dist), p=dist)


def generate_text(seed_text, num_words=20, temperature=1.0):
    """Extend ``seed_text`` by sampling up to ``num_words`` words from the model.

    seed_text: whitespace-separated words; may be shorter than SEQ_LEN, in
        which case the model input is left-padded with id 0.
    num_words: maximum number of words to append.
    temperature: passed to ``sample_from_probs`` for every sampled word.
    Returns the lower-cased seed followed by the generated words, joined by
    single spaces. Generation stops early if the sampled id has no entry in
    the tokenizer vocabulary (e.g. the padding id 0).
    """
    # NOTE: pickle.load can execute arbitrary code; only point TOKENIZER_PATH
    # at a file written by this script.
    with open(TOKENIZER_PATH, 'rb') as f:
        tk = pickle.load(f)

    # id -> word lookup (tokenizer.word_index is word -> id). It does not
    # change while generating, so build it once instead of once per word.
    inv_map = {v: k for k, v in tk.word_index.items()}

    words = seed_text.lower().split()
    for _ in range(num_words):
        # model input: the last SEQ_LEN token ids, left-padded with zeros
        seq_tokens = tk.texts_to_sequences([' '.join(words)])[0]
        window = seq_tokens[-SEQ_LEN:]
        pad = [0] * (SEQ_LEN - len(window))
        input_seq = np.array([pad + window])

        preds = model.predict(input_seq, verbose=0)[0]
        next_id = sample_from_probs(preds, temperature)
        next_word = inv_map.get(next_id, '')
        if next_word == '':
            break
        words.append(next_word)
    return ' '.join(words)

# Example generation: same seed sampled at a conservative and a more
# adventurous temperature
print('\nExample generations:')
seed = 'to be or not to be'
print('Seed:', seed)
for temp in (0.8, 1.2):
    print(f'Generated (temp={temp}):', generate_text(seed, num_words=30, temperature=temp))

print('\nDone.')


In [None]:
# 25. Write code for Q-learning in a simple grid-world environment.
"""
Simple Grid-World + Q-Learning (pure Python + NumPy)

Run this file to train a Q-learning agent to navigate a small grid world.
Features:
 - customizable grid size, start, goal, and obstacles
 - epsilon-greedy policy, learning rate, discount factor
 - tracks episodic returns and success rate
 - prints learned policy and Q-table
 - optional visual rendering in terminal

Usage:
    python q_learning_gridworld.py

This is intentionally dependency-light: it needs only NumPy, plus matplotlib for the learning-curve plots.
"""

import numpy as np
import random
import matplotlib.pyplot as plt

# ----------------------- Environment -----------------------
class GridWorld:
    """Deterministic rectangular grid world.

    A state is a ``(row, col)`` pair; actions are 0=up, 1=right, 2=down, 3=left.
    Each move earns ``step_reward``, except that landing on the goal earns
    ``goal_reward`` (and ends the episode) and landing on an obstacle cell
    earns ``obstacle_reward`` (the episode continues).
    """

    def __init__(self, n_rows=5, n_cols=5, start=(0,0), goal=(4,4), obstacles=None,
                 step_reward=-0.1, goal_reward=1.0, obstacle_reward=-1.0, max_steps=100):
        # geometry, and the sizes of the state and action spaces
        self.n_rows, self.n_cols = n_rows, n_cols
        self.n_states = n_rows * n_cols
        self.n_actions = 4

        # layout
        self.start = start
        self.goal = goal
        self.obstacles = set() if obstacles is None else set(obstacles)

        # reward scheme
        self.step_reward = step_reward
        self.goal_reward = goal_reward
        self.obstacle_reward = obstacle_reward

        # per-episode bookkeeping
        self.max_steps = max_steps
        self.state = start
        self.steps = 0

    def state_to_index(self, state):
        """Flatten a (row, col) pair into a row-major integer index."""
        row, col = state
        return row * self.n_cols + col

    def index_to_state(self, idx):
        """Inverse of state_to_index: integer index -> (row, col)."""
        return divmod(idx, self.n_cols)

    def reset(self):
        """Put the agent back on the start cell and return its state index."""
        self.steps = 0
        self.state = self.start
        return self.state_to_index(self.state)

    def in_bounds(self, r, c):
        """True when (r, c) lies inside the grid."""
        return (0 <= r < self.n_rows) and (0 <= c < self.n_cols)

    def step(self, action):
        """Apply ``action``; return (next_state_index, reward, done, info).

        Raises ValueError for an action outside 0..3.
        """
        row, col = self.state
        offsets = ((-1, 0), (0, 1), (1, 0), (0, -1))  # up, right, down, left
        move = next((delta for code, delta in enumerate(offsets) if action == code), None)
        if move is None:
            raise ValueError('Invalid action')

        # a move that would leave the grid keeps the agent in place
        target_row, target_col = row + move[0], col + move[1]
        if self.in_bounds(target_row, target_col):
            self.state = (target_row, target_col)
        else:
            self.state = (row, col)
        self.steps += 1

        at_goal = self.state == self.goal
        if at_goal:
            reward = self.goal_reward
        elif self.state in self.obstacles:
            reward = self.obstacle_reward
        else:
            reward = self.step_reward

        # the episode ends on reaching the goal or on using up the step budget
        done = bool(at_goal or self.steps >= self.max_steps)
        return self.state_to_index(self.state), reward, done, {}

    def render(self, policy=None):
        """Print the grid: agent (A), goal (G), obstacles (X).

        When ``policy`` (one action per state index) is given, an arrow map
        of that policy is printed before the grid.
        """
        cells = [['.'] * self.n_cols for _ in range(self.n_rows)]
        for obstacle_row, obstacle_col in self.obstacles:
            cells[obstacle_row][obstacle_col] = 'X'
        goal_row, goal_col = self.goal
        cells[goal_row][goal_col] = 'G'
        agent_row, agent_col = self.state
        cells[agent_row][agent_col] = 'A'

        if policy is not None:
            glyphs = {0: '^', 1: '>', 2: 'v', 3: '<'}

            def symbol(pos):
                # goal and obstacle markers take precedence over policy arrows
                if pos == self.goal:
                    return 'G'
                if pos in self.obstacles:
                    return 'X'
                return glyphs.get(policy[self.state_to_index(pos)], ".")

            print('Policy map (arrows show greedy action):')
            for r in range(self.n_rows):
                print(''.join(f' {symbol((r, c))} ' for c in range(self.n_cols)))
            print()

        print('Grid:')
        for line in cells:
            print(' '.join(line))
        print()

# ----------------------- Q-Learning Agent -----------------------
class QLearningAgent:
    """Tabular Q-learning agent with an epsilon-greedy behaviour policy."""

    def __init__(self, n_states, n_actions, lr=0.1, gamma=0.99, epsilon=1.0, epsilon_min=0.05, epsilon_decay=0.995):
        self.n_states, self.n_actions = n_states, n_actions
        # learning rate and discount factor of the Q-learning update
        self.lr, self.gamma = lr, gamma
        # exploration schedule: epsilon is multiplied by epsilon_decay down to epsilon_min
        self.epsilon = epsilon
        self.epsilon_min = epsilon_min
        self.epsilon_decay = epsilon_decay

        # one row per state, one column per action, all values start at zero
        self.Q = np.zeros((n_states, n_actions), dtype=np.float32)

    def get_action(self, state_idx):
        """Random action with probability epsilon, otherwise the greedy action."""
        explore = random.random() < self.epsilon
        if explore:
            return random.randrange(self.n_actions)
        return int(np.argmax(self.Q[state_idx]))

    def update(self, s_idx, a, r, s_next_idx, done):
        """One Q-learning backup for the transition (s_idx, a, r, s_next_idx)."""
        current = self.Q[s_idx, a]
        # no bootstrapping from the next state once the episode has ended
        target = r if done else r + self.gamma * np.max(self.Q[s_next_idx])
        self.Q[s_idx, a] = current + self.lr * (target - current)

    def decay_epsilon(self):
        """Shrink epsilon by one decay step, never going below epsilon_min."""
        if self.epsilon > self.epsilon_min:
            self.epsilon = max(self.epsilon * self.epsilon_decay, self.epsilon_min)

    def greedy_policy(self):
        """Return the arg-max action for every state as a 1D array."""
        return np.argmax(self.Q, axis=1)

# ----------------------- Training loop -----------------------

def train_q_learning(env, agent, episodes=500, max_steps_per_episode=100, render_every=0):
    """Run ``episodes`` episodes of Q-learning of ``agent`` on ``env``.

    Each episode lasts until the environment reports done or
    ``max_steps_per_episode`` actions were taken; epsilon is decayed once per
    episode. When ``render_every`` > 0 a progress line is printed every
    ``render_every`` episodes.

    Returns (rewards, successes): per-episode total reward, and 1.0/0.0 flags
    telling whether the agent stood on the goal when the episode ended.
    """
    episode_returns = []
    goal_flags = []

    for episode in range(1, episodes + 1):
        state = env.reset()
        episode_return = 0.0
        for _ in range(max_steps_per_episode):
            action = agent.get_action(state)
            next_state, reward, finished, _info = env.step(action)
            agent.update(state, action, reward, next_state, finished)
            state = next_state
            episode_return += reward
            if finished:
                break

        agent.decay_epsilon()
        reached_goal = 1.0 if env.state == env.goal else 0.0
        episode_returns.append(episode_return)
        goal_flags.append(reached_goal)

        log_now = render_every > 0 and episode % render_every == 0
        if log_now:
            print(f'Episode {episode} - total_reward: {episode_return:.2f} - epsilon: {agent.epsilon:.3f} - success: {reached_goal}')

    return episode_returns, goal_flags

# ----------------------- Example usage -----------------------
# Script entry point: train an agent on a 6x6 grid, plot the learning curves,
# print the learned policy and replay one greedy episode.
if __name__ == '__main__':
    # build a simple 6x6 grid with obstacles
    n_rows, n_cols = 6, 6
    start = (0, 0)
    goal = (5, 5)
    # NOTE: obstacle cells can be entered; stepping on one only costs obstacle_reward
    obstacles = {(1,1), (2,1), (3,1), (4,1), (4,2), (4,3)}  # a wall with a gap

    env = GridWorld(n_rows=n_rows, n_cols=n_cols, start=start, goal=goal, obstacles=obstacles,
                    step_reward=-0.04, goal_reward=1.0, obstacle_reward=-1.0, max_steps=200)

    agent = QLearningAgent(n_states=env.n_states, n_actions=env.n_actions,
                           lr=0.5, gamma=0.98, epsilon=1.0, epsilon_min=0.05, epsilon_decay=0.995)

    episodes = 1500
    # progress is printed every 100 episodes
    rewards, successes = train_q_learning(env, agent, episodes=episodes, max_steps_per_episode=200, render_every=100)

    # plot learning curve
    plt.figure(figsize=(10,4))
    # left panel: raw total reward per episode
    plt.subplot(1,2,1)
    plt.plot(rewards)
    plt.title('Episode reward')
    plt.xlabel('Episode')
    plt.ylabel('Total reward')

    # right panel: success rate smoothed over a 50-episode window
    plt.subplot(1,2,2)
    # moving average success rate
    window = 50
    # mode='valid' keeps only fully covered windows, so this curve is
    # window - 1 points shorter than the episode count
    success_ma = np.convolve(successes, np.ones(window)/window, mode='valid')
    plt.plot(success_ma)
    plt.title('Success rate (moving avg)')
    plt.xlabel('Episode')
    plt.ylabel('Success rate')

    plt.tight_layout()
    plt.show()

    # Show learned policy
    policy = agent.greedy_policy()
    print('\nLearned greedy policy (arrows: ^ > v <):')
    env.render(policy=policy)

    # Print Q-values for start state
    start_idx = env.state_to_index(start)
    print('Q-values at start state:', agent.Q[start_idx])

    # Demonstrate an episode following greedy policy
    s = env.reset()
    env.render()
    print('Greedy rollout:')
    # capped at 30 moves; if the goal is not reached by then the loop simply
    # ends without printing a "finished" line
    for t in range(30):
        a = int(policy[s])
        s_next, r, done, _ = env.step(a)
        env.render()
        if done:
            print('Episode finished, reached goal' if env.state == env.goal else 'Episode finished (max steps)')
            break
        s = s_next

    print('Done.')


In [None]:
#26. Implement backpropagation and gradient descent for a small neural network (NumPy only).
"""
Backpropagation + Gradient Descent (NumPy-only)

This script implements a small fully-connected neural network from scratch using only NumPy.
It performs manual forward and backward passes (no autograd) and trains with mini-batch
gradient descent and optional momentum.

Features:
 - Dense layer implementation with Xavier init
 - Activation functions: sigmoid, tanh, ReLU, softmax
 - Losses: Mean Squared Error (MSE) and Cross-Entropy (with softmax)
 - Full vectorized forward/backward passes
 - Gradient checking utility (finite differences) to verify backprop
 - Small examples: learn XOR (binary) and a synthetic 3-class classification problem

Usage:
    python backprop_numpy.py

Tweak hyperparameters in the HYPERPARAMS section.
"""

import numpy as np

# ------------------- HYPERPARAMS -------------------
# Module-level settings read by the model class and by both example functions.
SEED = 42
np.random.seed(SEED)        # seeds NumPy's global RNG (weight init, shuffling, synthetic data)

HIDDEN_SIZES = [8]          # list of hidden layer sizes (e.g. [8, 8])
ACTIVATION = 'tanh'         # 'sigmoid', 'tanh', 'relu'
LOSS = 'cross_entropy'      # 'mse' or 'cross_entropy'
LEARNING_RATE = 0.1         # step size of each parameter update
MOMENTUM = 0.9              # momentum coefficient; 0 falls back to plain gradient descent
EPOCHS = 1000               # training epochs for each example
BATCH_SIZE = 4              # mini-batch size (synthetic example; XOR trains full-batch)
PRINT_EVERY = 100           # report metrics every this many epochs (and at epoch 1)
GRAD_CHECK = False          # set True to run gradient check on small batch

# ------------------- Utility functions -------------------

def one_hot(y, num_classes):
    """Encode integer labels ``y`` as an (len(y), num_classes) 0/1 float matrix."""
    n_samples = len(y)
    encoded = np.zeros((n_samples, num_classes))
    # row i receives a single 1.0 in column y[i]
    row_ids = np.arange(n_samples)
    encoded[row_ids, y] = 1.0
    return encoded

# Activation functions and derivatives

def sigmoid(x):
    """Logistic sigmoid 1 / (1 + exp(-x)), computed without overflow.

    Evaluated as exp(-log(1 + exp(-x))) through ``np.logaddexp``, so large
    negative inputs do not overflow in exp() (the direct formula emits an
    overflow RuntimeWarning there). The result is mathematically identical.
    """
    return np.exp(-np.logaddexp(0.0, -x))

def sigmoid_grad(x):
    """Derivative of the sigmoid at pre-activation ``x``: s(x) * (1 - s(x))."""
    activated = sigmoid(x)
    return activated * (1 - activated)


def tanh(x):
    """Hyperbolic-tangent activation; thin wrapper around ``np.tanh``."""
    return np.tanh(x)


def tanh_grad(x):
    """Derivative of tanh at pre-activation ``x``: 1 - tanh(x)^2."""
    activated = np.tanh(x)
    return 1 - activated ** 2


def relu(x):
    """Rectified linear unit: element-wise maximum of 0 and ``x``."""
    return np.maximum(0, x)


def relu_grad(x):
    """Derivative of ReLU: 1 where ``x`` > 0, else 0, returned in ``x``'s dtype."""
    is_positive = np.greater(x, 0)
    return is_positive.astype(x.dtype)


def softmax(x):
    """Row-wise softmax of a 2D score array of shape (N, C)."""
    # subtracting each row's maximum leaves the result unchanged but keeps
    # exp() from overflowing
    shifted = x - np.max(x, axis=1, keepdims=True)
    weights = np.exp(shifted)
    row_totals = np.sum(weights, axis=1, keepdims=True)
    return weights / row_totals

# Losses and gradients

def mse_loss(y_pred, y_true):
    """Half the summed squared error, averaged over the batch (first axis).

    Both arguments have shape (N, C) or (N, 1).
    """
    batch_size = y_pred.shape[0]
    residual = y_pred - y_true
    return 0.5 * np.sum(residual ** 2) / batch_size


def mse_grad(y_pred, y_true):
    """Gradient of ``mse_loss`` with respect to ``y_pred``: (y_pred - y_true) / N."""
    residual = y_pred - y_true
    return residual / y_pred.shape[0]


def cross_entropy_loss_logits(logits, y_true_onehot):
    """Mean cross-entropy between softmax(logits) and one-hot targets.

    logits: raw scores before softmax, one row per sample.
    y_true_onehot: one-hot targets with the same shape as ``logits``.
    """
    batch_size = logits.shape[0]
    # clip away exact zeros so that log() stays finite
    safe_probs = np.clip(softmax(logits), 1e-12, 1.0)
    picked_log_probs = y_true_onehot * np.log(safe_probs)
    return -np.sum(picked_log_probs) / batch_size


def grad_softmax_cross_entropy(logits, y_true_onehot):
    """Gradient of the batch-mean softmax cross-entropy with respect to ``logits``."""
    batch_size = logits.shape[0]
    prob_error = softmax(logits) - y_true_onehot
    return prob_error / batch_size

# ------------------- Neural Network (manual backprop) -------------------
class SimpleMLP:
    """Fully-connected network with hand-written forward and backward passes.

    Every hidden layer uses the same activation; the last layer is linear and
    produces raw scores (logits). Which loss is applied to those scores is
    decided by the module-level ``LOSS`` setting, which ``predict`` and
    ``compute_loss_and_grads`` read.
    """

    def __init__(self, input_dim, hidden_sizes, output_dim, activation='tanh', weight_scale=None):
        """Create the layers and their momentum buffers.

        input_dim / output_dim: number of input features / output units.
        hidden_sizes: sequence of hidden-layer widths (may be empty).
        activation: 'sigmoid', 'tanh' or 'relu' (checked when first used).
        weight_scale: None selects Xavier/Glorot uniform initialisation;
            a number selects Gaussian weights with that standard deviation.
        """
        # list() so that tuples (or any sequence) of sizes are accepted too
        self.sizes = [input_dim] + list(hidden_sizes) + [output_dim]
        self.num_layers = len(self.sizes) - 1
        self.activation_name = activation

        # weights have shape (fan_in, fan_out); biases start at zero
        self.W = []
        self.b = []
        for fan_in, fan_out in zip(self.sizes[:-1], self.sizes[1:]):
            if weight_scale is None:
                limit = np.sqrt(6.0 / (fan_in + fan_out))
                W = np.random.uniform(-limit, limit, size=(fan_in, fan_out))
            else:
                W = weight_scale * np.random.randn(fan_in, fan_out)
            self.W.append(W)
            self.b.append(np.zeros((1, fan_out)))

        # velocity for momentum
        self.vW = [np.zeros_like(W) for W in self.W]
        self.vb = [np.zeros_like(b) for b in self.b]

    def activation(self, x):
        """Apply the configured hidden-layer activation element-wise."""
        if self.activation_name == 'sigmoid':
            return sigmoid(x)
        elif self.activation_name == 'tanh':
            return tanh(x)
        elif self.activation_name == 'relu':
            return relu(x)
        else:
            raise ValueError('Unknown activation')

    def activation_grad(self, x):
        """Derivative of the configured activation at pre-activation ``x``."""
        if self.activation_name == 'sigmoid':
            return sigmoid_grad(x)
        elif self.activation_name == 'tanh':
            return tanh_grad(x)
        elif self.activation_name == 'relu':
            return relu_grad(x)
        else:
            raise ValueError('Unknown activation')

    def forward(self, X):
        """Forward pass. Returns (logits, caches).

        caches holds one (z, a) tuple per layer, with z = a_prev . W + b and
        a = activation(z). The output layer stores (logits, None) because no
        activation is applied to it here.
        """
        a = X
        caches = []
        for i in range(self.num_layers - 1):  # hidden layers
            z = a.dot(self.W[i]) + self.b[i]
            a = self.activation(z)
            caches.append((z, a))
        # output layer (logits)
        z = a.dot(self.W[-1]) + self.b[-1]
        caches.append((z, None))
        return z, caches

    def predict(self, X):
        """Return raw outputs for LOSS == 'mse', otherwise softmax probabilities."""
        logits, _ = self.forward(X)
        if LOSS == 'mse':
            return logits
        else:
            return softmax(logits)

    def compute_loss_and_grads(self, X, y):
        """Compute loss and gradients for a batch (vectorized).

        y: one-hot targets for cross-entropy, or continuous targets for MSE.
        Returns: loss, grads_W, grads_b (lists parallel to self.W / self.b).
        """
        logits, caches = self.forward(X)

        if LOSS == 'mse':
            # predictions are the logits themselves
            loss = mse_loss(logits, y)
            delta = mse_grad(logits, y)  # dL/dlogits
        else:
            # cross-entropy with softmax
            loss = cross_entropy_loss_logits(logits, y)
            delta = grad_softmax_cross_entropy(logits, y)  # dL/dlogits

        grads_W = [None] * self.num_layers
        grads_b = [None] * self.num_layers
        # walk from the output layer back to the first layer; `delta` always
        # holds dL/dz of the layer currently being processed
        for l in range(self.num_layers - 1, -1, -1):
            a_prev = X if l == 0 else caches[l - 1][1]
            grads_W[l] = a_prev.T.dot(delta)
            grads_b[l] = np.sum(delta, axis=0, keepdims=True)
            if l > 0:
                # push the error through W[l] and the previous layer's activation
                z_prev = caches[l - 1][0]
                delta = delta.dot(self.W[l].T) * self.activation_grad(z_prev)

        return loss, grads_W, grads_b

    def update_params(self, grads_W, grads_b, lr, momentum):
        """Apply one gradient step in place; uses momentum when ``momentum`` > 0."""
        for i in range(self.num_layers):
            if momentum > 0:
                self.vW[i] = momentum * self.vW[i] - lr * grads_W[i]
                self.vb[i] = momentum * self.vb[i] - lr * grads_b[i]
                self.W[i] += self.vW[i]
                self.b[i] += self.vb[i]
            else:
                self.W[i] -= lr * grads_W[i]
                self.b[i] -= lr * grads_b[i]

# ------------------- Gradient checking (finite differences) -------------------

def _numeric_grad(model, param, X, y, epsilon):
    """Central-difference estimate of d(loss)/d(param), one entry at a time."""
    numeric_grad = np.zeros_like(param)
    it = np.nditer(param, flags=['multi_index'], op_flags=['readwrite'])
    while not it.finished:
        ix = it.multi_index
        orig = param[ix]
        param[ix] = orig + epsilon
        loss_plus, _, _ = model.compute_loss_and_grads(X, y)
        param[ix] = orig - epsilon
        loss_minus, _, _ = model.compute_loss_and_grads(X, y)
        param[ix] = orig  # restore the entry before moving on
        numeric_grad[ix] = (loss_plus - loss_minus) / (2 * epsilon)
        it.iternext()
    return numeric_grad


def grad_check(model, X, y, epsilon=1e-5, tol=1e-6):
    """Compare analytic gradients with finite-difference gradients.

    model: object exposing lists ``W`` and ``b`` of parameter arrays and
        ``compute_loss_and_grads(X, y) -> (loss, grads_W, grads_b)``.
    X, y: a small batch (the loss is evaluated twice per parameter entry).
    epsilon: perturbation size of the central difference.
    tol: largest relative difference that still counts as a pass.

    Prints the relative difference per parameter array, plus a warning for
    each one above ``tol``. Parameters are perturbed in place and restored.
    Returns True when every array passed, False otherwise.
    """
    _, analytic_grads_W, analytic_grads_b = model.compute_loss_and_grads(X, y)

    all_passed = True
    groups = (('W', model.W, analytic_grads_W), ('b', model.b, analytic_grads_b))
    for label, params, analytic_grads in groups:
        for idx, param in enumerate(params):
            numeric_grad = _numeric_grad(model, param, X, y, epsilon)
            analytic = analytic_grads[idx]
            # relative error; the tiny constant guards against 0/0
            diff = np.linalg.norm(analytic - numeric_grad) / (np.linalg.norm(analytic) + np.linalg.norm(numeric_grad) + 1e-12)
            print(f'Grad check {label}[{idx}]: relative difference = {diff:.8e}')
            if diff > tol:
                print(f'WARNING: gradient check failed for {label}', idx)
                all_passed = False
    return all_passed

# ------------------- Small examples / tests -------------------

def run_xor_example():
    """Train the MLP on the four XOR points (full batch) and print its progress."""
    print('\nRunning XOR example (binary classification)...')
    # the complete XOR truth table: inputs and class labels
    inputs = np.array([[0,0],[0,1],[1,0],[1,1]], dtype=np.float32)
    labels = np.array([0,1,1,0], dtype=np.int32)
    targets = one_hot(labels, 2)

    net = SimpleMLP(input_dim=2, hidden_sizes=HIDDEN_SIZES, output_dim=2, activation=ACTIVATION)

    if GRAD_CHECK:
        print('Running gradient check on XOR...')
        grad_check(net, inputs, targets)

    # every epoch is a single update on all four samples
    for epoch in range(1, EPOCHS+1):
        loss, dW, db = net.compute_loss_and_grads(inputs, targets)
        net.update_params(dW, db, LEARNING_RATE, MOMENTUM)
        if epoch % PRINT_EVERY == 0 or epoch == 1:
            predicted = np.argmax(net.predict(inputs), axis=1)
            acc = np.mean(predicted == labels)
            print(f'Epoch {epoch} - loss: {loss:.6f} - acc: {acc*100:.2f}%')

    final_classes = np.argmax(net.predict(inputs), axis=1)
    print('Final predictions:', final_classes)


def run_synthetic_multiclass():
    """Train the MLP with mini-batches on a synthetic 3-class 2D data set.

    The data are three noisy spiral arms (one per class), shuffled and split
    80/20 into training and validation parts. Validation accuracy is printed
    at epoch 1, every PRINT_EVERY epochs, and once more at the end.
    """
    print('\nRunning synthetic 3-class classification example...')
    # create K interleaved spiral arms in 2D, N points per arm
    N = 300
    D = 2
    K = 3
    X = np.zeros((N*K, D))
    y = np.zeros(N*K, dtype=np.int32)
    for j in range(K):
        rows = slice(N*j, N*(j+1))
        r = np.linspace(0.0, 1, N)  # radius grows from the centre outwards
        t = np.linspace(j*4, (j+1)*4, N) + np.random.randn(N)*0.5  # angle plus noise
        X[rows] = np.c_[r*np.sin(t), r*np.cos(t)]
        y[rows] = j

    # shuffle
    perm = np.random.permutation(len(X))
    X = X[perm]
    y = y[perm]

    # train/val split (validation only needs integer labels for accuracy)
    split = int(0.8 * len(X))
    X_train, X_val = X[:split], X[split:]
    y_train, y_val = y[:split], y[split:]
    y_train_oh = one_hot(y_train, K)

    model = SimpleMLP(input_dim=D, hidden_sizes=HIDDEN_SIZES, output_dim=K, activation=ACTIVATION)

    if GRAD_CHECK:
        print('Running gradient check on small subset...')
        grad_check(model, X_train[:8], y_train_oh[:8])

    # training with mini-batches; the last batch of an epoch may be smaller
    num_batches = int(np.ceil(X_train.shape[0] / BATCH_SIZE))
    for epoch in range(1, EPOCHS+1):
        # new sample order every epoch
        perm = np.random.permutation(X_train.shape[0])
        X_train_shuffled = X_train[perm]
        y_train_shuffled = y_train_oh[perm]
        epoch_loss = 0.0
        for i in range(num_batches):
            start = i * BATCH_SIZE
            end = start + BATCH_SIZE
            X_batch = X_train_shuffled[start:end]
            y_batch = y_train_shuffled[start:end]
            loss, grads_W, grads_b = model.compute_loss_and_grads(X_batch, y_batch)
            model.update_params(grads_W, grads_b, LEARNING_RATE, MOMENTUM)
            epoch_loss += loss
        # eval
        if epoch % PRINT_EVERY == 0 or epoch == 1:
            preds_val = model.predict(X_val)
            acc_val = np.mean(np.argmax(preds_val, axis=1) == y_val)
            print(f'Epoch {epoch} - avg loss: {epoch_loss/num_batches:.6f} - val acc: {acc_val*100:.2f}%')

    print('Final val acc:', np.mean(np.argmax(model.predict(X_val), axis=1) == y_val))

# ------------------- Main -------------------
# Script entry point: run both demo problems with the module-level hyperparameters.
if __name__ == '__main__':
    # purely informational: without hidden layers the network is a single linear map
    if LOSS == 'mse' and HIDDEN_SIZES == []:
        print('Warning: using MSE with no hidden layers is just linear regression')

    # Run examples
    run_xor_example()
    run_synthetic_multiclass()

    print('\nDone.')


In [None]:
#27. Apply transfer learning using a pre-trained ResNet model for custom image classification.
"""
Transfer learning with a pre-trained ResNet (TensorFlow / Keras)

Usage:
    - Arrange your dataset directory like:
        dataset/
            train/
                class_a/
                class_b/
                ...
            val/
                class_a/
                class_b/
                ...
            test/
                class_a/
                class_b/
                ...

    - Edit HYPERPARAMS below (paths, epochs, batch size, fine-tune flag)
    - Run: python transfer_learning_resnet_keras.py

What it does:
 - Loads images with `image_dataset_from_directory` into tf.data pipelines
 - Builds a model using `tf.keras.applications.ResNet50` (imagenet weights) as a feature extractor
 - Trains the head first with frozen base, then optionally fine-tunes the top of the ResNet
 - Uses data augmentation layers, callbacks, and saves the best model

Notes:
 - Requires TensorFlow 2.6+ (for keras preprocessing & layers API). Tested with TF 2.12.
 - For large datasets / faster training, configure a GPU runtime and increase batch size.
"""

import os
from datetime import datetime
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
import numpy as np

# -------------------- HYPERPARAMS --------------------
DATA_DIR = 'dataset'  # root directory with train/val/test subfolders
IMG_SIZE = (224, 224)  # images are resized to this (height, width) on load
BATCH_SIZE = 32
INITIAL_EPOCHS = 10  # epochs for the head-only phase (base frozen)
FINE_TUNE_EPOCHS = 10  # additional epochs for the fine-tuning phase
LEARNING_RATE_HEAD = 1e-3
LEARNING_RATE_FINE = 1e-4  # lower rate used once base layers are unfrozen
WEIGHTS = 'imagenet'  # or None to train from scratch (not recommended)
RESNET_VERSION = 'ResNet50'  # options: ResNet50, ResNet50V2
POOLING = 'avg'  # 'avg' or 'max'
DROPOUT_RATE = 0.5
FINE_TUNE_AT = 140  # layer index at which to start fine-tuning; None to skip fine-tune
AUTOTUNE = tf.data.AUTOTUNE
MODEL_DIR = 'resnet_transfer_model'  # checkpoints, final model and class names go here
SEED = 42  # shuffle seed for the training dataset

os.makedirs(MODEL_DIR, exist_ok=True)

# -------------------- Data pipelines --------------------
print('Creating datasets from:', DATA_DIR)
train_dir = os.path.join(DATA_DIR, 'train')
val_dir = os.path.join(DATA_DIR, 'val')
test_dir = os.path.join(DATA_DIR, 'test')

# train and val are mandatory; test is optional (handled below)
if not os.path.exists(train_dir) or not os.path.exists(val_dir):
    raise FileNotFoundError('Please create train/val (and optionally test) folders under dataset/ with class subfolders.')

# Class labels are inferred from the sub-folder names. label_mode='categorical'
# pairs with the categorical_crossentropy loss used when the model is compiled.
train_ds = keras.preprocessing.image_dataset_from_directory(
    train_dir,
    labels='inferred',
    label_mode='categorical',
    batch_size=BATCH_SIZE,
    image_size=IMG_SIZE,
    shuffle=True,
    seed=SEED
)

# validation (and test) data are loaded without shuffling
val_ds = keras.preprocessing.image_dataset_from_directory(
    val_dir,
    labels='inferred',
    label_mode='categorical',
    batch_size=BATCH_SIZE,
    image_size=IMG_SIZE,
    shuffle=False
)

if os.path.exists(test_dir):
    test_ds = keras.preprocessing.image_dataset_from_directory(
        test_dir,
        labels='inferred',
        label_mode='categorical',
        batch_size=BATCH_SIZE,
        image_size=IMG_SIZE,
        shuffle=False
    )
else:
    test_ds = None

# read class_names before prefetch(): the prefetched dataset object is a
# different wrapper -- presumably without this attribute (TODO confirm)
class_names = train_ds.class_names
num_classes = len(class_names)
print('Classes:', class_names)

# Prefetch for performance
train_ds = train_ds.prefetch(AUTOTUNE)
val_ds = val_ds.prefetch(AUTOTUNE)
if test_ds is not None:
    test_ds = test_ds.prefetch(AUTOTUNE)

# -------------------- Data augmentation & preprocessing --------------------
# Use simple augmentation; you can extend this (mixup, cutmix, RandAugment, etc.)
data_augmentation = keras.Sequential([
    layers.RandomFlip('horizontal'),
    layers.RandomRotation(0.05),
    layers.RandomTranslation(0.06, 0.06),
], name='data_augmentation')

# Pick the preprocessing that matches the backbone. The two ResNet families
# expect different inputs, so each must get the preprocess_input from its own
# module: `resnet` for ResNet50 and `resnet_v2` for ResNet50V2. Feeding V1
# preprocessing to a V2 backbone mismatches the pretrained weights.
if RESNET_VERSION == 'ResNet50V2':
    from tensorflow.keras.applications.resnet_v2 import preprocess_input as resnet_preprocess
    preprocess_input = resnet_preprocess
elif RESNET_VERSION == 'ResNet50':
    from tensorflow.keras.applications.resnet import preprocess_input as resnet_preprocess
    preprocess_input = resnet_preprocess
else:
    # fallback: pass pixels through unchanged
    preprocess_input = lambda x: x

# Apply preprocessing in dataset pipeline
def prepare(ds, training=False):
    """Map ``ds`` through float casting and backbone preprocessing.

    ds: dataset yielding (images, labels) pairs.
    training: when True, ``data_augmentation`` is applied after preprocessing.
    Returns the mapped dataset; labels pass through unchanged.
    """
    def _preprocess(images, labels):
        images = preprocess_input(tf.cast(images, tf.float32))
        if training:
            images = data_augmentation(images)
        return images, labels

    return ds.map(_preprocess, num_parallel_calls=AUTOTUNE)

# Preprocessed views of each split; only the training split is augmented.
train_ds_proc = prepare(train_ds, training=True)
val_ds_proc = prepare(val_ds, training=False)
# the test split is optional, so its processed view may be None
test_ds_proc = prepare(test_ds, training=False) if test_ds is not None else None

# -------------------- Build model --------------------
print('Building model...')
if RESNET_VERSION == 'ResNet50V2':
    base_model = keras.applications.ResNet50V2(weights=WEIGHTS, include_top=False, input_shape=(*IMG_SIZE, 3))
else:
    base_model = keras.applications.ResNet50(weights=WEIGHTS, include_top=False, input_shape=(*IMG_SIZE, 3))

# Freeze the base
base_model.trainable = False

inputs = keras.Input(shape=(*IMG_SIZE, 3))
# The datasets passed to fit()/evaluate() (train_ds_proc, val_ds_proc,
# test_ds_proc) come from prepare(), which already applies preprocess_input
# and, for training, data_augmentation. Repeating either step inside the
# model would preprocess every image twice, so the backbone takes the inputs
# directly.
# NOTE: the saved model therefore expects images that have already been
# passed through preprocess_input.
# training=False keeps the backbone in inference mode when called in this graph
x = base_model(inputs, training=False)
if POOLING == 'avg':
    x = layers.GlobalAveragePooling2D()(x)
else:
    x = layers.GlobalMaxPooling2D()(x)

# classification head: dropout -> 256-unit ReLU layer -> softmax over classes
x = layers.Dropout(DROPOUT_RATE)(x)
x = layers.Dense(256, activation='relu')(x)
outputs = layers.Dense(num_classes, activation='softmax')(x)

model = keras.Model(inputs, outputs)
model.summary()

# -------------------- Compile & callbacks --------------------
optimizer = keras.optimizers.Adam(learning_rate=LEARNING_RATE_HEAD)
model.compile(optimizer=optimizer, loss='categorical_crossentropy', metrics=['accuracy'])

# timestamp keeps the files of different runs apart
now = datetime.now().strftime('%Y%m%d-%H%M%S')
checkpoint_path = os.path.join(MODEL_DIR, f'best_resnet_{now}.h5')
# NOTE: the checkpoint tracks val_accuracy while early stopping and the
# LR schedule track val_loss, so "best" means different things to each.
# The same callback objects are reused for the fine-tuning phase.
callbacks = [
    keras.callbacks.ModelCheckpoint(checkpoint_path, monitor='val_accuracy', save_best_only=True, verbose=1),
    keras.callbacks.EarlyStopping(monitor='val_loss', patience=8, restore_best_weights=True, verbose=1),
    keras.callbacks.ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=4, verbose=1)
]

# Compute steps per epoch (optional)
# these counts are only printed; fit() is not given explicit step counts
train_steps = tf.data.experimental.cardinality(train_ds).numpy()
val_steps = tf.data.experimental.cardinality(val_ds).numpy()
print(f'Train batches: {train_steps}, Val batches: {val_steps}')

# -------------------- Train head --------------------
print('\nTraining head (base frozen) for', INITIAL_EPOCHS, 'epochs...')
# history_head is read again by the fine-tuning phase
history_head = model.fit(
    train_ds_proc,
    epochs=INITIAL_EPOCHS,
    validation_data=val_ds_proc,
    callbacks=callbacks
)

# -------------------- Optional fine-tuning --------------------
if FINE_TUNE_AT is not None:
    print('\nStarting fine-tuning...')
    # Unfreeze from layer index FINE_TUNE_AT onwards
    base_model.trainable = True
    # Freeze earlier layers
    for i, layer in enumerate(base_model.layers):
        layer.trainable = i >= FINE_TUNE_AT

    # recompile with a lower LR (required for the trainable changes to take effect)
    model.compile(optimizer=keras.optimizers.Adam(learning_rate=LEARNING_RATE_FINE),
                  loss='categorical_crossentropy', metrics=['accuracy'])

    # Continue the epoch numbering where the head phase stopped.
    # history.epoch holds zero-based epoch indices, so the number of epochs
    # actually run is its length. Using its last element instead would repeat
    # one epoch index. Counting from the epochs actually run also keeps the
    # fine-tuning phase at exactly FINE_TUNE_EPOCHS epochs when early stopping
    # cut the head phase short.
    head_epochs_run = len(history_head.epoch)
    history_fine = model.fit(
        train_ds_proc,
        epochs=head_epochs_run + FINE_TUNE_EPOCHS,
        initial_epoch=head_epochs_run,
        validation_data=val_ds_proc,
        callbacks=callbacks
    )

# -------------------- Evaluate on test set --------------------
if test_ds_proc is not None:
    print('\nEvaluating on test set...')
    # evaluate the checkpoint with the best val_accuracy, not the in-memory model
    best_model = keras.models.load_model(checkpoint_path)
    test_loss, test_acc = best_model.evaluate(test_ds_proc)
    print(f'Test loss: {test_loss:.4f} - Test acc: {test_acc*100:.2f}%')

# -------------------- Save final model and class mapping --------------------
# the in-memory model as left by the last training phase (may differ from the
# best checkpoint evaluated above)
final_path = os.path.join(MODEL_DIR, f'final_resnet_{now}.h5')
model.save(final_path)
print('Saved final model to', final_path)

# Save class names
# list order corresponds to the model's output indices
import json
with open(os.path.join(MODEL_DIR, 'class_names.json'), 'w') as f:
    json.dump(class_names, f)
print('Saved class mapping (class_names.json)')

print('\nDone.')


In [None]:
#28. Train a Transformer-based model (HuggingFace) for text classification.
"""
Fine-tune a Transformer (Hugging Face) for text classification

Usage examples:
  # Fine-tune on GLUE (sst2) via datasets
  python transformer_text_classification_hf.py --dataset_name glue --dataset_config_name sst2

  # Fine-tune on local CSV (columns: text,label)
  python transformer_text_classification_hf.py --train_file ./train.csv --validation_file ./val.csv --text_column text --label_column label

Requirements:
  pip install transformers datasets evaluate accelerate

What it does:
 - Loads dataset (Hugging Face datasets or local CSV/JSON)
 - Auto-detects number of labels and builds label mapping
 - Tokenizes with a selected pretrained model tokenizer
 - Builds a model for sequence classification from a pretrained checkpoint
 - Fine-tunes using Trainer API with evaluation metrics (accuracy, f1)
 - Saves the best checkpoint and final model

Notes:
 - This script uses the Trainer API for convenience. For large datasets / advanced workflows
   consider using Accelerate or custom training loops.
"""

import argparse
import os
from dataclasses import dataclass, field
from typing import Optional, Dict

import numpy as np
from datasets import load_dataset, DatasetDict
import evaluate

from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
    DataCollatorWithPadding,
    set_seed,
)


def parse_args(argv=None):
    """Parse the command-line options of the fine-tuning script.

    Args:
        argv: Optional list of argument strings to parse. ``None`` (the
            default) makes argparse read ``sys.argv[1:]``, which is the
            behavior callers relied on before this parameter existed. Passing
            an explicit list (e.g. ``[]``) is handy inside notebooks, where
            ``sys.argv`` carries kernel arguments.

    Returns:
        argparse.Namespace holding the dataset, model and training options.
        ``greater_is_better`` is tri-state: ``None`` when neither flag is
        given (so the training library can infer the direction from the
        metric name), ``True`` for ``--greater_is_better`` and ``False`` for
        ``--no_greater_is_better``.
    """
    parser = argparse.ArgumentParser(description='Fine-tune a transformer for text classification')

    # dataset options
    parser.add_argument('--dataset_name', type=str, default=None,
                        help='HuggingFace dataset name (e.g. glue). If provided, dataset_config_name may be needed.')
    parser.add_argument('--dataset_config_name', type=str, default=None,
                        help='Config name for dataset (e.g. sst2 for glue)')
    parser.add_argument('--train_file', type=str, default=None, help='Local train file (csv or json)')
    parser.add_argument('--validation_file', type=str, default=None, help='Local validation file (csv or json)')
    parser.add_argument('--text_column', type=str, default='text', help='Name of the text column in local files')
    parser.add_argument('--label_column', type=str, default='label', help='Name of the label column in local files')

    # model / training options
    parser.add_argument('--model_name_or_path', type=str, default='distilbert-base-uncased',
                        help='Pretrained model identifier from huggingface.co/models')
    parser.add_argument('--output_dir', type=str, default='./hf_text_classifier', help='Where to store checkpoints')
    parser.add_argument('--max_length', type=int, default=128, help='Max sequence length for tokenization')
    parser.add_argument('--per_device_train_batch_size', type=int, default=16)
    parser.add_argument('--per_device_eval_batch_size', type=int, default=32)
    parser.add_argument('--learning_rate', type=float, default=5e-5)
    parser.add_argument('--weight_decay', type=float, default=0.0)
    parser.add_argument('--num_train_epochs', type=int, default=3)
    parser.add_argument('--seed', type=int, default=42)
    parser.add_argument('--metric_for_best_model', type=str, default='eval_accuracy')

    # A plain store_true flag defaults to False, which combined with the
    # default accuracy metric would make "best model" selection keep the
    # *lowest* accuracy. Defaulting to None leaves the direction undecided
    # unless the user states it explicitly with one of the two flags.
    parser.add_argument('--greater_is_better', dest='greater_is_better', action='store_true', default=None,
                        help='Treat a larger metric as better (default: inferred from the metric name)')
    parser.add_argument('--no_greater_is_better', dest='greater_is_better', action='store_false', default=None,
                        help='Treat a smaller metric as better (e.g. for a loss)')

    return parser.parse_args(argv)


def _pick_column(example, requested, fallbacks, kind):
    """Return the column name to use for *kind* ('text' or 'label').

    *requested* wins when it is a key of *example*; otherwise the first key
    whose lower-cased name appears in *fallbacks* is used. Raises ValueError
    when neither exists.
    """
    if requested in example:
        return requested
    candidates = [key for key in example.keys() if key.lower() in fallbacks]
    if not candidates:
        raise ValueError(f'Could not find a {kind} column. Provide --{kind}_column')
    print(f'Auto-detected {kind} column: {candidates[0]}')
    return candidates[0]


def _supported_kwarg(target, preferred, legacy):
    """Return *preferred* if *target* accepts it as a keyword, else *legacy*."""
    import inspect

    try:
        accepted = inspect.signature(target).parameters
    except (TypeError, ValueError):
        # Signature not introspectable: fall back to the name the script used originally.
        return legacy
    return preferred if preferred in accepted else legacy


def _has_gold_labels(split):
    """Return True when *split* has a 'label' column with no missing or negative ids."""
    if 'label' not in split.column_names:
        return False
    return all(value is not None and value >= 0 for value in split['label'])


def main():
    """Fine-tune a pretrained transformer for text classification.

    Steps, in order: parse CLI options, load the dataset (hub name or local
    csv/json files), guarantee 'train' and 'validation' splits, resolve the
    text/label columns, build the label mapping, tokenize, train with the
    Trainer API, report validation (and, when labelled, test) metrics, and
    save the model and tokenizer to ``args.output_dir``.

    Raises:
        ValueError: if no dataset source is given, the dataset lacks a
            'train' split, or no text/label column can be found.
    """
    args = parse_args()
    set_seed(args.seed)

    # ----- Load dataset -----
    if args.dataset_name is not None:
        print(f'Loading dataset {args.dataset_name} {args.dataset_config_name or ""}...')
        raw_datasets = load_dataset(args.dataset_name, args.dataset_config_name)
        # Expect split names train/validation/test depending on dataset
    elif args.train_file is not None and args.validation_file is not None:
        data_files = {'train': args.train_file, 'validation': args.validation_file}
        print('Loading local files:', data_files)
        raw_datasets = load_dataset('csv' if args.train_file.endswith('.csv') else 'json', data_files=data_files)
    else:
        raise ValueError('You must provide either --dataset_name or both --train_file and --validation_file')

    # Normalize dataset splits: ensure 'train' and 'validation' splits exist
    if 'train' not in raw_datasets:
        raise ValueError('Dataset must contain a "train" split')
    if 'validation' not in raw_datasets:
        if 'test' in raw_datasets:
            # Only train/test available: reuse the test split as validation.
            raw_datasets = DatasetDict({'train': raw_datasets['train'], 'validation': raw_datasets['test']})
        else:
            # Carve a 10% validation set out of train. train_test_split names
            # its held-out part 'test', so it is re-keyed to 'validation' here;
            # otherwise the later tokenized['validation'] lookups would fail.
            # The seed keeps the split reproducible across runs.
            held_out = raw_datasets['train'].train_test_split(test_size=0.1, seed=args.seed)
            raw_datasets = DatasetDict({'train': held_out['train'], 'validation': held_out['test']})

    # ----- Inspect labels and text column -----
    first_train_example = raw_datasets['train'][0]
    print('Example train row keys:', list(first_train_example.keys()))

    text_column = _pick_column(first_train_example, args.text_column,
                               ('sentence', 'text', 'review', 'utterance'), 'text')
    label_column = _pick_column(first_train_example, args.label_column,
                                ('label', 'labels', 'rating', 'stars'), 'label')

    # Extra config passed to the model only when a name mapping is known.
    label_config = {}

    # If labels are strings -> create mapping
    if isinstance(first_train_example[label_column], str):
        # Read the column once instead of iterating over decoded rows.
        labels = sorted(set(raw_datasets['train'][label_column]))
        label2id = {label: i for i, label in enumerate(labels)}
        id2label = {i: label for label, i in label2id.items()}

        def map_label(example):
            example['label'] = label2id[example[label_column]]
            return example

        raw_datasets = raw_datasets.map(map_label)
        num_labels = len(labels)
        # Store the names in the model config so the saved model reports
        # readable labels instead of LABEL_0, LABEL_1, ...
        label_config = {'id2label': id2label, 'label2id': label2id}
        print('Detected string labels. Mapping provided. Num labels =', num_labels)
    else:
        # assume labels are already ints in [0..K-1]
        unique_labels = sorted(set(raw_datasets['train'][label_column]))
        # Size the classification head by the largest id as well as the count,
        # so a class id missing from train (or ids that do not start at 0)
        # cannot produce an out-of-range target.
        num_labels = max(len(unique_labels), int(unique_labels[-1]) + 1)
        print('Detected numeric labels. Num labels =', num_labels)
        # ensure column named 'label'
        if label_column != 'label':
            raw_datasets = raw_datasets.rename_column(label_column, 'label')

    # ----- Load tokenizer & model -----
    tokenizer = AutoTokenizer.from_pretrained(args.model_name_or_path)
    model = AutoModelForSequenceClassification.from_pretrained(
        args.model_name_or_path,
        num_labels=num_labels,
        **label_config,
    )

    # ----- Tokenize -----
    def preprocess_fn(examples):
        # No padding here: DataCollatorWithPadding pads each batch dynamically.
        return tokenizer(examples[text_column], truncation=True, max_length=args.max_length)

    tokenized = raw_datasets.map(preprocess_fn, batched=True)

    # ----- Data collator -----
    data_collator = DataCollatorWithPadding(tokenizer)

    # ----- Evaluation metric -----
    accuracy = evaluate.load('accuracy')
    f1 = evaluate.load('f1')

    def compute_metrics(eval_pred):
        logits, labels = eval_pred
        predictions = np.argmax(logits, axis=-1)
        acc = accuracy.compute(predictions=predictions, references=labels)
        f1_macro = f1.compute(predictions=predictions, references=labels, average='macro')
        return {
            'accuracy': acc['accuracy'],
            'f1_macro': f1_macro['f1']
        }

    # ----- TrainingArguments & Trainer -----
    # Newer transformers releases name this option `eval_strategy`; older ones
    # only know `evaluation_strategy`. Use whichever the installed version accepts.
    strategy_kwarg = _supported_kwarg(TrainingArguments, 'eval_strategy', 'evaluation_strategy')
    training_args = TrainingArguments(
        output_dir=args.output_dir,
        save_strategy='epoch',
        learning_rate=args.learning_rate,
        per_device_train_batch_size=args.per_device_train_batch_size,
        per_device_eval_batch_size=args.per_device_eval_batch_size,
        num_train_epochs=args.num_train_epochs,
        weight_decay=args.weight_decay,
        load_best_model_at_end=True,
        metric_for_best_model=args.metric_for_best_model.replace('eval_', ''),
        greater_is_better=args.greater_is_better,
        save_total_limit=2,
        seed=args.seed,
        fp16=False,
        **{strategy_kwarg: 'epoch'},
    )

    # Same compatibility concern for the tokenizer argument of Trainer
    # (`processing_class` in newer releases, `tokenizer` in older ones).
    tokenizer_kwarg = _supported_kwarg(Trainer, 'processing_class', 'tokenizer')
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized['train'],
        eval_dataset=tokenized['validation'],
        data_collator=data_collator,
        compute_metrics=compute_metrics,
        **{tokenizer_kwarg: tokenizer},
    )

    # ----- Train -----
    trainer.train()

    # ----- Evaluate on validation (and test if present) -----
    print('\nValidation results:')
    print(trainer.evaluate(tokenized['validation']))

    if 'test' in tokenized:
        # Some benchmark test splits ship placeholder labels (e.g. -1); scoring
        # them would feed out-of-range targets to the loss, so skip those.
        if _has_gold_labels(tokenized['test']):
            print('\nTest results:')
            print(trainer.evaluate(tokenized['test']))
        else:
            print('\nTest split has no usable labels; skipping test evaluation.')

    # ----- Save model & tokenizer -----
    trainer.save_model(args.output_dir)
    tokenizer.save_pretrained(args.output_dir)
    print(f'Saved model and tokenizer to {args.output_dir}')


# Run the fine-tuning pipeline only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == '__main__':
    main()
