# MNIST Digit Recognition - Model Development

This notebook demonstrates the development and comparison of MLP and CNN models for MNIST digit recognition.

In [None]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.datasets import mnist
from tensorflow.keras.utils import to_categorical
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import classification_report, confusion_matrix
import seaborn as sns
import time

# Set random seeds for reproducibility.
# A single named constant keeps the NumPy and TensorFlow seeds in sync.
SEED = 42
np.random.seed(SEED)
tf.random.set_seed(SEED)

## Data Preparation

In [None]:
# Fetch the MNIST train/test splits
(x_train, y_train), (x_test, y_test) = mnist.load_data()

# Convert the images to float32 and divide by 255
x_train, x_test = (images.astype('float32') / 255.0 for images in (x_train, x_test))

# The CNN expects a trailing channel axis: (N, 28, 28) -> (N, 28, 28, 1)
x_train_cnn = np.expand_dims(x_train, axis=-1)
x_test_cnn = np.expand_dims(x_test, axis=-1)

# One-hot encode the digit labels (10 classes)
y_train_cat = to_categorical(y_train, num_classes=10)
y_test_cat = to_categorical(y_test, num_classes=10)

# Sanity-check the resulting array shapes
print(f"Training data shape (MLP): {x_train.shape}")
print(f"Training data shape (CNN): {x_train_cnn.shape}")
print(f"Training labels shape: {y_train_cat.shape}")

## Model Architectures

In [None]:
def create_mlp_model(input_shape=(28, 28), num_classes=10, dropout_rate=0.5):
    """Create and compile the MLP (Multi-Layer Perceptron) model.

    Args:
        input_shape: Shape of one input sample, without the batch dimension.
            Defaults to ``(28, 28)``.
        num_classes: Number of units in the softmax output layer.
            Defaults to 10.
        dropout_rate: Rate used by both Dropout layers. Defaults to 0.5.

    Returns:
        A compiled ``keras.Sequential`` model (Adam optimizer, categorical
        cross-entropy loss, accuracy metric). The loss expects one-hot
        encoded labels.
    """
    model = keras.Sequential([
        # Declare the input explicitly: passing `input_shape` to the first
        # layer is deprecated in recent Keras releases and emits a warning.
        keras.Input(shape=input_shape),
        layers.Flatten(),
        layers.Dense(512, activation='relu'),
        layers.Dropout(dropout_rate),
        layers.Dense(256, activation='relu'),
        layers.Dropout(dropout_rate),
        layers.Dense(num_classes, activation='softmax')
    ])

    model.compile(
        optimizer='adam',
        loss='categorical_crossentropy',
        metrics=['accuracy']
    )

    return model

def create_cnn_model(input_shape=(28, 28, 1), num_classes=10, dropout_rate=0.5):
    """Create and compile the CNN (Convolutional Neural Network) model.

    The network stacks two Conv2D -> BatchNormalization -> ReLU -> MaxPooling
    stages, followed by a 128-unit dense layer with dropout and a softmax
    output layer.

    Args:
        input_shape: Shape of one input sample, without the batch dimension.
            Defaults to ``(28, 28, 1)``.
        num_classes: Number of units in the softmax output layer.
            Defaults to 10.
        dropout_rate: Rate used by the Dropout layer before the output layer.
            Defaults to 0.5.

    Returns:
        A compiled ``keras.Sequential`` model (Adam optimizer, categorical
        cross-entropy loss, accuracy metric). The loss expects one-hot
        encoded labels.
    """
    model = keras.Sequential([
        # Declare the input explicitly: passing `input_shape` to the first
        # layer is deprecated in recent Keras releases and emits a warning.
        keras.Input(shape=input_shape),
        # Stage 1: the activation is applied after batch normalization
        layers.Conv2D(32, (3, 3), padding='same'),
        layers.BatchNormalization(),
        layers.Activation('relu'),
        layers.MaxPooling2D((2, 2)),
        # Stage 2: same pattern with twice as many filters
        layers.Conv2D(64, (3, 3), padding='same'),
        layers.BatchNormalization(),
        layers.Activation('relu'),
        layers.MaxPooling2D((2, 2)),
        # Classifier head
        layers.Flatten(),
        layers.Dense(128, activation='relu'),
        layers.Dropout(dropout_rate),
        layers.Dense(num_classes, activation='softmax')
    ])

    model.compile(
        optimizer='adam',
        loss='categorical_crossentropy',
        metrics=['accuracy']
    )

    return model

## Model Training

In [None]:
# Instantiate both networks (MLP first, then CNN)
mlp_model, cnn_model = create_mlp_model(), create_cnn_model()

# Print each architecture, with a divider line between the two summaries
print("MLP Model Architecture:")
mlp_model.summary()
print(f"\n{'=' * 50}\n")
print("CNN Model Architecture:")
cnn_model.summary()

In [None]:
# Training parameters (also used by the CNN training cell)
epochs = 10
batch_size = 128

# Train MLP model
print("Training MLP Model...")
# perf_counter() is a monotonic, high-resolution clock intended for measuring
# durations; unlike time.time() it is not affected by system clock changes.
start_time = time.perf_counter()
mlp_history = mlp_model.fit(
    x_train, y_train_cat,
    batch_size=batch_size,
    epochs=epochs,
    validation_split=0.2,  # Keras holds out the last 20% of the samples
    verbose=1
)
mlp_training_time = time.perf_counter() - start_time
print(f"MLP Training completed in {mlp_training_time:.2f} seconds")

In [None]:
# Train CNN model (reuses `epochs` and `batch_size` from the MLP training cell)
print("Training CNN Model...")
# perf_counter() is a monotonic, high-resolution clock intended for measuring
# durations; unlike time.time() it is not affected by system clock changes.
start_time = time.perf_counter()
cnn_history = cnn_model.fit(
    x_train_cnn, y_train_cat,
    batch_size=batch_size,
    epochs=epochs,
    validation_split=0.2,  # Keras holds out the last 20% of the samples
    verbose=1
)
cnn_training_time = time.perf_counter() - start_time
print(f"CNN Training completed in {cnn_training_time:.2f} seconds")

## Model Evaluation

In [None]:
# Score each trained network on the test split; evaluate() yields (loss, accuracy)
mlp_test_loss, mlp_test_acc = mlp_model.evaluate(x=x_test, y=y_test_cat, verbose=0)
cnn_test_loss, cnn_test_acc = cnn_model.evaluate(x=x_test_cnn, y=y_test_cat, verbose=0)

# Report test accuracy first, then the training time recorded earlier
print(f"MLP Test Accuracy: {mlp_test_acc:.4f}")
print(f"CNN Test Accuracy: {cnn_test_acc:.4f}")
print(f"MLP Training Time: {mlp_training_time:.2f}s")
print(f"CNN Training Time: {cnn_training_time:.2f}s")

## Training History Visualization

In [None]:
def _plot_history_metric(ax, history, metric, model_name, color):
    """Draw the train and validation curves of one metric on a single axes.

    Args:
        ax: Matplotlib axes to draw on.
        history: Object returned by ``model.fit``; its ``history`` dict must
            contain the keys ``metric`` and ``'val_' + metric``.
        metric: Metric key, e.g. ``'accuracy'`` or ``'loss'``.
        model_name: Short model label used in the legend and title.
        color: Line color shared by the train and validation curves.
    """
    metric_label = metric.capitalize()  # 'accuracy' -> 'Accuracy'
    ax.plot(history.history[metric], label=f'{model_name} Train', color=color)
    # The validation curve shares the color but is dashed to tell it apart
    ax.plot(history.history[f'val_{metric}'], label=f'{model_name} Val',
            color=color, linestyle='--')
    ax.set_title(f'{model_name} Model {metric_label}')
    ax.set_xlabel('Epoch')
    ax.set_ylabel(metric_label)
    ax.legend()
    ax.grid(True, alpha=0.3)


# Plot training history: rows are metrics (accuracy, loss), columns are models
fig, axes = plt.subplots(2, 2, figsize=(15, 10))

for row, metric in enumerate(('accuracy', 'loss')):
    _plot_history_metric(axes[row, 0], mlp_history, metric, 'MLP', 'blue')
    _plot_history_metric(axes[row, 1], cnn_history, metric, 'CNN', 'red')

plt.tight_layout()
plt.show()

## Confusion Matrices

In [None]:
# Turn each model's softmax output into a predicted digit per test image
mlp_pred = mlp_model.predict(x_test, verbose=0).argmax(axis=1)
cnn_pred = cnn_model.predict(x_test_cnn, verbose=0).argmax(axis=1)

# Tabulate actual vs. predicted digits for both models up front
cm_mlp = confusion_matrix(y_test, mlp_pred)
cm_cnn = confusion_matrix(y_test, cnn_pred)

# Side-by-side heatmaps: MLP on the left, CNN on the right
fig, (ax1, ax2) = plt.subplots(nrows=1, ncols=2, figsize=(15, 6))

sns.heatmap(cm_mlp, annot=True, fmt='d', cmap='Blues', ax=ax1)
ax1.set(title='MLP Confusion Matrix', xlabel='Predicted', ylabel='Actual')

sns.heatmap(cm_cnn, annot=True, fmt='d', cmap='Reds', ax=ax2)
ax2.set(title='CNN Confusion Matrix', xlabel='Predicted', ylabel='Actual')

plt.tight_layout()
plt.show()

## Performance Summary

In [None]:
# Per-class precision/recall/F1 for each model, separated by a divider line
print("MLP Classification Report:")
print(classification_report(y_true=y_test, y_pred=mlp_pred))
print(f"\n{'=' * 50}\n")
print("CNN Classification Report:")
print(classification_report(y_true=y_test, y_pred=cnn_pred))

## Save Models

In [None]:
# Persist both trained networks as .h5 files under ../models
mlp_model.save(filepath='../models/mlp_baseline.h5')
cnn_model.save(filepath='../models/mnist_cnn.h5')
print("Models saved successfully!")

## Conclusion

This notebook demonstrates:
1. Implementation of MLP and CNN architectures
2. Training both models on the MNIST dataset
3. Performance comparison between architectures
4. Model persistence for later use

The CNN typically achieves higher accuracy due to its ability to capture spatial features in images.