# MNIST Dataset - Exploratory Data Analysis

This notebook provides a detailed exploratory data analysis of the MNIST handwritten digit dataset.

In [None]:
import tensorflow as tf
from tensorflow.keras.datasets import mnist
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd

# Set style
# Matplotlib 3.6 renamed its bundled seaborn style sheets to 'seaborn-v0_8*';
# earlier releases only ship the plain 'seaborn' name. Pick whichever one this
# installation provides so the cell does not fail on an older Matplotlib.
seaborn_style = 'seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn'
plt.style.use(seaborn_style)
sns.set_palette('husl')

## Load the Dataset

In [None]:
# Load MNIST through the Keras dataset helper and unpack the two splits
train_split, test_split = mnist.load_data()
x_train, y_train = train_split
x_test, y_test = test_split

# Report the array shape of the images and labels in each split
shape_report = [
    f"Training set shape: {x_train.shape}",
    f"Training labels shape: {y_train.shape}",
    f"Test set shape: {x_test.shape}",
    f"Test labels shape: {y_test.shape}",
]
print(*shape_report, sep="\n")

## Basic Statistics

In [None]:
# Headline numbers: split sizes, single-image shape, label set and the
# minimum/maximum pixel value found in the training images
stat_lines = (
    "Dataset Statistics:",
    f"Total training samples: {len(x_train):,}",
    f"Total test samples: {len(x_test):,}",
    f"Image shape: {x_train[0].shape}",
    f"Number of classes: {len(np.unique(y_train))}",
    f"Classes: {np.unique(y_train)}",
    f"Pixel value range: [{x_train.min()}, {x_train.max()}]",
)
for stat_line in stat_lines:
    print(stat_line)

## Class Distribution

In [None]:
# Class distribution
# Count how many samples carry each label in the training and test splits.
unique_train, counts_train = np.unique(y_train, return_counts=True)
unique_test, counts_test = np.unique(y_test, return_counts=True)

fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 5))

# Both panels share one layout; only the data, title and bar colour differ,
# so they are drawn by a single loop instead of two copy-pasted sections.
panels = (
    (ax1, unique_train, counts_train, 'Training Set - Class Distribution', 'skyblue'),
    (ax2, unique_test, counts_test, 'Test Set - Class Distribution', 'lightcoral'),
)
for ax, labels, counts, title, color in panels:
    ax.bar(labels, counts, alpha=0.7, color=color)
    ax.set_title(title)
    ax.set_xlabel('Digit')
    ax.set_ylabel('Count')
    # Pin a tick on every class label; the automatic locator may skip some
    # integers, which would leave bars without a digit underneath them.
    ax.set_xticks(labels)
    ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

## Sample Images

In [None]:
# Two example images per digit: one column per class, one row per example
fig, axes = plt.subplots(2, 10, figsize=(20, 6))
fig.suptitle('Sample Images from Each Class', fontsize=16)

# Transposing the 2x10 axes grid yields one (top, bottom) pair per digit column
for digit, (top_ax, bottom_ax) in enumerate(axes.T):
    # Positions in the training set whose label equals the current digit
    matches = np.where(y_train == digit)[0]

    # Top row: first occurrence, carrying the column title
    top_ax.imshow(x_train[matches[0]], cmap='gray')
    top_ax.set_title(f'Digit: {digit}')
    top_ax.axis('off')

    # Bottom row: second occurrence, untitled
    bottom_ax.imshow(x_train[matches[1]], cmap='gray')
    bottom_ax.axis('off')

plt.tight_layout()
plt.show()

## Pixel Intensity Analysis

In [None]:
# Pixel intensity statistics
fig, axes = plt.subplots(2, 2, figsize=(12, 10))

# Overall pixel distribution.
# ravel() hands hist() a flat view when the array is contiguous, whereas
# flatten() always copies the entire training set first.
# Assuming integer pixel values in 0..255 (as the summary of this notebook
# states), 64 bins over [0, 256] hold exactly four intensity levels each.
# 50 bins over [0, 255] are 5.1 wide, so some bins would cover 6 levels and
# others 5, producing bar-height differences that are not in the data.
axes[0, 0].hist(x_train.ravel(), bins=64, range=(0, 256), alpha=0.7, color='blue')
axes[0, 0].set_title('Pixel Intensity Distribution (Training Set)')
axes[0, 0].set_xlabel('Pixel Value')
axes[0, 0].set_ylabel('Frequency')

# Mean pixel intensity by class: average over every pixel of every training
# image that carries the given label.
class_means = [x_train[y_train == digit].mean() for digit in range(10)]

axes[0, 1].bar(range(10), class_means, alpha=0.7, color='green')
axes[0, 1].set_title('Mean Pixel Intensity by Class')
axes[0, 1].set_xlabel('Digit')
axes[0, 1].set_ylabel('Mean Pixel Value')
# One tick per digit so every bar is labelled.
axes[0, 1].set_xticks(range(10))

# Sample image with pixel values
sample_img = x_train[0]
im = axes[1, 0].imshow(sample_img, cmap='gray')
axes[1, 0].set_title(f'Sample Image (Label: {y_train[0]})')
fig.colorbar(im, ax=axes[1, 0])

# Normalized version: same image divided by 255. imshow rescales colours to
# the data range by default, so the picture is expected to look the same as
# the raw one; the colorbar is where the changed value range shows up.
normalized_img = sample_img / 255.0
im2 = axes[1, 1].imshow(normalized_img, cmap='gray')
axes[1, 1].set_title('Normalized Version')
fig.colorbar(im2, ax=axes[1, 1])

plt.tight_layout()
plt.show()

## Summary

This EDA reveals:
1. MNIST contains 60,000 training and 10,000 test images
2. Images are 28x28 pixels in grayscale
3. Classes are relatively balanced
4. Pixel values range from 0-255
5. Most pixels are black background (value 0); the digits themselves are drawn in white