# Train your first neural network: basic classification

This notebook trains a neural network model to classify images of Zalando clothing articles, such as sneakers and shirts. It's okay if you don't understand all the details; we will explain more as we go, and you can always ask!

This guide uses PyTorch, a deep learning framework that provides flexible and efficient tools for building and training models.

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
import torchvision.datasets as datasets
import torchvision.transforms as transforms

# Helper libraries
import numpy as np
import matplotlib.pyplot as plt

# Print the installed PyTorch version for reference.
print(torch.__version__)

## Import the Fashion MNIST dataset

This guide uses the Fashion MNIST dataset, which contains 70,000 grayscale images in 10 categories. The images show individual articles of clothing at low resolution (28 by 28 pixels), as seen here:

<table>
  <tr><td>
    <img src="https://tensorflow.org/images/fashion-mnist-sprite.png"
         alt="Fashion MNIST sprite"  width="600">
  </td></tr>
  <tr><td align="center">
    <b>Figure 1.</b> <a href="https://github.com/zalandoresearch/fashion-mnist">Fashion-MNIST samples</a> (by Zalando, MIT License).<br/>&nbsp;
  </td></tr>
</table>

Fashion MNIST is intended as a drop-in replacement for the classic MNIST dataset. We will use 60,000 images to train the network and 10,000 images to evaluate accuracy.

In [None]:
# Transform pipeline handed to the datasets below.
transform = transforms.Compose(
    [transforms.ToTensor()]  # Converts to tensor and normalizes to [0, 1]
)

# Open both Fashion MNIST splits under ./data, downloading them if necessary.
train_dataset = datasets.FashionMNIST(
    root='./data', train=True, download=True, transform=transform,
)
test_dataset = datasets.FashionMNIST(
    root='./data', train=False, download=True, transform=transform,
)

# Pull the image and label tensors out of each split. The images are read
# straight from `.data`, so they are converted to float and divided by 255 here.
# NOTE(review): presumably `.data` holds raw 0-255 pixels that `transform`
# has not touched -- confirm against the torchvision documentation.
train_images, train_labels = train_dataset.data.float() / 255.0, train_dataset.targets
test_images, test_labels = test_dataset.data.float() / 255.0, test_dataset.targets

Loading the dataset returns tensors:

* The `train_images` and `train_labels` tensors are the training set—the data the model uses to learn.
* The model is tested against the test set: the `test_images` and `test_labels` tensors.

The images are 28x28 tensors, with pixel values normalized to [0, 1]. The labels are integers from 0 to 9 corresponding to clothing classes.

In [None]:
# Human-readable class names; the position in the list is the integer label.
class_names = [
    'T-shirt/top',  # 0
    'Trouser',      # 1
    'Pullover',     # 2
    'Dress',        # 3
    'Coat',         # 4
    'Sandal',       # 5
    'Shirt',        # 6
    'Sneaker',      # 7
    'Bag',          # 8
    'Ankle boot',   # 9
]

## Explore the data

Let's explore the format of the dataset before training the model.

In [None]:
# Report the shape of every image and label tensor, one line per tensor.
for title, tensor in (
    ('Training set', train_images),
    ('Training labels', train_labels),
    ('Test set', test_images),
    ('Test labels', test_labels),
):
    print(f'{title} shape: {tensor.shape}')

## Preprocess the data

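The data must be preprocessed before training the network. The pixel values were already scaled to the [0, 1] range (divided by 255) when the dataset was loaded above, so here we only inspect the first image to confirm the scaling.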

In [None]:
# Visualize the first image
plt.figure()
plt.imshow(train_images[0], cmap=plt.cm.binary)
# The colorbar shows the range of pixel values in the displayed image.
plt.colorbar()
plt.grid(False)
plt.show()

Display the first 25 images from the training set and verify the data is in the correct format.

In [None]:
# Draw the first 25 training images in a 5x5 grid, each captioned with its class name.
plt.figure(figsize=(10, 10))
for cell in range(25):
    image, label = train_images[cell], train_labels[cell]
    plt.subplot(5, 5, cell + 1)  # subplot positions are 1-based
    # Hide ticks and grid lines so only the picture and its caption remain.
    plt.xticks([])
    plt.yticks([])
    plt.grid(False)
    plt.imshow(image, cmap=plt.cm.binary)
    plt.xlabel(class_names[label])
plt.show()

## Build the model

Building the neural network requires configuring the layers of the model.

### Setup the layers

Most of deep learning consists of chaining together simple layers. Most layers, like `nn.Linear`, have parameters that are learned during training.

In [None]:
class FashionMNISTNet(nn.Module):
    """Fully connected classifier: flatten -> Linear(784, 128) -> ReLU -> Linear(128, 10)."""

    def __init__(self):
        super().__init__()
        # Layers are registered in the order they are applied in forward().
        self.flatten = nn.Flatten()
        self.dense1 = nn.Linear(28 * 28, 128)
        self.relu = nn.ReLU()
        self.dense2 = nn.Linear(128, 10)

    def forward(self, x):
        """Return the output of the final linear layer for the batch ``x``.

        No softmax is applied here; the result is the raw output of ``dense2``.
        """
        hidden = self.relu(self.dense1(self.flatten(x)))
        return self.dense2(hidden)

# Use the GPU when CUDA is available; otherwise stay on the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

# Build the network and move it to the selected device.
model = FashionMNISTNet().to(device)

# Show the layer structure.
print(model)

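The first layer flattens the 28x28 images into a 784-dimensional vector. The network then has a 128-neuron hidden layer with ReLU activation, followed by a 10-neuron output layer for the 10 clothing classes. The output layer returns raw scores (logits) rather than probabilities; `nn.CrossEntropyLoss`, used below, works directly on these logits, and softmax is applied only later when making predictions.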

In [None]:
# Loss function: cross-entropy computed on the model's outputs.
loss_fn = nn.CrossEntropyLoss()
# Optimizer: Adam over all model parameters.
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Pair the image tensors with their labels so they can be batched together.
train_dataset_torch = TensorDataset(train_images, train_labels)
test_dataset_torch = TensorDataset(test_images, test_labels)

# Batches of 32; only the training loader shuffles.
train_loader = DataLoader(
    train_dataset_torch,
    batch_size=32,
    shuffle=True,
)
test_loader = DataLoader(
    test_dataset_torch,
    batch_size=32,
    shuffle=False,
)

## Train the model

We'll now train the model using a custom training loop.

In [None]:
def train_epoch(model, train_loader, loss_fn, optimizer, device):
    """Run one optimisation pass over ``train_loader`` and report its metrics.

    Args:
        model: module to train; it is switched to training mode.
        train_loader: iterable of ``(images, labels)`` batches that supports ``len()``.
        loss_fn: callable taking ``(outputs, labels)`` and returning the batch loss.
        optimizer: optimizer that updates the model parameters.
        device: device each batch is moved to before the forward pass.

    Returns:
        Tuple ``(avg_loss, accuracy)``: the per-batch loss values averaged over
        the number of batches, and the percentage of samples whose highest
        scoring output matched the label.

    Raises:
        ZeroDivisionError: if the loader yields no batches.
    """
    model.train()
    running_loss = 0
    num_correct = 0
    num_seen = 0

    for batch_images, batch_labels in train_loader:
        batch_images = batch_images.to(device)
        batch_labels = batch_labels.to(device)

        # Forward pass
        logits = model(batch_images)
        batch_loss = loss_fn(logits, batch_labels)

        # Backward pass: clear old gradients, backpropagate, update weights.
        optimizer.zero_grad()
        batch_loss.backward()
        optimizer.step()

        # Metrics use the outputs computed before this step's weight update.
        running_loss += batch_loss.item()
        predicted = logits.detach().max(dim=1).indices
        num_seen += batch_labels.size(0)
        num_correct += (predicted == batch_labels).sum().item()

    return running_loss / len(train_loader), 100 * num_correct / num_seen

# Train for 5 epochs, keeping the loss and accuracy of every epoch.
num_epochs = 5
train_losses, train_accuracies = [], []

for epoch in range(num_epochs):
    loss, accuracy = train_epoch(
        model, train_loader, loss_fn, optimizer, device,
    )
    train_losses.append(loss)
    train_accuracies.append(accuracy)
    # Epochs are reported 1-based.
    print(f'Epoch {epoch+1}/{num_epochs}, Loss: {loss:.4f}, Accuracy: {accuracy:.2f}%')

## Evaluate accuracy

Next, evaluate the model performance on the test dataset.

In [None]:
def evaluate(model, test_loader, device):
    """Return the classification accuracy of ``model`` on ``test_loader`` in percent.

    Args:
        model: module to evaluate; it is switched to evaluation mode.
        test_loader: iterable of ``(images, labels)`` batches.
        device: device each batch is moved to before the forward pass.

    Returns:
        Percentage of samples whose highest-scoring output matched the label.

    Raises:
        ZeroDivisionError: if the loader yields no samples.
    """
    model.eval()
    hits = 0
    seen = 0

    # Gradients are not needed for evaluation.
    with torch.no_grad():
        for batch_images, batch_labels in test_loader:
            batch_images = batch_images.to(device)
            batch_labels = batch_labels.to(device)
            predicted = model(batch_images).max(dim=1).indices
            seen += batch_labels.size(0)
            hits += (predicted == batch_labels).sum().item()

    return 100 * hits / seen

# Measure accuracy on the held-out test batches and report it as a percentage.
test_accuracy = evaluate(model=model, test_loader=test_loader, device=device)
print(f'Test accuracy: {test_accuracy:.2f}%')

The accuracy on the test dataset is slightly lower than on the training dataset. This gap is an example of overfitting—when a model performs worse on new data than on training data.

## Make predictions

With the model trained, we can use it to make predictions about images.

In [None]:
def make_predictions(model, images, device, batch_size=None):
    """Return softmax class probabilities for ``images`` as a NumPy array.

    Args:
        model: module mapping a batch of images to per-class scores, with the
            classes along dimension 1. It is switched to evaluation mode.
        images: tensor whose first dimension indexes the samples.
        device: device the forward pass runs on.
        batch_size: optional positive integer. When given, and ``images`` has
            more samples than this, the forward pass runs in chunks of at most
            ``batch_size`` samples so that peak memory does not grow with the
            size of ``images``. ``None`` (the default) processes everything in
            a single pass.

    Returns:
        NumPy array with one row per sample; each row is the softmax over the
        model outputs for that sample.

    Raises:
        ValueError: if ``batch_size`` is given but is not positive.
    """
    if batch_size is not None and batch_size <= 0:
        raise ValueError(f'batch_size must be a positive integer, got {batch_size}')

    model.eval()
    with torch.no_grad():
        if batch_size is None or images.shape[0] <= batch_size:
            # Single forward pass over the whole input.
            probabilities = torch.softmax(model(images.to(device)), dim=1).cpu()
        else:
            # Move only one chunk at a time to the device and gather the
            # results on the CPU, preserving the original sample order.
            probabilities = torch.cat([
                torch.softmax(model(chunk.to(device)), dim=1).cpu()
                for chunk in torch.split(images, batch_size)
            ])
    return probabilities.numpy()

# Class probabilities for every test image, one row per image.
predictions = make_predictions(model=model, images=test_images, device=device)

A prediction is an array of 10 numbers representing the model's confidence for each of the 10 clothing classes.

In [None]:
# Show the probability vector for the first test image, then compare the
# most probable class with the true label.
print('First prediction:')
print(predictions[0])
print(f'\nPredicted class: {predictions[0].argmax()}')
print(f'True class: {test_labels[0].item()}')

In [None]:
def plot_image(i, predictions_array, true_label, img):
    """Draw image ``i`` on the current axes with a prediction caption.

    The caption reads "<predicted class> <confidence>% (<true class>)" and is
    blue when the predicted class equals the true label, red otherwise.

    Args:
        i: index of the sample to draw.
        predictions_array: per-sample class probabilities, indexed by ``i``.
        true_label: per-sample true labels, indexed by ``i``.
        img: per-sample images, indexed by ``i``.
    """
    probs, label, picture = predictions_array[i], true_label[i], img[i]

    # Hide grid lines and ticks so only the picture and caption are shown.
    plt.grid(False)
    plt.xticks([])
    plt.yticks([])

    plt.imshow(picture, cmap=plt.cm.binary)

    guess = np.argmax(probs)
    color = 'blue' if guess == label else 'red'

    caption = "{} {:2.0f}% ({})".format(
        class_names[guess],
        100*np.max(probs),
        class_names[label],
    )
    plt.xlabel(caption, color=color)

def plot_value_array(i, predictions_array, true_label):
    """Draw the 10 class probabilities of sample ``i`` as a bar chart.

    Bars are grey, except that the predicted class is coloured red and the
    true class blue. The true class is coloured last, so a correct prediction
    ends up blue.

    Args:
        i: index of the sample to draw.
        predictions_array: per-sample class probabilities, indexed by ``i``.
        true_label: per-sample true labels, indexed by ``i``.
    """
    probs = predictions_array[i]
    label = true_label[i]

    plt.grid(False)
    plt.xticks([])
    plt.yticks([])
    bars = plt.bar(range(10), probs, color="#777777")
    plt.ylim([0, 1])

    bars[np.argmax(probs)].set_color('red')
    bars[label].set_color('blue')

In [None]:
# Show test sample 0: the image with its caption on the left, and the bar
# chart of its class probabilities on the right.
i = 0
plt.figure(figsize=(6,3))
plt.subplot(1,2,1)
plot_image(i, predictions, test_labels, test_images)
plt.subplot(1,2,2)
plot_value_array(i, predictions, test_labels)
plt.show()

In [None]:
# Plot the first num_images (num_rows * num_cols) test images with their predictions.
# Each sample takes two adjacent subplot cells: the image on the left and the
# bar chart of its class probabilities on the right.
num_rows = 5
num_cols = 3
num_images = num_rows * num_cols
# The grid has 2*num_cols columns because every sample uses two cells.
plt.figure(figsize=(2*2*num_cols, 2*num_rows))
for i in range(num_images):
    # Subplot positions are 1-based: sample i uses cells 2*i+1 and 2*i+2.
    plt.subplot(num_rows, 2*num_cols, 2*i+1)
    plot_image(i, predictions, test_labels, test_images)
    plt.subplot(num_rows, 2*num_cols, 2*i+2)
    plot_value_array(i, predictions, test_labels)
plt.tight_layout()
plt.show()