# Bystrov Mikhail. Homework 3

In [5]:
!wget https://raw.githubusercontent.com/yandexdataschool/Practical_DL/refs/heads/fall25/week03_convnets/cifar.py

--2025-09-26 07:52:35--  https://raw.githubusercontent.com/yandexdataschool/Practical_DL/refs/heads/fall25/week03_convnets/cifar.py
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 2396 (2.3K) [text/plain]
Saving to: ‘cifar.py.1’


2025-09-26 07:52:35 (48.4 MB/s) - ‘cifar.py.1’ saved [2396/2396]



In [6]:
import numpy as np
from cifar import load_cifar10
# Load CIFAR-10 through the course helper, using "cifar_data" as its data
# directory argument. The six returned values are unpacked in the order
# train / validation / test, each as an (inputs, labels) pair.
# NOTE(review): the network defined below implies 3-channel 32x32 inputs --
# confirm against cifar.py.
(
    X_train, y_train,
    X_val, y_val,
    X_test, y_test,
) = load_cifar10("cifar_data")

# Human-readable class names.
# NOTE(review): presumably position i names integer label i -- confirm.
class_names = np.array([
    'airplane',
    'automobile',
    'bird',
    'cat',
    'deer',
    'dog',
    'frog',
    'horse',
    'ship',
    'truck',
])

In [7]:
import torch
import torch.nn as nn
import torch.nn.functional as F

# Use the first CUDA GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# Seed every RNG the notebook relies on so that "Restart & Run All" reproduces
# the same mini-batch order, weight initialisation and dropout masks.
# NOTE(review): some GPU kernels may still be non-deterministic; see the
# PyTorch reproducibility notes if bit-exact results are required.
SEED = 42
np.random.seed(SEED)     # mini-batch shuffling uses np.random.permutation
torch.manual_seed(SEED)  # weight initialisation and dropout

import matplotlib.pyplot as plt
%matplotlib inline
import time

In [8]:
def _conv_bn(in_channels, out_channels):
    """Return ``[Conv2d, BatchNorm2d]``: a 3x3 convolution with padding 1
    (spatial size is preserved) followed by batch normalisation."""
    return [
        nn.Conv2d(in_channels, out_channels, kernel_size=(3, 3), padding=1),
        nn.BatchNorm2d(out_channels),
    ]


# Convolutional feature extractor: five conv+BN stages, three of which are
# followed by 2x2 max-pooling. For 32x32 inputs (implied by the 512 * 4 * 4
# flatten size below) the resolution goes 32 -> 16 -> 8 -> 4.
feature_layers = [
    *_conv_bn(3, 32), nn.ReLU(),
    *_conv_bn(32, 64), nn.MaxPool2d((2, 2)), nn.LeakyReLU(0.1),
    *_conv_bn(64, 128), nn.ReLU(),
    *_conv_bn(128, 256), nn.MaxPool2d((2, 2)), nn.LeakyReLU(0.1),
    *_conv_bn(256, 512), nn.MaxPool2d((2, 2)), nn.ReLU(),
]

# Fully connected head with dropout; the last layer emits 10 raw class logits.
classifier_layers = [
    nn.Flatten(),
    nn.Linear(512 * 4 * 4, 256), nn.LeakyReLU(0.1), nn.Dropout(0.25),
    nn.Linear(256, 128), nn.ReLU(), nn.Dropout(0.25),
    nn.Linear(128, 10),
]

# Kept as one flat Sequential so that layer indices (and therefore the
# state_dict keys) are the same as when every layer is listed inline.
model = nn.Sequential(*feature_layers, *classifier_layers).to(device)

In [9]:
# Adam over all model parameters, with the library-default hyperparameters.
opt = torch.optim.Adam(params=model.parameters())

# Histories filled during training: one loss value per iteration and one
# accuracy value per validation pass.
# NOTE(review): the training cell below re-creates `opt`, `train_loss` and
# `val_accuracy`, so this cell only matters if that cell is changed.
train_loss, val_accuracy = [], []

# An auxiliary function that returns mini-batches for neural network training
def iterate_minibatches(X, y, batchsize, shuffle=True):
    """Yield ``(X_batch, y_batch)`` pairs that together cover the data once.

    Args:
        X: indexable collection of inputs supporting index-array lookup
            (e.g. a NumPy array).
        y: labels aligned with ``X``; must have the same length.
        batchsize: maximum number of samples per batch (positive). The last
            batch is shorter when ``len(X)`` is not a multiple of it.
        shuffle: when True (default) samples are visited in a random order
            drawn from NumPy's global RNG; when False the original order is
            kept, which is sufficient for evaluation passes.

    Yields:
        Tuples ``(X[ix], y[ix])`` for consecutive slices ``ix`` of the
        (optionally permuted) sample indices.

    Raises:
        ValueError: if ``X`` and ``y`` differ in length or ``batchsize`` is
            not positive. As this is a generator, the error surfaces when
            iteration starts, not when the function is called.
    """
    if len(X) != len(y):
        raise ValueError(
            "X and y must have the same length, got {} and {}".format(len(X), len(y)))
    if batchsize <= 0:
        # A negative step would make range() empty and silently skip all data.
        raise ValueError("batchsize must be positive, got {}".format(batchsize))

    indices = np.arange(len(X))
    if shuffle:
        indices = np.random.permutation(indices)
    for start in range(0, len(indices), batchsize):
        ix = indices[start: start + batchsize]
        yield X[ix], y[ix]

In [10]:
def compute_loss(X_batch, y_batch):
    """Cross-entropy of the global ``model`` on a single mini-batch.

    Args:
        X_batch: batch of inputs; converted to a float32 tensor on ``device``.
        y_batch: batch of integer class labels; converted to an int64 tensor
            on ``device``.

    Returns:
        A scalar tensor that is still attached to the autograd graph, so the
        caller can run ``.backward()`` on it.
    """
    inputs = torch.as_tensor(X_batch, device=device, dtype=torch.float32)
    targets = torch.as_tensor(y_batch, device=device, dtype=torch.int64)
    batch_loss = F.cross_entropy(model(inputs), targets)
    # NOTE(review): cross_entropy presumably already averages over the batch
    # (default reduction), so this .mean() leaves the scalar unchanged.
    return batch_loss.mean()

In [11]:
import os

# Fresh optimizer and histories, so that re-running this cell starts a new
# training log (the model weights themselves are NOT reset here).
opt = torch.optim.Adam(model.parameters())

train_loss = []    # one entry per training iteration
val_accuracy = []  # one entry per epoch

num_epochs = 100  # total amount of full passes over training data
batch_size = 50   # number of samples processed in one SGD iteration
patience = 10     # stop after this many consecutive epochs without a new best

best_accuracy = -1.0  # below any reachable accuracy, so the first epoch always saves
best_epoch = -1

for epoch in range(num_epochs):
    # In each epoch, we do a full pass over the training data:
    start_time = time.time()
    model.train(True)  # enable dropout / batch_norm training behavior
    epoch_start = len(train_loss)  # where this epoch's losses begin in train_loss
    for X_batch, y_batch in iterate_minibatches(X_train, y_train, batch_size):
        # Clear gradients before backward so that nothing left over from an
        # interrupted earlier run can leak into this step.
        opt.zero_grad()
        loss = compute_loss(X_batch, y_batch)
        loss.backward()
        opt.step()
        train_loss.append(loss.item())  # .item() = convert 1-value Tensor to float

    # And a full pass over the validation data:
    model.train(False)  # disable dropout / use averages for batch_norm
    n_correct = 0
    n_seen = 0
    with torch.no_grad():  # do not store intermediate activations
        for X_batch, y_batch in iterate_minibatches(X_val, y_val, batch_size):
            logits = model(torch.as_tensor(X_batch, dtype=torch.float32, device=device))
            y_pred = logits.argmax(-1).cpu().numpy()
            # Count hits instead of averaging per-batch accuracies, so a short
            # final batch is not over-weighted.
            n_correct += int(np.sum(y_batch == y_pred))
            n_seen += len(y_batch)

    current_accuracy = n_correct / n_seen if n_seen else float("nan")
    val_accuracy.append(current_accuracy)

    # Then we print the results for this epoch:
    print("Epoch {} of {} took {:.3f}s".format(
        epoch + 1, num_epochs, time.time() - start_time))
    print("  training loss (in-iteration): \t{:.6f}".format(
        np.mean(train_loss[epoch_start:])))
    print("  validation accuracy: \t\t\t{:.2f} %".format(
        current_accuracy * 100))

    # Early stopping: checkpoint every new best, and give up once `patience`
    # epochs have passed since the last one.
    if current_accuracy > best_accuracy:
        best_accuracy = current_accuracy
        best_epoch = epoch
        torch.save(model.state_dict(), "best_state.pt")
    elif epoch - best_epoch >= patience:
        print(f"  Validation accuracy has not improved for {patience} epochs. Stopping early at epoch {epoch + 1}.")
        break


# Load the best model state
# Only restore a checkpoint that this run actually wrote (best_epoch >= 0);
# otherwise a stale best_state.pt left over from an earlier session could be
# loaded and reported as this run's result.
if best_epoch >= 0 and os.path.exists("best_state.pt"):
    # map_location keeps loading working when the checkpoint was written on a
    # different device than the one currently in use (e.g. GPU -> CPU).
    model.load_state_dict(torch.load("best_state.pt", map_location=device))
    print("Loaded best model state.")


# Final evaluation on the held-out test split.
model.train(False)  # disable dropout / use averages for batch_norm
n_correct = 0
n_seen = 0
with torch.no_grad():  # inference only, no autograd bookkeeping needed
    for X_batch, y_batch in iterate_minibatches(X_test, y_test, 500):
        logits = model(torch.as_tensor(X_batch, dtype=torch.float32, device=device))
        y_pred = logits.argmax(-1).cpu().numpy()
        # Count hits rather than averaging per-batch accuracies: the result
        # stays exact even if the test set size is not a multiple of 500.
        n_correct += int(np.sum(y_batch == y_pred))
        n_seen += len(y_batch)

test_accuracy = n_correct / n_seen if n_seen else float("nan")

print("Final results:")
print("  test accuracy:\t\t{:.2f} %".format(
    test_accuracy * 100))

Epoch 1 of 100 took 12.929s
  training loss (in-iteration): 	1.553493
  validation accuracy: 			56.77 %
Epoch 2 of 100 took 11.715s
  training loss (in-iteration): 	1.074368
  validation accuracy: 			65.10 %
Epoch 3 of 100 took 11.824s
  training loss (in-iteration): 	0.860482
  validation accuracy: 			72.89 %
Epoch 4 of 100 took 11.978s
  training loss (in-iteration): 	0.716502
  validation accuracy: 			77.04 %
Epoch 5 of 100 took 12.088s
  training loss (in-iteration): 	0.613991
  validation accuracy: 			76.33 %
Epoch 6 of 100 took 12.239s
  training loss (in-iteration): 	0.516839
  validation accuracy: 			76.60 %
Epoch 7 of 100 took 12.381s
  training loss (in-iteration): 	0.427204
  validation accuracy: 			79.13 %
Epoch 8 of 100 took 12.868s
  training loss (in-iteration): 	0.349260
  validation accuracy: 			82.75 %
Epoch 9 of 100 took 12.629s
  training loss (in-iteration): 	0.288750
  validation accuracy: 			82.53 %
Epoch 10 of 100 took 12.239s
  training loss (in-iteration): 	0.

### Why did it work?
- First of all, increasing the capacity of the network to five convolutional layers helped.
- Using small $3 \times 3$ convolutions with padding preserves spatial resolution while keeping the parameter count low.
- Batch normalization and a mix of ReLU/LeakyReLU activations stabilize training and improve gradient flow.
- MaxPooling layers reduce spatial dimensions, focusing the deeper layers on more abstract features.
- Finally, dropout in the fully connected layers prevents overfitting and improves generalization.