In [2]:
import torch
import numpy as np
print(torch.__version__)

2.8.0+cu126


In [19]:
# Shell escape: ask the NVIDIA driver which GPUs are visible, with their memory usage and utilization
!nvidia-smi

Sun Sep 21 09:40:35 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.54.15              Driver Version: 550.54.15      CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  Tesla T4                       Off |   00000000:00:04.0 Off |                    0 |
| N/A   57C    P0             27W /   70W |     124MiB /  15360MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

In [14]:
# Random tensor of shape (356, 64, 64); it is moved to the GPU in a later cell
t = torch.rand(356, 64, 64)

In [35]:
# Create a random tensor, then view it as a NumPy array: both report the same shape
t_torch = torch.rand((356, 64, 64))
print(t_torch.shape)

t_np = t_torch.numpy()
print(np.shape(t_np))


torch.Size([356, 64, 64])
(356, 64, 64)


In [16]:
t = t.cuda() # move tensor t to the GPU (cuda:0 here; a machine can expose more than one GPU)

In [17]:
t1 = torch.rand(356, 64, 64) # same shape as t, but this tensor is left on the CPU

In [18]:
# All operands of an operation must live on the same hardware (all on the CPU or
# all on the same GPU). Here t is on the GPU and t1 on the CPU, so the sum fails.
# The failure is the point of this cell: catch it and show the message instead of
# letting the exception stop a "Restart & Run All".
try:
  t2 = t + t1
except RuntimeError as err:
  print(f'RuntimeError: {err}')

RuntimeError: Expected all tensors to be on the same device, but found at least two devices, cuda:0 and cpu!

In [21]:
# NumPy arrays: the same kind of constructors exist here too

# uniform random values, default dtype
arr1 = np.random.random(size=(6, 3))
print(f'random array: \n {arr1} \n')

# all zeros, single precision
arr2 = np.zeros(shape=(6, 3), dtype=np.float32)
print(f'zeros array: \n {arr2} \n')

# all ones, single precision
arr3 = np.ones(shape=(6, 3), dtype=np.float32)
print(f'ones array: \n {arr3} \n')



random array: 
 [[0.46927862 0.93788905 0.70780517]
 [0.85295074 0.93868899 0.55339853]
 [0.96630035 0.90333889 0.74412538]
 [0.67295907 0.42939896 0.55267707]
 [0.42328624 0.59648679 0.84939562]
 [0.45683561 0.14777844 0.24625424]] 

zeros array: 
 [[0. 0. 0.]
 [0. 0. 0.]
 [0. 0. 0.]
 [0. 0. 0.]
 [0. 0. 0.]
 [0. 0. 0.]] 

ones array: 
 [[1. 1. 1.]
 [1. 1. 1.]
 [1. 1. 1.]
 [1. 1. 1.]
 [1. 1. 1.]
 [1. 1. 1.]] 



In [26]:
# NumPy -> PyTorch conversion

# Once the data is a torch tensor it can be moved to the GPU
tensor = torch.from_numpy(arr1)
# size, dtype and device, one per line
print(tensor.shape, tensor.dtype, tensor.device, sep='\n')

torch.Size([6, 3])
torch.float64
cpu


In [28]:
# PyTorch -> NumPy conversion
arr1 = tensor.numpy()
# show the array first, then the tensor it came from
print(arr1, tensor, sep='\n')


[[0.46927862 0.93788905 0.70780517]
 [0.85295074 0.93868899 0.55339853]
 [0.96630035 0.90333889 0.74412538]
 [0.67295907 0.42939896 0.55267707]
 [0.42328624 0.59648679 0.84939562]
 [0.45683561 0.14777844 0.24625424]]
tensor([[0.4693, 0.9379, 0.7078],
        [0.8530, 0.9387, 0.5534],
        [0.9663, 0.9033, 0.7441],
        [0.6730, 0.4294, 0.5527],
        [0.4233, 0.5965, 0.8494],
        [0.4568, 0.1478, 0.2463]], dtype=torch.float64)


# How to train a neural network

We basically have to perform 4 steps and repeat them until we converge:

1) FORWARD PASS: we feed an input into the network and look at its prediction --> an array of scores (probabilities), one for each class. The class with the highest score is taken as the predicted result

2) COMPUTE LOSS: we compare the outcome with the ground truth with a loss function

3) BACKWARD PASS: we apply the chain rule to compute the gradient of the loss with respect to the parameters (PYTORCH AUTOGRAD)

4) UPDATE PARAMETERS: we move each parameter in the direction opposite to its gradient, scaled by a given learning rate (PYTORCH OPTIMIZER)

--> REPEAT FROM 1 UNTIL CONVERGE

# Computational graph
During training the network changes, because at each step we update its parameters, so we have to keep track of the operations applied by looking at the computational graph (e.g. Fully Connected Layer --> TANH --> ...). This is managed automatically by AUTOGRAD

# Basic Components to train a Network

1) DEFINE THE NETWORK ARCHITECTURE
A subclass of torch.nn.Module

2) DEFINE A DATASET
A subclass of torch.utils.data.Dataset

3) DEFINE LOSS FUNCTION and OPTIMIZER
Define how wrong network predictions are penalized (the loss, possibly with regularization) and how to update the parameters (learning rate, momentum, etc.)

4) DEFINE THE TRAINING LOOP
Sort of main function interconnecting all components of the nn

In [None]:
#DEFINE A SIMPLE MLP

import torch.nn as nn

#Fully connected nn with 1 hidden layer
class NeuralNetwork(nn.Module):
  """Fully connected network with one hidden layer: Linear -> ReLU -> Linear.

  Args:
    input_size: number of input features of each sample.
    hidden_size: number of neurons in the hidden layer.
    num_classes: number of output neurons, one per class.
  """

  def __init__(self, input_size, hidden_size, num_classes): # define the architecture of the network
    super().__init__() # we are extending nn.Module, so initialise it first
    self.fc1 = nn.Linear(input_size, hidden_size) # input_size = #features, hidden_size = #neurons
    self.relu = nn.ReLU() # activation function
    # the input features of the output layer are the hidden_size values produced by fc1;
    # being the output layer, it has one neuron (one score) per class
    self.fc2 = nn.Linear(hidden_size, num_classes)

  # forward must be a method of the class (same indentation as __init__): when it
  # was nested inside __init__ it was only a local function, and calling the
  # model failed because nn.Module found no forward implementation.
  def forward(self, x):
    """Forward pass.

    Args:
      x: tensor whose last dimension has input_size features.

    Returns:
      The output of fc2: one raw score per class for each sample.
    """
    h = self.fc1(x) # intermediate output of the hidden layer
    h_act = self.relu(h) # apply the activation
    out = self.fc2(h_act) # output layer
    return out


In [30]:
# CONNECT nn TO INPUTS

# 784 input features, 500 hidden neurons, 10 output classes
input_size = 784
hidden_size = 500
num_classes = 10

model = NeuralNetwork(input_size=input_size, hidden_size=hidden_size, num_classes=num_classes)
print(f'my model: {model}')

my model: NeuralNetwork(
  (fc1): Linear(in_features=784, out_features=500, bias=True)
  (relu): ReLU()
  (fc2): Linear(in_features=500, out_features=10, bias=True)
)


In [None]:
# pass input to model for FORWARD PASS

# One grayscale image of 28x28 pixels (a single channel, values 0..255):
# 28 * 28 = 784 values, the number of input features the model was built with.
img = torch.randint(0, 256, (1, 28, 28), dtype=torch.uint8)

# A fully connected layer needs floating-point input whose last dimension is 784,
# so scale the pixels to [0, 1] and flatten the image to a 1x784 tensor.
flat_img = img.float().div(255).flatten(start_dim=1)

res = model(flat_img) # forward pass result: one score per class, shape (1, 10)

# DATA PREPROCESSING
we need a batch of images on which work with our nn, so we need
1) DEFINE PREPROCESSING
2) CREATE DATASET CLASS (bunch of images)
3) CHOOSE SAMPLING STRATEGY

In [None]:
# DEFINE DATASET

import pandas as pd
from torch.utils.data import Dataset
from torchvision import transforms

class DatasetMNIST(Dataset):
  """Dataset of 28x28 single-channel images stored as rows of a CSV file.

  Column 0 of each row is read as the label; the remaining columns are read as
  the pixel values of the image.

  Args:
    file_path: path of the CSV file, loaded with pandas.
    transform: optional callable applied to each image before it is returned.
  """

  def __init__(self, file_path, transform = None):
    frame = pd.read_csv(file_path)
    self.data = frame
    self.transform = transform

  def __len__(self): # required override
    # one sample per CSV row
    n_rows = self.data.shape[0]
    return n_rows

  def __getitem__(self, index): # required override
    """Return the (image, label) pair stored at row `index`."""
    pixels = self.data.iloc[index, 1:].values.astype(np.uint8)
    # PyTorch uses the channel-first convention: (channels, height, width)
    image = np.reshape(pixels, (1, 28, 28))
    label = self.data.iloc[index, 0]

    if self.transform is None:
      return image, label

    # run the whole preprocessing pipeline on the image
    return self.transform(image), label


# Preprocessing pipeline, applied to every image by DatasetMNIST.__getitem__
transform = transforms.Compose([
    # The dataset returns a channel-first (1, 28, 28) uint8 ndarray, but ToPILImage
    # reads ndarrays as height x width x channels and would see 28 channels.
    # Tensors are read as channels x height x width, so convert to a tensor first.
    transforms.Lambda(torch.from_numpy),
    transforms.ToPILImage(),
    transforms.RandomHorizontalFlip(0.5), # flip left-right with probability 0.5
    transforms.ToTensor(), # back to a float tensor, channel first
    ])

# The pipeline only runs if it is handed to the dataset.
# NOTE(review): '.../input/train.csv' looks like a placeholder path — confirm the real location of the CSV.
train_dataset = DatasetMNIST(file_path='.../input/train.csv', transform=transform)

But when we run the training, we want to do it on a batch of images and not on a single one, so we use a DataLoader

In [None]:
import torchvision
from torch.utils.data import DataLoader

# Number of images in each training batch. It has to be defined before the
# DataLoader is built; 64 is only a reasonable default, tune it as needed.
batch_size_train = 64

# Downloads MNIST (training split) into '/files/' if needed and serves it in shuffled batches
train_loader = DataLoader(
    torchvision.datasets.MNIST('/files/', train=True, download=True,
                               transform=torchvision.transforms.Compose([
                               torchvision.transforms.ToTensor(),
                               # per-channel (mean,), (std,) — presumably the MNIST statistics; TODO confirm
                               torchvision.transforms.Normalize((0.1307,), (0.3081,))
                               ])),
    batch_size = batch_size_train, shuffle=True
)

# LOSS
Loss functions are classes of the `torch.nn` package (they extend `torch.nn.Module`)

In [None]:
# Cross entropy: an example of a loss for classification problems
loss_fn = nn.CrossEntropyLoss()

# 3 samples, each with one score for each of the 5 classes
pred = torch.rand(3, 5).requires_grad_()
# 3 random class indices between 0 and 4 (the upper bound 5 is excluded)
ground_truth = torch.empty(3, dtype=torch.long).random_(5)

loss = loss_fn(input=pred, target=ground_truth)

# compute the gradients of the loss
loss.backward()

# OPTIMIZER
Now we need to update the parameters after computing the backward pass

In [None]:
from torch import optim
# An optimizer receives at least the parameters to update and, as basic hyper-parameter, the learning rate

# STOCHASTIC GRADIENT DESCENT
optimizer = optim.SGD(model.parameters(), lr=0.01, momentum=0.9) # momentum = lambda in the momentum formula

# ADAPTIVE MOMENT ESTIMATION
# Built on the model parameters (the var1 / var2 used before were never defined)
# and kept under its own name so that it does not replace the SGD optimizer above.
adam_optimizer = optim.Adam(model.parameters(), lr=0.0001)

# step() is the call that actually updates the parameters
optimizer.step()


#FULL FLOW
def flow(dataset):
  """Run one optimisation pass over `dataset`.

  `dataset` is iterated as (input, target) pairs. The function uses the
  module-level `model`, `loss_fn` and `optimizer` objects and returns nothing.
  """
  for batch_inputs, batch_targets in dataset:
    optimizer.zero_grad() # clear the gradients left by the previous iteration
    batch_loss = loss_fn(model(batch_inputs), batch_targets) # forward pass, then loss
    batch_loss.backward() # gradient of the loss for each parameter
    optimizer.step() # parameter update

# TRAINING A NEURAL NETWORK
We train NN in epochs, an epoch is a full iteration on the dataset, so for example if we have 1000 images, divided in 10 batches of 100 images, when we have given all 10 batches to the model the first time we have completed the first epoch.

Since there isn't a clear stopping point for the training phase (for complex networks it is usually impossible to reach a minimum of the loss function), we can fix a number of epochs for the training and iterate over them.

A better option is to use a validation set to compute the errors at each epoch and stop training when this error doesn't improve.

Other options can be: learning rate scheduling --> decrease learning rate gradually to get close to the minimum

In [None]:
# IMPORTANT --> as seen in theory, this loss is designed for multi-class classification
# problems: it applies a softmax internally, which makes the outputs sum to 1 like
# probabilities, so we do not have to normalise the network outputs ourselves.
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(model.parameters(), lr=0.01, momentum=0.9)
# IF WE WANT TO TRAIN ON GPU:
# model = model.cuda()
for epoch in range(100):
  tot_loss, tot_samples = 0.0, 0
  # train_loader is the DataLoader built earlier in the notebook
  for inputs, labels in train_loader:
    tot_samples += inputs.size(0)

    # IF WE WANT TO TRAIN ON GPU:
    # inputs = inputs.cuda()
    # labels = labels.cuda()
    # ! --> this is possible because we do not transfer the whole dataset to the GPU,
    # but only one batch at a time

    # Zero the gradients of the weights, otherwise gradients of different batches
    # accumulate: after backward() layer.weight.grad is != 0, which would corrupt
    # the next gradient computation.
    optimizer.zero_grad()
    # The loader yields image batches of shape (batch, 1, 28, 28), while the fully
    # connected model expects (batch, 784): flatten everything but the batch dimension.
    preds = model(inputs.flatten(start_dim=1)) # the whole batch of inputs at once
    loss = criterion(preds, labels)
    loss.backward()
    optimizer.step()

    # .item() turns the loss into a plain Python number. That number is the mean loss
    # over the batch: the model receives many inputs together, one cross-entropy value
    # is computed per input (e.g. batch of 10 --> 10 loss values) and the criterion
    # averages them. Multiplying by the batch size gives the total loss of the batch.
    tot_loss += loss.item() * inputs.size(0)

  # End of epoch: average loss per sample over the entire dataset, to spot the
  # plateau across epochs that tells us we have converged.
  avg_loss = tot_loss*1.0/float(tot_samples)
  print(f'Epoch {epoch} average loss for sample is: {avg_loss:.6f}')