# Beginner tutorial on PyTorch and ANN - Artificial Neural Network

In [2]:
import torch
import numpy as np
import sys

In [3]:
# deep learning can run on the CPU or on a GPU;
# this checks whether PyTorch can use CUDA (an NVIDIA GPU) on this machine
torch.cuda.is_available()

False

In [4]:
# to target the first GPU instead, use torch.device('cuda:0')
# (device strings are written '<type>:<index>', with a colon)
device = torch.device('cpu')
device

device(type='cpu')

## Let's start

<img src = 'japan.png'/>

In [6]:
# Problem set-up notes, kept as a bare string so the notebook echoes them as
# the cell output. Presumably m is the number of distinct samples and n the
# number of input features (the training arrays repeat those 5 rows 3 times).
'''
m = 5
n = 3 --> temp, rain, humidity
y = yield of apples and orange
'''

'\nm = 5\nn = 3 --> temp, rain, humidity\ny = yield of apples and orange\n'

## Step 1 . Specify X and y

In [7]:
# The training set is five distinct observations repeated three times,
# so it is built from the five base rows instead of spelling out all 15.
_REPEATS = 3

# Input (temp, rainfall, humidity)
_base_inputs = [
    [73, 67, 43],
    [91, 88, 64],
    [87, 134, 58],
    [102, 43, 37],
    [69, 96, 70],
]
X_train = np.array(_base_inputs * _REPEATS, dtype='float32')

# Targets (apples, oranges)
_base_targets = [
    [56, 70],
    [81, 101],
    [119, 133],
    [22, 37],
    [103, 119],
]
y_train = np.array(_base_targets * _REPEATS, dtype='float32')

In [8]:
# NumPy arrays have to be converted to torch tensors before PyTorch can use them.
# A torch tensor is PyTorch's counterpart of a NumPy array.
inputs = torch.from_numpy(X_train)
targets = torch.from_numpy(y_train)

# Both `inputs` and `targets` are now torch.Tensor objects.
# Unlike NumPy arrays, tensors can keep track of gradients, which is what
# gradient descent needs at every step.

# `.shape` and `.size()` report the same thing; only the last expression of a
# cell is displayed, so a single call is kept here.
inputs.size(), targets.size()

(torch.Size([15, 3]), torch.Size([15, 2]))

## 2. Dataloaders

Remember that in ML we have to do the batching ourselves, e.g. mini-batch or stochastic (one sample at a time). PyTorch has a class called `DataLoader` that does this for you automatically.

Using it is optional, but you may as well just use it - don't reinvent the wheel.

In [9]:
# A DataLoader consumes a Dataset, so pair the two tensors up row by row first.
from torch.utils.data import TensorDataset

train_dataset = TensorDataset(inputs, targets)

# indexing the dataset gives the (inputs, targets) pair of that row
train_dataset[0]

(tensor([73., 67., 43.]), tensor([56., 70.]))

In [10]:
from torch.utils.data import DataLoader

# mini-batch size; 3 is an arbitrary choice for this demo
batch_size = 3

# shuffle is switched off here so the batches come out in dataset order;
# for real training you should use shuffle=True
train_dl = DataLoader(train_dataset, batch_size=batch_size, shuffle=False)

In [11]:
for x, y in train_dl:
    # x is one batch of inputs and y the matching batch of targets,
    # i.e. batch_size (3) rows each
    print("------new batch--------", x, y, sep="\n")

------new batch--------
tensor([[ 73.,  67.,  43.],
        [ 91.,  88.,  64.],
        [ 87., 134.,  58.]])
tensor([[ 56.,  70.],
        [ 81., 101.],
        [119., 133.]])
------new batch--------
tensor([[102.,  43.,  37.],
        [ 69.,  96.,  70.],
        [ 73.,  67.,  43.]])
tensor([[ 22.,  37.],
        [103., 119.],
        [ 56.,  70.]])
------new batch--------
tensor([[ 91.,  88.,  64.],
        [ 87., 134.,  58.],
        [102.,  43.,  37.]])
tensor([[ 81., 101.],
        [119., 133.],
        [ 22.,  37.]])
------new batch--------
tensor([[69., 96., 70.],
        [73., 67., 43.],
        [91., 88., 64.]])
tensor([[103., 119.],
        [ 56.,  70.],
        [ 81., 101.]])
------new batch--------
tensor([[ 87., 134.,  58.],
        [102.,  43.,  37.],
        [ 69.,  96.,  70.]])
tensor([[119., 133.],
        [ 22.,  37.],
        [103., 119.]])


## 3. Define some layers

Don't be confused by layers: a layer is simply a matrix multiplication. That is all there is to it so far.

In [12]:
import torch.nn as nn  # nn ==> neural network

In [13]:
# A linear (fully connected) layer: 3 incoming features in, 2 output features out
some_random_layer = nn.Linear(in_features=3, out_features=2)

In [14]:
# The layer's weight matrix and bias vector start out randomly initialised.
# Print each parameter followed by its shape.
for _layer_param in (some_random_layer.weight, some_random_layer.bias):
    print(_layer_param)
    print(_layer_param.shape)

Parameter containing:
tensor([[ 0.5153,  0.5054,  0.3009],
        [ 0.3160, -0.3148, -0.5201]], requires_grad=True)
torch.Size([2, 3])
Parameter containing:
tensor([ 0.2187, -0.2698], requires_grad=True)
torch.Size([2])


In [15]:
outputs = some_random_layer (inputs) # apply the layer: matrix-multiply the inputs by the layer's weights (plus the bias)
# shapes: inputs (15,3) @ weights (3,2) = outputs (15,2)
# (the weight is stored as (2,3), so it is its transpose that is multiplied)

outputs.shape

torch.Size([15, 2])

In [16]:
# How do models such as ChatGPT arrive at their parameter counts (e.g. 175B parameters)?

# count their number of parameters
# some_random_layer.parameters()

In [17]:
# Count the layer's parameters: show every parameter tensor and add up
# how many scalar entries each one holds.
_entries_per_tensor = []
for param in some_random_layer.parameters():
    print(param)
    _entries_per_tensor.append(param.numel())

total_num_of_param = sum(_entries_per_tensor)
print("total number of parameter=", total_num_of_param)

Parameter containing:
tensor([[ 0.5153,  0.5054,  0.3009],
        [ 0.3160, -0.3148, -0.5201]], requires_grad=True)
Parameter containing:
tensor([ 0.2187, -0.2698], requires_grad=True)
total number of parameter= 8


## 4. Define loss function

In [20]:
# This is a regression problem, so the loss is the mean squared error (MSE).
# It could be coded by hand, but PyTorch already ships it as a module.
# reduction='mean' is the default: average the squared errors over all elements.
criterion = nn.MSELoss(reduction='mean')

In [21]:
# show the target values that the layer's outputs are compared against next
targets

tensor([[ 56.,  70.],
        [ 81., 101.],
        [119., 133.],
        [ 22.,  37.],
        [103., 119.],
        [ 56.,  70.],
        [ 81., 101.],
        [119., 133.],
        [ 22.,  37.],
        [103., 119.],
        [ 56.,  70.],
        [ 81., 101.],
        [119., 133.],
        [ 22.,  37.],
        [103., 119.]])

In [24]:
# nn.MSELoss is called as criterion(input, target): predictions first,
# ground truth second -- the same order the training loop uses.
# (The MSE value itself is identical either way, since squared error is symmetric.)
mse = criterion(outputs, targets)
print("get mse = ", mse)               # the loss as a tensor, still attached to the autograd graph
print("get mse value = ", mse.item())  # the same loss as a plain Python float

get mse =  tensor(9202.0371, grad_fn=<MseLossBackward0>)
get mse value =  9202.037109375


## 5. Define our gradient descent algorithm

Recall that gradient descent uses the update rule $w \leftarrow w - \alpha \cdot \text{gradient}$.
In fact, there are more variants, such as momentum and Adam (adaptive learning rate).

In [27]:
# Stochastic gradient descent over the layer's parameters, with momentum.
optimizer = torch.optim.SGD(
    some_random_layer.parameters(),
    lr=0.0001,     # learning rate (alpha)
    momentum=0.9,
)

## Putting them together - actually learning!

basically same as linear / logistic regression

- 1. loop epochs
    - 2. loop over the mini-batches of samples (DataLoaders sample without replacement)
        - Optional: move your data to the GPU using `.to(device)` (if you have a GPU)
        - 2.1 Predict
        - 2.2 Calculate loss
        - 2.3 Calculate gradients (backpropagation)
        - 2.4 Update the weights

    Print the summary of each epoch

In [37]:
# ## Putting them together - actually learning!

# basically same as linear / logistic regression
num_epochs = 5

# The model has to live on the same device as the batches fed to it.
# nn.Module.to() moves the parameters in place, so no re-assignment is needed.
some_random_layer.to(device)

# 1. loop epochs
for epoch in range(num_epochs):

    # 2. loop over the mini-batches of samples (DataLoaders sample without replacement)
    for batch_x, batch_y in train_dl:

        # Optional: move your data to the GPU using .to(device) (if you have a GPU).
        # Tensor.to() does NOT work in place -- it returns the tensor on the
        # requested device, so the result has to be assigned back.
        batch_x = batch_x.to(device)
        batch_y = batch_y.to(device)

        # 2.1 Predict
        yhat = some_random_layer(batch_x)

        # 2.2 Calculate loss (predictions first, targets second)
        loss = criterion(yhat, batch_y)

        # 2.3 Calculate gradients (backpropagation)
        optimizer.zero_grad()  # clear the gradients left over from the previous batch
        loss.backward()        # compute the gradient of the loss with respect to every parameter

        # print (some_random_layer.weight.grad)
        ## if most of the gradient values are in the 1000s, your learning rate should be around 0.0001

        # monitor the weight
        print("Original: ", some_random_layer.weight)

        # 2.4 Update the weights using the gradients computed above
        optimizer.step()
        print("Updated: ", some_random_layer.weight)

    # Print the summary of each epoch.
    # Note: this is the loss of the LAST mini-batch of the epoch, not an epoch average.
    print(f"Epoch: {epoch} | Loss: {loss.item():.2f}")

Original:  Parameter containing:
tensor([[-0.3935,  0.8477,  0.6812],
        [-0.3005,  0.7938,  0.9069]], requires_grad=True)
Updated:  Parameter containing:
tensor([[-0.4019,  0.8403,  0.6758],
        [-0.2973,  0.7963,  0.9084]], requires_grad=True)
Original:  Parameter containing:
tensor([[-0.4019,  0.8403,  0.6758],
        [-0.2973,  0.7963,  0.9084]], requires_grad=True)
Updated:  Parameter containing:
tensor([[-0.3986,  0.8436,  0.6786],
        [-0.2970,  0.7965,  0.9082]], requires_grad=True)
Original:  Parameter containing:
tensor([[-0.3986,  0.8436,  0.6786],
        [-0.2970,  0.7965,  0.9082]], requires_grad=True)
Updated:  Parameter containing:
tensor([[-0.3908,  0.8510,  0.6833],
        [-0.2982,  0.7951,  0.9072]], requires_grad=True)
Original:  Parameter containing:
tensor([[-0.3908,  0.8510,  0.6833],
        [-0.2982,  0.7951,  0.9072]], requires_grad=True)
Updated:  Parameter containing:
tensor([[-0.3952,  0.8472,  0.6801],
        [-0.2988,  0.7945,  0.9069]], 