In [2]:
import torch
import numpy as np

In [3]:
torch.__version__

'2.1.0'

In [4]:
device = torch.device('cpu')
device

device(type='cpu')

# Let's start

<img src='../ANN/japan.png'/>

In [5]:
'''
m = 5, n = 3(Temp, Rainfall, Humidity)
y = yield of Apples, Orange
'''

'\nm = 5, n = 3(Temp, Rainfall, Humidity)\ny = yield of Apples, Orange\n'

## 1: Specify X and y

In [6]:
# Input (temp, rainfall, humidity)
X_train = np.array([[73, 67, 43], [91, 88, 64], [87, 134, 58], 
                   [102, 43, 37], [69, 96, 70], [73, 67, 43], 
                   [91, 88, 64], [87, 134, 58], [102, 43, 37], 
                   [69, 96, 70], [73, 67, 43], [91, 88, 64], 
                   [87, 134, 58], [102, 43, 37], [69, 96, 70]], 
                  dtype='float32')

# Targets (apples, oranges)
Y_train = np.array([[56, 70], [81, 101], [119, 133], 
                    [22, 37], [103, 119], [56, 70], 
                    [81, 101], [119, 133], [22, 37], 
                    [103, 119], [56, 70], [81, 101], 
                    [119, 133], [22, 37], [103, 119]], 
                   dtype='float32')

In [7]:
# have to convert numpy to torch tensors
# torch tensors are simply numpy version of pytorch
inputs  = torch.from_numpy(X_train)
targets = torch.from_numpy(Y_train)

print(type(inputs))
print(type(targets))

<class 'torch.Tensor'>
<class 'torch.Tensor'>


In [8]:
inputs.shape, targets.shape
inputs.size(), targets.size()

(torch.Size([15, 3]), torch.Size([15, 2]))

## 2. DataLoaders

Remember in ML - we have to do the batch learning ourselve, like mini-batch, stochastic batch. 
Pytorch has a class called DataLoaders that automatically do this for you.

It's optional whether you wanna use it or just use it - don't reinvent the wheel.

In [9]:
# Define a Dataset so that dataloaders understand

from torch.utils.data import TensorDataset

train_dataset = TensorDataset(inputs, targets)

train_dataset[0]
# get the 3 samples from the dataset
# train_dataset[0:3]

(tensor([73., 67., 43.]), tensor([56., 70.]))

In [10]:
from torch.utils.data import DataLoader

batch_size = 3 # for no reason - here I am using a mini-batch of size 3
train_dl = DataLoader(train_dataset, batch_size, shuffle = True, num_workers = 4)

In [11]:
for x,y in train_dl:
    print("----new batch----")
    print(x) # this will give you "batch size " of x, e.g. 3 sets of x
    print(y) # this will give you "batch size " of y, e.g. 3 sets of y

----new batch----
tensor([[102.,  43.,  37.],
        [ 69.,  96.,  70.],
        [ 87., 134.,  58.]])
tensor([[ 22.,  37.],
        [103., 119.],
        [119., 133.]])
----new batch----
tensor([[ 91.,  88.,  64.],
        [ 73.,  67.,  43.],
        [ 87., 134.,  58.]])
tensor([[ 81., 101.],
        [ 56.,  70.],
        [119., 133.]])
----new batch----
tensor([[ 91.,  88.,  64.],
        [ 87., 134.,  58.],
        [102.,  43.,  37.]])
tensor([[ 81., 101.],
        [119., 133.],
        [ 22.,  37.]])
----new batch----
tensor([[69., 96., 70.],
        [73., 67., 43.],
        [91., 88., 64.]])
tensor([[103., 119.],
        [ 56.,  70.],
        [ 81., 101.]])
----new batch----
tensor([[ 73.,  67.,  43.],
        [102.,  43.,  37.],
        [ 69.,  96.,  70.]])
tensor([[ 56.,  70.],
        [ 22.,  37.],
        [103., 119.]])


## 3. Define some layers

Don't be confuse about "Layers". it's simply matrix multiplication. It's true until now.

In [12]:
import torch.nn as nn # nn = neural nework

In [13]:
# define model - linear regression
some_random_layer = nn.Linear(3,2) # 3 = incoming features, 2 = output features

In [14]:
print(some_random_layer.weight) # matrix - randomly initialize --> using standardization methods.
print(some_random_layer.weight.shape)
print(some_random_layer.bias)
print(some_random_layer.bias.shape)

Parameter containing:
tensor([[ 0.0513, -0.2395,  0.5170],
        [-0.0656, -0.1019, -0.3464]], requires_grad=True)
torch.Size([2, 3])
Parameter containing:
tensor([ 0.0084, -0.4997], requires_grad=True)
torch.Size([2])


Note: output features is 2. So, bias has 2.

In [15]:
# predict 

outputs = some_random_layer(inputs)
# input = (15,3) @ (3,2) = (15, 2)
outputs.shape

torch.Size([15, 2])

In [16]:
# how do chatgpt count their number of parameters - 175b parameters

total_num_of_params = 0
for param in some_random_layer.parameters():
    print(param)
    # total_num_of_params+=

Parameter containing:
tensor([[ 0.0513, -0.2395,  0.5170],
        [-0.0656, -0.1019, -0.3464]], requires_grad=True)
Parameter containing:
tensor([ 0.0084, -0.4997], requires_grad=True)


print(param)

    Parameter containing: [the first parameters, w = 6]
    tensor([[ 0.5407,  0.1850, -0.3178],
            [ 0.1833,  0.2263,  0.1200]], requires_grad=True) 
    
    Parameter containing: [the second parameters, bias = 2]
    tensor([-0.2637,  0.3207], requires_grad=True)

In [17]:
for param in some_random_layer.parameters():
    print(param.numel())

6
2


In [18]:
total_num_of_params = 0
for param in some_random_layer.parameters():
    total_num_of_params += param.numel()
    
print(total_num_of_params)

8


## 4. Define Loss function

In [21]:
# recall that this is regression problem
# so we have to use MSE
# you can code by yourself - or you can cuse built-in pytorch module

criterion = nn.MSELoss()

In [22]:
mse = criterion (targets, outputs)

In [24]:
print(mse)
print(mse.item())

tensor(11465.2617, grad_fn=<MseLossBackward0>)
11465.26171875


## 5. Define our gradient descent algorithm

Recall we use the gradient descent algorithm w = w- alpha * gradient

In fact, there are more, momentum, Adam --> adaptive learning rate

In [26]:
optimizer = torch.optim.SGD(some_random_layer.parameters(), lr = 0.0001, momentum=0.9)

## 6. Putting them together - actually Learning

In [31]:
# basically same as Linear and Logistic Regeression
num_epoch = 5
# 1. loop epoch
for epoch in range(num_epoch):

    # 2. loop the mini batch of samples (Dataloaders are using without-replacement)
    for batch_x, batch_y in train_dl:

        #optional: put your data into GPU using .device()
        
        # 2.1 Predict
        yhat = some_random_layer(batch_x)
        
        # 2.2 Calculate Loss
        loss = criterion(yhat,batch_y)
        
        # 2.3 Calculate Gradients
        optimizer.zero_grad() # clear all the gradient
        loss.backward() # this will calculate all the gradient in respect to loss
        
        # print the gradient
        # print(some_random_layer.weight.grad)
        # if most of the thing are in 1000, lr should be around 0.0001.
        # print the old weight
        # print(f"Original w: {some_random_layer.weight:.3f}")
        
        # 2.4 Update! --> Backpropagate
        optimizer.step()
        
        # print updated weight
        # print(f"Original w: {some_random_layer.weight:.3f}")
        
    # print the summary of each epoch
    print(f"Epoch: {epoch} | Loss: {loss:2f}")


tensor([[-1043.6965, -1454.0029,  -813.3240],
        [-4466.4971, -6062.3994, -3464.6118]])
tensor([[ 3144.7061,  3688.3086,  1952.3918],
        [15881.3496, 18912.5332,  9895.5957]])
tensor([[1892.9827, 2048.0273, 1317.4324],
        [8493.6543, 9415.7656, 5896.7383]])
tensor([[ -2432.1868,  -2473.5774,  -1777.0806],
        [-12372.4014, -12311.7236,  -8849.4268]])
tensor([[ -2446.6692,  -2205.6978,  -1676.7224],
        [-11635.2441, -10338.7871,  -7872.5747]])
Epoch: 0 | Loss: 13086.882812
tensor([[ -556.9272,  -930.1807,  -416.9375],
        [-1669.1648, -2973.3335, -1227.3447]])
tensor([[ 5730.7666,  5081.5713,  3186.3804],
        [26425.4648, 23417.2910, 14635.0010]])
tensor([[  -73.6984,  -154.0888,  -115.4095],
        [-1466.7733, -1893.8452, -1369.2241]])
tensor([[ -3157.6460,  -3124.6079,  -2246.1567],
        [-13864.6104, -14006.6562, -10034.7520]])
tensor([[ -2791.9802,  -3587.3052,  -1720.7606],
        [-11698.4189, -15605.0293,  -7379.4551]])
Epoch: 1 | Loss: 16743