# Getting Started

In [37]:
from __future__ import print_function
import torch
import torch.nn as nn

# Show which PyTorch build this notebook is running against.
print(torch.__version__)

# ``torch.empty`` allocates storage without initializing it, so the printed
# values are whatever happened to be in memory and can differ between runs.
x = torch.empty((5, 3))
print(x)
# Cast those (arbitrary) values to 32-bit integers.
print(x.to(torch.int32))
# A 5x3 tensor of zeros stored as 16-bit integers (``torch.short``).
x = torch.zeros((5, 3), dtype=torch.int16)
print(x)

1.2.0
tensor([[-7.5053e+25,  8.8282e-43, -7.5053e+25],
        [ 8.8282e-43, -7.5053e+25,  8.8282e-43],
        [-7.5053e+25,  8.8282e-43, -7.5053e+25],
        [ 8.8282e-43, -7.5053e+25,  8.8282e-43],
        [-7.5053e+25,  8.8282e-43, -7.5053e+25]])
tensor([[-2147483648,           0, -2147483648],
        [          0, -2147483648,           0],
        [-2147483648,           0, -2147483648],
        [          0, -2147483648,           0],
        [-2147483648,           0, -2147483648]], dtype=torch.int32)
tensor([[0, 0, 0],
        [0, 0, 0],
        [0, 0, 0],
        [0, 0, 0],
        [0, 0, 0]], dtype=torch.int16)


In [29]:
x = torch.randn(1)
print(x)
print(x.item())  # ``.item()`` returns the single element as a Python number

# The GPU part of this cell only runs when CUDA is available; it uses a
# ``torch.device`` object to create tensors on, and move tensors to, the GPU.
if not torch.cuda.is_available():
    print("No cuda")
else:
    device = torch.device("cuda")          # a CUDA device object
    y = torch.ones_like(x, device=device)  # create a tensor directly on the GPU
    x = x.to(device)                       # a plain string also works: ``.to("cuda")``
    z = x + y
    print(z)
    print(z.to("cpu", torch.double))       # ``.to`` can change device and dtype in one call

torch.device

tensor([0.6041])
0.6041225790977478
tensor([1.6041], device='cuda:0')
tensor([1.6041], dtype=torch.float64)


torch.device

# AUTOGRAD: AUTOMATIC DIFFERENTIATION

In [30]:
# Track operations on x so that gradients can be computed with respect to it.
x = torch.ones(2, 2, requires_grad=True)
print(x)
y = x + 2
print(y)
z = y * y * 3
out = z.sum()  # scalar: sum over all elements of 3 * (x + 2)^2

print(z, out)
# out is a scalar, so backward() is called without an explicit gradient argument.
out.backward()
# d(out)/dx = 6 * (x + 2), which is 18 for every element because x is all ones.
print(x.grad)

tensor([[1., 1.],
        [1., 1.]], requires_grad=True)
tensor([[3., 3.],
        [3., 3.]], grad_fn=<AddBackward0>)
tensor([[27., 27.],
        [27., 27.]], grad_fn=<MulBackward0>) tensor(108., grad_fn=<SumBackward0>)
tensor([[18., 18.],
        [18., 18.]])


In [31]:
x = torch.randn(3, requires_grad=True)
print(x)
y = x * 2
# Keep doubling y until its L2 norm reaches at least 1000.  The norm is only
# needed for the loop test, so it is computed on ``y.detach()``: unlike the
# legacy ``.data`` attribute, ``detach()`` is the supported way to read values
# without recording the operation in the autograd graph.
while y.detach().norm() < 1000:
    y = y * 2

# y is not a scalar, so it is reduced with sum() before calling backward().
# Each element of y is x times the same power of two, so every entry of
# x.grad equals that power of two.
y.sum().backward()
print(y)
print(x.grad)

tensor([ 0.1526,  0.0538, -0.6229], requires_grad=True)
tensor([  312.6021,   110.1364, -1275.7661], grad_fn=<MulBackward0>)
tensor([2048., 2048., 2048.])


# NEURAL NETWORKS

### CLASS: Conv2d
torch.nn.Conv2d (in_channels, out_channels, kernel_size, stride=1, padding=0, dilation=1, groups=1, bias=True, padding_mode='zeros')

##### Parameters
in_channels (int) – Number of channels in the input image

out_channels (int) – Number of channels produced by the convolution

kernel_size (int or tuple) – Size of the convolving kernel

stride (int or tuple, optional) – Stride of the convolution. Default: 1

padding (int or tuple, optional) – Zero-padding added to both sides of the input. Default: 0

padding_mode (string, optional) – Padding mode used for the input borders. Default: 'zeros'

dilation (int or tuple, optional) – Spacing between kernel elements. Default: 1

groups (int, optional) – Number of blocked connections from input channels to output channels. Default: 1

bias (bool, optional) – If True, adds a learnable bias to the output. Default: True

### CLASS: MaxPool2d
torch.nn.MaxPool2d(kernel_size, stride=None, padding=0, dilation=1, return_indices=False, ceil_mode=False)

##### Parameters
kernel_size – the size of the window to take a max over

stride – the stride of the window. Default value is kernel_size

padding – implicit zero padding to be added on both sides

dilation – a parameter that controls the stride of elements in the window

return_indices – if True, will return the max indices along with the outputs. Useful for torch.nn.MaxUnpool2d later

ceil_mode – when True, will use ceil instead of floor to compute the output shape

### Next cell explained
#### conv output
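$\text{output} = \lfloor (W - F + 2P) / S \rfloor + 1$, where $W$ is the input width, $F$ the filter size, $P$ the padding and $S$ the stride. With $S = 1$ and $P = 0$ this reduces to $W - F + 1$.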

#### max_pool  output
$\text{output} = \lfloor (W + 2P - D(F - 1) - 1) / S \rfloor + 1$, where $D$ is the dilation. With $D = 1$ and $P = 0$ this reduces to $\lfloor (W - F) / S \rfloor + 1$.

#### The stride is the number of positions the filter shifts at each step
###### https://pytorch.org/docs/stable/nn.html#maxpool2d

- (32x32x1)  -> conv1 -> (32-3+1)     -> (30 x 30 x  6)       !conv filter=3  
- (30x30x6)  -> max1  -> (30-2)/2 + 1 -> (15 x 15 x  6)   !max_pool filter=2, stride=2
- (15x15x6)  -> conv2 -> (15-3+1)     -> (13 x 13 x 16)
- (13x13x16) -> max2  -> floor((13-2)/2) + 1 -> ( 6 x 6  x 16)    !ceil_mode=False, so floor is used to compute the output shape
- (6x6x16)= 576 -> Linear -> 120
- 120 -> Linear -> 84
-  84 -> Linear -> output_dim 10

In [32]:
import torch.nn as nn
# One pooling layer and one convolution are reused for both stages; this cell
# only walks through the output shapes.
m = nn.MaxPool2d(kernel_size=2, stride=2)
c = nn.Conv2d(in_channels=3, out_channels=3, kernel_size=3)
input = torch.randn(1, 3, 32, 32)  # (batch, channels, height, width)

# conv -> pool -> conv -> pool, printing the shape after every step.
o1 = c(input)
print('out1=', o1.shape)
o2 = m(o1)
print('out2=', o2.shape)
o3 = c(o2)
print('out3=', o3.shape)
o4 = m(o3)
print('out4=', o4.shape)

out1= torch.Size([1, 3, 30, 30])
out2= torch.Size([1, 3, 15, 15])
out3= torch.Size([1, 3, 13, 13])
out4= torch.Size([1, 3, 6, 6])


In [33]:
# Activation under test (alternative to try: nn.LeakyReLU(0.1)).
m = nn.ReLU()
input = torch.randn(4, 2)
print(input)
# ReLU replaces every negative entry with zero and keeps the rest unchanged.
output = m(input)
print(output)
output.shape[1:]  # every dimension except the first

tensor([[-1.0532,  1.9126],
        [ 0.0570,  1.0065],
        [ 0.6955, -0.3806],
        [-0.3456,  1.4938]])
tensor([[0.0000, 1.9126],
        [0.0570, 1.0065],
        [0.6955, 0.0000],
        [0.0000, 1.4938]])


torch.Size([2])

In [34]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class Net(nn.Module):
    """Small convolutional network: two conv/ReLU/max-pool stages, then three linear layers.

    ``fc1`` expects 16 * 6 * 6 = 576 flattened features, which is what the
    convolutional stages produce for a single-channel 32x32 input.
    """

    def __init__(self):
        super(Net, self).__init__()
        # Convolutions: 1 -> 6 channels, then 6 -> 16 channels, both 3x3 kernels.
        self.conv1 = nn.Conv2d(in_channels=1, out_channels=6, kernel_size=3)
        self.conv2 = nn.Conv2d(in_channels=6, out_channels=16, kernel_size=3)
        # Fully connected layers (affine maps y = Wx + b); the first one takes
        # the 16 channels of 6x6 feature maps flattened into one vector.
        self.fc1 = nn.Linear(16 * 6 * 6, 120)
        self.fc2 = nn.Linear(120, 84)
        self.fc3 = nn.Linear(84, 10)

    def forward(self, x):
        """Run the network on ``x`` and return the 10 outputs of ``fc3`` per sample."""
        # Stage 1: convolution, ReLU, max pooling over a (2, 2) window.
        x = self.conv1(x)
        x = F.relu(x)
        x = F.max_pool2d(x, (2, 2))
        # Stage 2: same pattern; a single number is enough for a square window.
        x = self.conv2(x)
        x = F.relu(x)
        x = F.max_pool2d(x, 2)
        # Flatten everything except the batch dimension.
        flat = self.num_flat_features(x)
        x = x.view(-1, flat)
        # Classifier head: two hidden layers with ReLU, then the output layer.
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        return self.fc3(x)

    def num_flat_features(self, x):
        """Return the product of all dimensions of ``x`` except the first (batch) one."""
        count = 1
        for extent in x.shape[1:]:
            count *= extent
        return count


net = Net()
print(net)

# All learnable tensors of the network, in registration order.
params = list(net.parameters())

# Print the position and shape of every parameter tensor.
for i, p in enumerate(params):
    print(i, p.size())

# Batch size 1, 1 input channel, height 32, width 32.
input = torch.randn(1, 1, 32, 32)

out = net(input)
print('out=', out)

# The output is not a scalar, so backward() is given a tensor of the same
# shape (random here) to backpropagate from.
net.zero_grad()
a = torch.randn(1, 10)
print(a)
out.backward(a)


Net(
  (conv1): Conv2d(1, 6, kernel_size=(3, 3), stride=(1, 1))
  (conv2): Conv2d(6, 16, kernel_size=(3, 3), stride=(1, 1))
  (fc1): Linear(in_features=576, out_features=120, bias=True)
  (fc2): Linear(in_features=120, out_features=84, bias=True)
  (fc3): Linear(in_features=84, out_features=10, bias=True)
)
0 torch.Size([6, 1, 3, 3])
1 torch.Size([6])
2 torch.Size([16, 6, 3, 3])
3 torch.Size([16])
4 torch.Size([120, 576])
5 torch.Size([120])
6 torch.Size([84, 120])
7 torch.Size([84])
8 torch.Size([10, 84])
9 torch.Size([10])
out= tensor([[-0.0205,  0.0699, -0.0033,  0.0544,  0.1031,  0.0382,  0.1040,  0.0050,
          0.0070,  0.1053]], grad_fn=<AddmmBackward>)
tensor([[-0.9842,  0.5793,  0.3579,  0.6251,  0.2562, -0.4711,  2.3612,  0.1180,
         -0.8601,  0.0359]])


In [35]:
output = net(input)
target = torch.randn(10)  # a dummy target, for example
#print(target.shape)
target = target.view(1, -1)  # reshape to (1, 10) so it has the same shape as output
#print(target.shape)

criterion = nn.MSELoss()
loss = criterion(output, target)
print(loss)
# Walk backwards through the autograd graph, starting from the loss.
print(loss.grad_fn)  # MSELoss
print(loss.grad_fn.next_functions[0][0])  # node that produced `output` (the last Linear layer)
# First input of that node.  In the recorded run this printed an AccumulateGrad
# node (the gradient accumulator of a leaf tensor), not a ReLU node.
print(loss.grad_fn.next_functions[0][0].next_functions[0][0])

tensor(1.5181, grad_fn=<MseLossBackward>)
<MseLossBackward object at 0x00000276F625BAC8>
<AddmmBackward object at 0x00000276818CD390>
<AccumulateGrad object at 0x00000276F625BAC8>


In [36]:
net.zero_grad()     # zeroes the gradient buffers of all parameters

print('conv1.bias.grad before backward')
print(net.conv1.bias.grad)

loss.backward()

print('conv1.bias.grad after backward')
print(net.conv1.bias.grad)

# Manual SGD step: weight = weight - learning_rate * gradient.
# The in-place update runs under ``torch.no_grad()`` rather than through the
# legacy ``.data`` attribute, so autograd does not record the update but still
# knows that the parameters were modified.
learning_rate = 0.01
with torch.no_grad():
    for f in net.parameters():
        f.sub_(f.grad * learning_rate)

# The same update rule, this time performed by a built-in optimizer.
import torch.optim as optim

# create your optimizer (reuses the learning rate defined above)
optimizer = optim.SGD(net.parameters(), lr=learning_rate)

# in your training loop:
optimizer.zero_grad()   # zero the gradient buffers
output = net(input)
loss = criterion(output, target)
loss.backward()
optimizer.step()    # Does the update

conv1.bias.grad before backward
tensor([0., 0., 0., 0., 0., 0.])
conv1.bias.grad after backward
tensor([-0.0133, -0.0145,  0.0070,  0.0133,  0.0116,  0.0298])
