# Neural Network Architectures using PyTorch
Date: 2019-11-19  
Author: Jason Beach  
Categories: DataScience, DeepLearning  
Tags: pytorch, python, tag3  
<!--eofm-->

## Using and Creating Simple Artificial Neural Networks

In [47]:
import torch 
import torchvision
import torch.nn as nn

### Using a pre-trained model

In [48]:
# Download and load the pretrained ResNet-18
resnet = torchvision.models.resnet18(pretrained=True)

In [49]:
len(list(resnet.parameters()))

62

In [50]:
param = list(resnet.parameters())[0]

In [51]:
param.size()

torch.Size([64, 3, 7, 7])

In [52]:
# If you want to finetune only the top layer of the model, set as below.
for param in resnet.parameters():
    param.requires_grad = False

In [53]:
# Replace the top layer for finetuning.
resnet.fc = nn.Linear(resnet.fc.in_features, 100)  # 100 is an example.

# Forward pass.
images = torch.randn(64, 3, 224, 224)
outputs = resnet(images)
print (outputs.size())     # (64, 100)

torch.Size([64, 100])


### Save and load model

In [40]:
# Save and load the entire model
torch.save(resnet, 'model.ckpt')
model = torch.load('model.ckpt')

In [41]:
# Save and load only the model parameters (recommended)
torch.save(resnet.state_dict(), 'params.ckpt')
resnet.load_state_dict(torch.load('params.ckpt'))

<All keys matched successfully>

### Creating a simple perceptron

In [46]:
#create tensors
x = torch.tensor(1., requires_grad=True)
w = torch.tensor(2., requires_grad=True)
b = torch.tensor(3., requires_grad=True)

In [47]:
#build a computational graph
y = w * x + b    # y = 2 * x + 3
print(y)

tensor(5., grad_fn=<AddBackward0>)


In [48]:
#compute gradients
y.backward()

#print out the gradients.
print(x.grad)    #x.grad = 2 
print(w.grad)    #w.grad = 1 
print(b.grad)    #b.grad = 1 

tensor(2.)
tensor(1.)
tensor(1.)


### Using linear regression

In [17]:
import torch
from torch.autograd import Variable
import torch.nn as nn
from torch.nn import functional as F

x_data = Variable(torch.Tensor([[10.0], [9.0], [3.0], [2.0]] ))
y_data = Variable(torch.Tensor([[90.0], [80.0], [50.0], [30.0]]))

In [6]:
class LinearRegression(torch.nn.Module):    
    def __init__(self):
        super(LinearRegression, self).__init__()
        self.linear = torch.nn.Linear(1, 1)         #one indep var and makes one prediction for the Ŷ variable at a time   
    def forward(self, x):                           #forward pass refers to the calculation process of the output data from the input
        y_pred = self.linear(x)
        return y_pred  

In [7]:
model = LinearRegression()
criterion = torch.nn.MSELoss(size_average=False)    #loss function is calculated from the target y_data and the prediction y_pred in order to update weights
optimizer = torch.optim.SGD(model.parameters(), lr=0.01)    #SGD optimizer for the update of hyperparameters



In [8]:
#use 20 single passes of training and weight updates
for epoch in range(20):
    model.train()
    if (epoch < 3 | epoch > 18):
        print( list(model.parameters()) )
    # Forward pass
    y_pred = model(x_data)    
    # Compute Loss
    loss = criterion(y_pred, y_data)    
    # Backward pass - learning and updating the weights
    loss.backward()
    optimizer.step()          #performs a parameter update based on the current gradient
    optimizer.zero_grad()     #every time a variable is back-propagated through, the gradient will be accumulated instead of being replaced   

[Parameter containing:
tensor([[-2.9363e+08]], requires_grad=True), Parameter containing:
tensor([-36516880.], requires_grad=True)]
[Parameter containing:
tensor([[8.6319e+08]], requires_grad=True), Parameter containing:
tensor([1.0735e+08], requires_grad=True)]
[Parameter containing:
tensor([[-2.5375e+09]], requires_grad=True), Parameter containing:
tensor([-3.1557e+08], requires_grad=True)]


In [9]:
#make predictions
new_x = Variable(torch.Tensor([[4.0]]))
y_pred = model(new_x)
print("predicted Y value: ", y_pred.data[0][0])

predicted Y value:  tensor(-9.0442e+10)


### Update to logistic regression

In [10]:
class LogisticRegression(torch.nn.Module):
     def __init__(self):
        super(LogisticRegression, self).__init__()
        self.linear = torch.nn.Linear(1, 1)     
     def forward(self, x):
        y_pred = F.sigmoid(self.linear(x))          #<<< simple modification
        return y_pred

In [11]:
model = LogisticRegression()

In [12]:
criterion = torch.nn.BCELoss(size_average=True)     # used for binary output



In [13]:
#use 20 single passes of training and weight updates
for epoch in range(20):
    model.train()
    if (epoch < 3 | epoch > 18):
        print( list(model.parameters()) )
    # Forward pass
    y_pred = model(x_data)    
    # Compute Loss
    loss = criterion(y_pred, y_data)    
    # Backward pass - learning and updating the weights
    loss.backward()
    optimizer.step()          #performs a parameter update based on the current gradient
    optimizer.zero_grad()     #every time a variable is back-propagated through, the gradient will be accumulated instead of being replaced   

[Parameter containing:
tensor([[-0.9676]], requires_grad=True), Parameter containing:
tensor([0.8016], requires_grad=True)]
[Parameter containing:
tensor([[-0.9676]], requires_grad=True), Parameter containing:
tensor([0.8016], requires_grad=True)]
[Parameter containing:
tensor([[-0.9676]], requires_grad=True), Parameter containing:
tensor([0.8016], requires_grad=True)]




### Update to simple Artificial Neural Network

In [63]:
class Perceptron(torch.nn.Module):
    def __init__(self):
        super(Perceptron, self).__init__()
        self.linear = torch.nn.Linear(1, 1)
        self.relu = torch.nn.ReLU()
    def forward(self, x):
        y_pred = self.relu(self.linear(x))          #<<< simple mod instead of heavyside
        return y_pred

In [65]:
model = Perceptron()
criterion = torch.nn.MSELoss(size_average=False)    #loss function is calculated from the target y_data and the prediction y_pred in order to update weights
optimizer = torch.optim.SGD(model.parameters(), lr=0.01)    #SGD optimizer for the update of hyperparameters

In [66]:
#use 20 single passes of training and weight updates
for epoch in range(20):
    model.train()
    if (epoch < 3 | epoch > 18):
        print( list(model.parameters()) )
    # Forward pass
    y_pred = model(x_data)    
    # Compute Loss
    loss = criterion(y_pred, y_data)    
    # Backward pass - learning and updating the weights
    loss.backward()
    optimizer.step()          #performs a parameter update based on the current gradient
    optimizer.zero_grad()     #every time a variable is back-propagated through, the gradient will be accumulated instead of being replaced   

[Parameter containing:
tensor([[-0.8250]], requires_grad=True), Parameter containing:
tensor([-0.1126], requires_grad=True)]
[Parameter containing:
tensor([[-0.8250]], requires_grad=True), Parameter containing:
tensor([-0.1126], requires_grad=True)]
[Parameter containing:
tensor([[-0.8250]], requires_grad=True), Parameter containing:
tensor([-0.1126], requires_grad=True)]


### Using feed-forward

In [183]:
import torch 

x = torch.ones(1, requires_grad=True)    #if False then gradient will not be calculated automatically
print(x.grad)                            #returns None bc scalar

tensor([1.], requires_grad=True)
None


In [185]:
x = torch.ones(1, requires_grad=True)
y = x + 2
z = y * y * 2
z.backward()     # automatically calculates the gradient
print(x.grad)    # ∂z/∂x = 12

tensor([12.])


### Update for a Feedforward Network

[ref: blog](https://medium.com/biaslyai/pytorch-introduction-to-neural-network-feedforward-neural-network-model-e7231cff47cb)

In [67]:
class Feedforward(torch.nn.Module):
        def __init__(self, input_size, hidden_size):
            super(Feedforward, self).__init__()
            self.input_size = input_size
            self.hidden_size  = hidden_size
            self.fc1 = torch.nn.Linear(self.input_size, self.hidden_size)
            self.relu = torch.nn.ReLU()
            self.fc2 = torch.nn.Linear(self.hidden_size, 1)
            self.sigmoid = torch.nn.Sigmoid()        
        def forward(self, x):
            hidden = self.fc1(x)
            relu = self.relu(hidden)
            output = self.fc2(relu)
            output = self.sigmoid(output)
            return output

In [73]:
import numpy
from sklearn.datasets import make_blobs

def blob_label(y, label, loc):
    target = numpy.copy(y)
    for l in loc:
        target[y == l] = label
        return target
    
x_train, y_train = make_blobs(n_samples=40, n_features=2, cluster_std=1.5, shuffle=True)
x_train = torch.FloatTensor(x_train)
y_train = torch.FloatTensor(blob_label(y_train, 0, [0]))
y_train = torch.FloatTensor(blob_label(y_train, 1, [1,2,3]))
x_test, y_test = make_blobs(n_samples=10, n_features=2, cluster_std=1.5, shuffle=True)
x_test = torch.FloatTensor(x_test)
y_test = torch.FloatTensor(blob_label(y_test, 0, [0]))
y_test = torch.FloatTensor(blob_label(y_test, 1, [1,2,3]))

In [74]:
model = Feedforward(2, 10)
criterion = torch.nn.BCELoss()
optimizer = torch.optim.SGD(model.parameters(), lr = 0.01)

In [75]:
model.eval()
y_pred = model(x_test)
before_train = criterion(y_pred.squeeze(), y_test)
print('Test loss before training' , before_train.item())

Test loss before training 0.467517226934433


In [77]:
model.train()
epoch = 20
for epoch in range(epoch):    
    optimizer.zero_grad()    # Forward pass
    y_pred = model(x_train)    # Compute Loss
    loss = criterion(y_pred.squeeze(), y_train)
   
    print('Epoch {}: train loss: {}'.format(epoch, loss.item()))    # Backward pass
    loss.backward()
    optimizer.step()

Epoch 0: train loss: 0.17611254751682281
Epoch 1: train loss: 0.1333077847957611
Epoch 2: train loss: 0.0991164818406105
Epoch 3: train loss: 0.07093816995620728
Epoch 4: train loss: 0.04720110446214676
Epoch 5: train loss: 0.02680397406220436
Epoch 6: train loss: 0.008982661180198193
Epoch 7: train loss: -0.006804706063121557
Epoch 8: train loss: -0.020925888791680336
Epoch 9: train loss: -0.03372335433959961
Epoch 10: train loss: -0.0454268679022789
Epoch 11: train loss: -0.056216198951005936
Epoch 12: train loss: -0.06623609364032745
Epoch 13: train loss: -0.07560508698225021
Epoch 14: train loss: -0.0844065323472023
Epoch 15: train loss: -0.0927170142531395
Epoch 16: train loss: -0.10059820115566254
Epoch 17: train loss: -0.10810460150241852
Epoch 18: train loss: -0.11527948081493378
Epoch 19: train loss: -0.122161865234375


In [78]:
model.eval()
y_pred = model(x_test)
after_train = criterion(y_pred.squeeze(), y_test) 
print('Test loss after Training' , after_train.item())

Test loss after Training 0.38520193099975586


# Understanding the PyTorch Implementation

In [6]:
# Code in file tensor/two_layer_net_numpy.py
import numpy as np

def run_sim_numpy():
    # N is batch size; D_in is input dimension;
    # H is hidden dimension; D_out is output dimension.
    N, D_in, H, D_out = 64, 1000, 100, 10

    # Create random input and output data
    x = np.random.randn(N, D_in)
    y = np.random.randn(N, D_out)

    # Randomly initialize weights
    w1 = np.random.randn(D_in, H)
    w2 = np.random.randn(H, D_out)

    learning_rate = 1e-6
    for t in range(500):
        # Forward pass: compute predicted y
        h = x.dot(w1)
        h_relu = np.maximum(h, 0)
        y_pred = h_relu.dot(w2)
        
        # Compute and print loss
        loss = np.square(y_pred - y).sum()
        if t>498:
            print(t, loss)
        
        # Backprop to compute gradients of w1 and w2 with respect to loss
        grad_y_pred = 2.0 * (y_pred - y)
        grad_w2 = h_relu.T.dot(grad_y_pred)
        grad_h_relu = grad_y_pred.dot(w2.T)
        grad_h = grad_h_relu.copy()
        grad_h[h < 0] = 0
        grad_w1 = x.T.dot(grad_h)
        
        # Update weights
        w1 -= learning_rate * grad_w1
        w2 -= learning_rate * grad_w2

In [7]:
%timeit run_sim_numpy()

499 3.417254510017826e-07
499 6.80805478305819e-07
499 4.4091770001221044e-05
499 3.199749824397876e-05
499 4.691894222750476e-07
499 1.5314605068389221e-07
499 1.2785897363290369e-05
499 5.866297454492024e-05
412 ms ± 13 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [8]:
def run_sim_torch():
    # Code in file tensor/two_layer_net_tensor.py
    import torch

    device = torch.device('cpu')
    # device = torch.device('cuda') # Uncomment this to run on GPU

    # N is batch size; D_in is input dimension;
    # H is hidden dimension; D_out is output dimension.
    N, D_in, H, D_out = 64, 1000, 100, 10

    # Create random input and output data
    x = torch.randn(N, D_in, device=device)
    y = torch.randn(N, D_out, device=device)

    # Randomly initialize weights
    w1 = torch.randn(D_in, H, device=device)
    w2 = torch.randn(H, D_out, device=device)

    learning_rate = 1e-6
    for t in range(500):
        # Forward pass: compute predicted y
        h = x.mm(w1)
        h_relu = h.clamp(min=0)
        y_pred = h_relu.mm(w2)
        
        # Compute and print loss; loss is a scalar, and is stored in a PyTorch Tensor
        # of shape (); we can get its value as a Python number with loss.item().
        loss = (y_pred - y).pow(2).sum()
        if t>498:
            print(t, loss.item())
        
        # Backprop to compute gradients of w1 and w2 with respect to loss
        grad_y_pred = 2.0 * (y_pred - y)
        grad_w2 = h_relu.t().mm(grad_y_pred)
        grad_h_relu = grad_y_pred.mm(w2.t())
        grad_h = grad_h_relu.clone()
        grad_h[h < 0] = 0
        grad_w1 = x.t().mm(grad_h)
        
        # Update weights using gradient descent
        w1 -= learning_rate * grad_w1
        w2 -= learning_rate * grad_w2

In [9]:
%timeit run_sim_torch()

499 1.6832038454595022e-05
499 9.516641148366034e-05
499 1.5797930245753378e-05
499 4.5778524508932605e-05
499 6.939189188415185e-05
499 0.00011741339403670281
499 0.00010538773494772613
499 2.471236075507477e-05
253 ms ± 17 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [13]:
def run_sim_torchgrad():
    # Code in file autograd/two_layer_net_autograd.py
    import torch

    device = torch.device('cpu')
    # device = torch.device('cuda') # Uncomment this to run on GPU

    # N is batch size; D_in is input dimension;
    # H is hidden dimension; D_out is output dimension.
    N, D_in, H, D_out = 64, 1000, 100, 10

    # Create random Tensors to hold input and outputs
    x = torch.randn(N, D_in, device=device)
    y = torch.randn(N, D_out, device=device)

    # Create random Tensors for weights; setting requires_grad=True means that we
    # want to compute gradients for these Tensors during the backward pass.
    w1 = torch.randn(D_in, H, device=device, requires_grad=True)
    w2 = torch.randn(H, D_out, device=device, requires_grad=True)

    learning_rate = 1e-6
    for t in range(500):
        # Forward pass: compute predicted y using operations on Tensors. Since w1 and
        # w2 have requires_grad=True, operations involving these Tensors will cause
        # PyTorch to build a computational graph, allowing automatic computation of
        # gradients. Since we are no longer implementing the backward pass by hand we
        # don't need to keep references to intermediate values.
        y_pred = x.mm(w1).clamp(min=0).mm(w2)
        
        # Compute and print loss. Loss is a Tensor of shape (), and loss.item()
        # is a Python number giving its value.
        loss = (y_pred - y).pow(2).sum()
        if t>498:
            print(t, loss.item())
            
        # Use autograd to compute the backward pass. This call will compute the
        # gradient of loss with respect to all Tensors with requires_grad=True.
        # After this call w1.grad and w2.grad will be Tensors holding the gradient
        # of the loss with respect to w1 and w2 respectively.
        loss.backward()
        
        # Update weights using gradient descent. For this step we just want to mutate
        # the values of w1 and w2 in-place; we don't want to build up a computational
        # graph for the update steps, so we use the torch.no_grad() context manager
        # to prevent PyTorch from building a computational graph for the updates
        with torch.no_grad():
            w1 -= learning_rate * w1.grad
            w2 -= learning_rate * w2.grad
            
        # Manually zero the gradients after running the backward pass
        w1.grad.zero_()
        w2.grad.zero_()

In [14]:
%timeit run_sim_torchgrad()

499 4.717827687272802e-05
499 7.954805914778262e-05
499 2.859023334167432e-05
499 5.056699228589423e-05
499 8.617802814114839e-05
499 0.000970246153883636
499 5.565648098126985e-05
499 8.797551708994433e-05
977 ms ± 31.1 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [15]:
def run_sim_torchnn():
    # Code in file nn/two_layer_net_nn.py
    import torch

    device = torch.device('cpu')
    # device = torch.device('cuda') # Uncomment this to run on GPU

    # N is batch size; D_in is input dimension;
    # H is hidden dimension; D_out is output dimension.
    N, D_in, H, D_out = 64, 1000, 100, 10

    # Create random Tensors to hold inputs and outputs
    x = torch.randn(N, D_in, device=device)
    y = torch.randn(N, D_out, device=device)

    # Use the nn package to define our model as a sequence of layers. nn.Sequential
    # is a Module which contains other Modules, and applies them in sequence to
    # produce its output. Each Linear Module computes output from input using a
    # linear function, and holds internal Tensors for its weight and bias.
    # After constructing the model we use the .to() method to move it to the
    # desired device.
    model = torch.nn.Sequential(
              torch.nn.Linear(D_in, H),
              torch.nn.ReLU(),
              torch.nn.Linear(H, D_out),
            ).to(device)

    # The nn package also contains definitions of popular loss functions; in this
    # case we will use Mean Squared Error (MSE) as our loss function. Setting
    # reduction='sum' means that we are computing the *sum* of squared errors rather
    # than the mean; this is for consistency with the examples above where we
    # manually compute the loss, but in practice it is more common to use mean
    # squared error as a loss by setting reduction='elementwise_mean'.
    loss_fn = torch.nn.MSELoss(reduction='sum')

    learning_rate = 1e-4
    for t in range(500):
        # Forward pass: compute predicted y by passing x to the model. Module objects
        # override the __call__ operator so you can call them like functions. When
        # doing so you pass a Tensor of input data to the Module and it produces
        # a Tensor of output data.
        y_pred = model(x)
        
        # Compute and print loss. We pass Tensors containing the predicted and true
        # values of y, and the loss function returns a Tensor containing the loss.
        loss = loss_fn(y_pred, y)
        if t>498:
            print(t, loss.item())
        
        # Zero the gradients before running the backward pass.
        model.zero_grad()
        
        # Backward pass: compute gradient of the loss with respect to all the learnable
        # parameters of the model. Internally, the parameters of each Module are stored
        # in Tensors with requires_grad=True, so this call will compute gradients for
        # all learnable parameters in the model.
        loss.backward()
        
        # Update the weights using gradient descent. Each parameter is a Tensor, so
        # we can access its data and gradients like we did before.
        with torch.no_grad():
            for param in model.parameters():
                param.data -= learning_rate * param.grad

In [16]:
%timeit run_sim_torchnn()

499 1.4216263934940798e-06
499 1.6250106682491605e-06
499 1.296600567002315e-05
499 7.413837010972202e-06
499 7.071726940921508e-06
499 1.4960786529627512e-06
499 3.874725280184066e-06
499 6.084675987949595e-06
1.14 s ± 44.2 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [20]:
def run_sim_torchoptim():
    # Code in file nn/two_layer_net_optim.py
    import torch

    # N is batch size; D_in is input dimension;
    # H is hidden dimension; D_out is output dimension.
    N, D_in, H, D_out = 64, 1000, 100, 10

    # Create random Tensors to hold inputs and outputs.
    x = torch.randn(N, D_in)
    y = torch.randn(N, D_out)

    # Use the nn package to define our model and loss function.
    model = torch.nn.Sequential(
              torch.nn.Linear(D_in, H),
              torch.nn.ReLU(),
              torch.nn.Linear(H, D_out),
            )
    loss_fn = torch.nn.MSELoss(reduction='sum')

    # Use the optim package to define an Optimizer that will update the weights of
    # the model for us. Here we will use Adam; the optim package contains many other
    # optimization algorithms. The first argument to the Adam constructor tells the
    # optimizer which Tensors it should update.
    learning_rate = 1e-4
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
    for t in range(500):
        # Forward pass: compute predicted y by passing x to the model.
        y_pred = model(x)
        
        # Compute and print loss.
        loss = loss_fn(y_pred, y)
        if t>498:
            print(t, loss.item())
            
        # Before the backward pass, use the optimizer object to zero all of the
        # gradients for the Tensors it will update (which are the learnable weights
        # of the model)
        optimizer.zero_grad()
        
        # Backward pass: compute gradient of the loss with respect to model parameters
        loss.backward()
        
        # Calling the step function on an Optimizer makes an update to its parameters
        optimizer.step()

In [21]:
%timeit run_sim_torchoptim()

499 3.5519506127457134e-06
499 2.89068813330573e-09
499 6.102735738977572e-08
499 5.387914003840422e-10
499 2.8562343601379325e-09
499 1.3440622836924376e-08
499 1.3865713022198634e-09
499 6.465485036244445e-09
1.7 s ± 96 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [22]:
def run_sim_torchModule():
    # Code in file nn/two_layer_net_module.py
    import torch

    class TwoLayerNet(torch.nn.Module):
        def __init__(self, D_in, H, D_out):
            """
            In the constructor we instantiate two nn.Linear modules and assign them as
            member variables.
            """
            super(TwoLayerNet, self).__init__()
            self.linear1 = torch.nn.Linear(D_in, H)
            self.linear2 = torch.nn.Linear(H, D_out)
        
        def forward(self, x):
            """
            In the forward function we accept a Tensor of input data and we must return
            a Tensor of output data. We can use Modules defined in the constructor as
            well as arbitrary (differentiable) operations on Tensors.
            """
            h_relu = self.linear1(x).clamp(min=0)
            y_pred = self.linear2(h_relu)
            return y_pred

    # N is batch size; D_in is input dimension;
    # H is hidden dimension; D_out is output dimension.
    N, D_in, H, D_out = 64, 1000, 100, 10

    # Create random Tensors to hold inputs and outputs
    x = torch.randn(N, D_in)
    y = torch.randn(N, D_out)

    # Construct our model by instantiating the class defined above.
    model = TwoLayerNet(D_in, H, D_out)

    # Construct our loss function and an Optimizer. The call to model.parameters()
    # in the SGD constructor will contain the learnable parameters of the two
    # nn.Linear modules which are members of the model.
    loss_fn = torch.nn.MSELoss(reduction='sum')
    optimizer = torch.optim.SGD(model.parameters(), lr=1e-4)
    for t in range(500):
        # Forward pass: Compute predicted y by passing x to the model
        y_pred = model(x)
        
        # Compute and print loss
        loss = loss_fn(y_pred, y)
        if t>498:
            print(t, loss.item())
        
        # Zero gradients, perform a backward pass, and update the weights.
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

In [23]:
%timeit run_sim_torchModule

16.5 ns ± 0.32 ns per loop (mean ± std. dev. of 7 runs, 100000000 loops each)


## PyTorch API and Implementation

### Components

The following are component of the API, and explained in-detail in the [docs](https://pytorch.org/docs/stable/nn.html#)

* Linear layers - `nn.Linear`
* Dropout layers - `nn.Dropout`
* BatchNorm - `nn.BatchNorm1d`
* Convolution layers - `nn.Conv1d`
* Pooling layers - `n.MaxPool1d`
* Padding layers - `ReflectionPad1d`


In [93]:
m = nn.Linear(in_features=20, out_features=30, bias=True)
input = torch.randn(10, 20)
output = m(input)
print(output.size())

torch.Size([10, 30])


In [91]:
input = torch.randn(3, 2, 4)     #(samples, rows, columns/features)
print(input.dim()); print(input.shape)
print(input)
m = nn.Conv1d(in_channels=2, out_channels=1, kernel_size=1, stride=2)
output = m(input)
print(output.dim()); print(output.shape)
print(output)

3
torch.Size([3, 2, 4])
tensor([[[-0.4115, -1.7695,  1.0313, -0.4850],
         [ 0.4450,  1.0767,  0.7671,  1.1860]],

        [[-0.6997, -1.2741,  0.5958,  0.6612],
         [ 1.6600, -0.3820, -0.6644,  1.3638]],

        [[ 1.8785, -0.5167,  1.2430,  1.5727],
         [ 0.9379, -1.5978,  1.2981,  0.9275]]])
3
torch.Size([3, 1, 2])
tensor([[[-0.2894, -0.1731]],

        [[-1.0687,  0.5802]],

        [[-0.0940, -0.4418]]], grad_fn=<SqueezeBackward1>)


In [106]:
m = nn.Dropout(p=0.2)    #p-probability of element being zero
input = torch.randn(1, 5)
print(input)
m(input)

tensor([[-0.3767,  0.1609,  0.0140,  1.2529,  1.0479]])


tensor([[-0.4709,  0.2012,  0.0176,  1.5661,  1.3098]])

In [110]:
input = torch.randn(3, 1, 5)
m = nn.BatchNorm1d(num_features=1)
m(input)

tensor([[[ 1.2611, -0.6936,  0.9268,  0.5002, -0.5748]],

        [[ 1.5226,  0.5925, -0.2946, -0.7670,  1.4274]],

        [[-0.6122,  0.1138, -0.0378, -1.8526, -1.5118]]],
       grad_fn=<NativeBatchNormBackward>)

In [112]:
m = nn.MaxPool1d(3, stride=2)    #pool of size=3, stride=2
input = torch.randn(3, 1, 5)
m(input)

tensor([[[ 0.5099,  0.5099]],

        [[ 1.1373,  0.1945]],

        [[ 0.9555, -0.3029]]])

In [113]:
m = nn.ReflectionPad1d(2)
input = torch.arange(8, dtype=torch.float).reshape(1, 2, 4)
print(input)
print( m(input) )

tensor([[[0., 1., 2., 3.],
         [4., 5., 6., 7.]]])
tensor([[[2., 1., 0., 1., 2., 3., 2., 1.],
         [6., 5., 4., 5., 6., 7., 6., 5.]]])


### Application

In [None]:
import torch
import torch.nn.functional as F

In [None]:
# replace following class code with an easy sequential network
class Net(torch.nn.Module):
    def __init__(self, n_feature, n_hidden, n_output):
        super(Net, self).__init__()
        self.hidden = torch.nn.Linear(n_feature, n_hidden)   # hidden layer
        self.predict = torch.nn.Linear(n_hidden, n_output)   # output layer

    def forward(self, x):
        x = F.relu(self.hidden(x))      # activation function for hidden layer
        x = self.predict(x)             # linear output
        return x

In [None]:
net1 = Net(1, 10, 1)

In [None]:
# easy and fast way to build your network
net2 = torch.nn.Sequential(
    torch.nn.Linear(1, 10),
    torch.nn.ReLU(),
    torch.nn.Linear(10, 1)
)

In [None]:
print(net1)     # net1 architecture
print(net2)     # net2 architecture

Linear layers expect the input to be represented in a 1-dimensional form. Thus we include a call to the “view” function, which converts the input from the 2- dimensional input into 1-dimension.  Note, PyTorch can still train in mini-batch mode. The view function converts the input tensor into the dimensions `[n,1,1,3072]`, where n is the mini-batch size.

After passing data through our network we will have an output tensor of size `[n,1,1,10]`.

### Use case: basic feedforward

In [None]:
samples, sample_rate = librosa.load(file_path)
  for ts in [0.75 ,1 ,1.25]: 
        for ps in [ −1 ,0 ,+1]:
            samples new = librosa.effects.time_stretch(samples, rate=ts)
            y_new = librosa.effects.pitch_shift(samples_new, sample_rate, n_steps=ps)

max_length = 1.5 # Max length in seconds
samples, sample rate = librosa.load(file_path)
short_samples = librosa.util.fix_length(samples, sample_rate * max_length )
melSpectrum = librosa.feature.melspectrogram(short_samples.astype(np.float16), sr=sample_rate , n_mels=128)
logMelSpectrogram = librosa.power_to_db(melSpectrum, ref=np.max )

In [None]:
import torch.nn as nn
# PyTorch Network Definition
class Model(nn.Module): 
    def init (self):
        def super(Model, self).init() 
        self.fc1 = nn.Linear(3072, 128) 
        self.fc2 = nn.Linear(128, 128) 
        self.fc3 = nn.Linear(128, 10)
    def forward(self, x):
        x = x.view((−1, 3072)) # Converts 2D data to 1D 
        h = self.fc1(x)
        h=torch.relu(h)
        h = self.fc2(h) 
        h=torch.relu(h)
        h = self.fc3(h)
        out = torch.log softmax(h,dim=1) 
        return out

### Use case: regularization

With deep learning we have multiple layers of computation with hidden values that are passed to subsequent layers. The output of each of these layers is likely to be a non-normalized input, and the distribution is likely to change frequently during the training process. This process is commonly referred to as “internal covariate shift.” Batch normalization [IS15] aims to reduce internal covariate shift in a network by normalizing the outputs of intermediate layers during training. This speeds the training process and allows for higher learning rates without risking divergence.

In [None]:
#the same using regularization techniques
class Model(nn.Module): 
    def init (self):
        super(Model, self).init() 
        self.fc1 = nn.Linear(3072, 128) 
        self.bc1 = nn.BatchNorm1d(128)   #Applies Batch Normalization over a 2D or 3D input (a mini-batch of 1D inputs with optional additional channel dimension) 
        self.fc2 = nn.Linear(128, 128) 
        self.bc2 = nn.BatchNorm1d(128)
        self.fc3 = nn.Linear(128, 10)
    def forward(self, x):
        x = x.view((−1, 3072))
        h = self.fc1(x)
        h=self.bc1(h)
        h=torch.relu(h)
        h = F.dropout(h, p=0.5, training=self.training) #Disabled during evaluation
        h = self.fc2(h)
        h = self.bc2(h)
        h = torch.relu(h)
        h = F.dropout(h, p=0.2, training=self.training) #Disabled during evaluation
        h = self.fc3(h)
        out = torch.log_softmax(h,dim=1) 
        return out

In [None]:
model = Model( )
model.train()

optimizer = optim.Adam(model.parameters() , lr=0.01) 
n_epoch = 40
for epoch in range(n epoch):
    for data , target in train loader : 
        # Get Samples
        if use cuda:
            data , target = data . cuda () , target . cuda ()
            # Clear gradients
            optimizer . zero grad () # Forward Propagation
            y pred = model(data) # Error Computation
            loss = torch.cross entropy(y pred , target) # Backpropagation
            loss.backward ()
            # Parameter Update
            optimizer.step()     

### Use case: autoencoder

This autoencoder learns a low-dimensional encoding of the input data that the decoder is able to produce examples.  The output of our network must be the same size as our input, d = 3072, thus the final layer of our network must ensure that the dimensionality matches the input.

In [None]:
import torch.nn as nn
import torch .nn. functional as F # In place operations for non−linearities

# PyTorch Network Definition
class autoencoder(nn.Module):
    def init (self):
        super(autoencoder, self).__init__()
        self.e_fc1 = nn.Linear(3072, 512)
        self.e_fc2 = nn.Linear(512, 128)
        self.e_fc3 = nn.Linear(128, 64)
        self.e_fc4 = nn.Linear(64,64)
        self.d_fc1 = nn.Linear(64, 64)
        self.d_fc2 = nn.Linear(64, 128)
        self.d_fc3 = nn.Linear(128, 512)
        self.d_fc4 = nn.Linear(512, 3072)
    def forward(self, x):
        # Encoder
        h = F.relu(self.e_fc1(x))
        h = F.relu(self.e_fc2(h))
        h = F.relu(self.e_fc3(h))
        h = self.e_fc4(h)
        # Decoder
        h = F.relu(self.d_fc1(h))
        h = F.relu(self.d_fc2(h)) 
        h = F.relu(self.d_fc3(h)) 
        h = self.d_fc4(h)
        out = F.tanh(h) 
        return out

When examining the reconstructed inputs, we notice that they appear to be less sharp than the examples shown in Fig. 4.28. This is mainly due to the MSE loss function. Because it is computing the squared error, it tends to pull all values toward the mean prioritizing the average over specific areas of the input.

In [None]:
import torch.optim as optim 
import torch.nn.functional as F
# Neural Network Training in PyTorch
model = autoencoder() 
optimizer = optim.Adam(model.parameters() , lr=learning rate , weight_decay=1e−5)    #decay for regularization
for epoch in range(n epoch):
    for data, _ in train loader :
        # Get samples
        input = data.view(−1,3072)    # We will reuse the formatted input as our target 
        # Forward Propagation
        output = model(input)     # Error Computation
        loss = F.mse_loss(output, input)     # Clear gradients
        optimizer.zero_grad()     # Backpropagation
        loss.backward()
        # Parameter Update
        optimizer.step()

## Advanced Deep Learning Models

### Convolutional Neural Network

* [ref: stride](https://www.quora.com/What-does-stride-mean-in-the-context-of-convolutional-neural-networks)
* [ref: pooling](https://www.quora.com/What-is-max-pooling-in-convolutional-neural-networks)
* [ref: convolutional](https://adventuresinmachinelearning.com/convolutional-neural-networks-tutorial-in-pytorch/)
* [ref: tutorials point](https://www.tutorialspoint.com/pytorch/pytorch_convolutional_neural_network.htm)

### Long-Short Term Memory Model

* [ref: math](https://medium.com/@aidangomez/let-s-do-this-f9b699de31d9)

#### Comparison of LSTM Numpy (by-hand) and PyTorch

Create by-hand using numpy, [ref](https://towardsdatascience.com/the-lstm-reference-card-6163ca98ae87)

In [79]:
#by-hand
import numpy as np 
from scipy.special import expit as sigmoid

def forget_gate(x, h, Weights_hf, Bias_hf, Weights_xf, Bias_xf, prev_cell_state):
    forget_hidden  = np.dot(Weights_hf, h) + Bias_hf
    forget_eventx  = np.dot(Weights_xf, x) + Bias_xf
    return np.multiply( sigmoid(forget_hidden + forget_eventx), prev_cell_state )

def input_gate(x, h, Weights_hi, Bias_hi, Weights_xi, Bias_xi, Weights_hl, Bias_hl, Weights_xl, Bias_xl):
    ignore_hidden  = np.dot(Weights_hi, h) + Bias_hi
    ignore_eventx  = np.dot(Weights_xi, x) + Bias_xi
    learn_hidden   = np.dot(Weights_hl, h) + Bias_hl
    learn_eventx   = np.dot(Weights_xl, x) + Bias_xl
    return np.multiply( sigmoid(ignore_eventx + ignore_hidden), np.tanh(learn_eventx + learn_hidden) )


def cell_state(forget_gate_output, input_gate_output):
    return forget_gate_output + input_gate_output

  
def output_gate(x, h, Weights_ho, Bias_ho, Weights_xo, Bias_xo, cell_state):
    out_hidden = np.dot(Weights_ho, h) + Bias_ho
    out_eventx = np.dot(Weights_xo, x) + Bias_xo
    return np.multiply( sigmoid(out_eventx + out_hidden), np.tanh(cell_state) )

In [80]:
#seed final, fully-connected linear layer
#set Parameters for a small LSTM network
input_size  = 2 # size of one 'event', or sample, in our batch of data
hidden_dim  = 3 # 3 cells in the LSTM layer
output_size = 1 # desired model output

def model_output(lstm_output, fc_Weight, fc_Bias):
  '''Takes the LSTM output and transforms it to our desired 
  output size using a final, fully connected layer'''
  return np.dot(fc_Weight, lstm_output) + fc_Bias

Create a PyTorch LSTM with the same parameters. PyTorch will automatically assign the weights with random values — we’ll extract those and use them to initialize our NumPy network as well.

In [82]:
import torch
from torch import nn
 
#Initialize an PyTorch LSTM for comparison to our Numpy LSTM
class LSTM(nn.Module):
    def __init__(self, input_size, output_size, hidden_dim, n_layers=1):
        super(LSTM, self).__init__()
        self.hidden_dim=hidden_dim
        #LSTM Layer
        self.lstm = nn.LSTM(input_size, hidden_dim, n_layers, batch_first=True)
        #Final, fully-connected layer
        self.fc = nn.Linear(hidden_dim, output_size)
        
    def forward(self, x, hidden):
        batch_size = 1
        # get LSTM outputs
        lstm_output, (h,c) = self.lstm(x, hidden)
        # shape output to be (batch_size*seq_length, hidden_dim)
        lstm_output = lstm_output.view(-1, self.hidden_dim)  
        # get final output 
        model_output = self.fc(lstm_output)
        return model_output, (h,c)
    
torch.manual_seed(5)
torch_lstm = LSTM(input_size = input_size, hidden_dim = hidden_dim, output_size = output_size,)

In [83]:
#extract and use weights with by-hand version
state = torch_lstm.state_dict()
print(state)

OrderedDict([('lstm.weight_ih_l0', tensor([[ 0.3813, -0.4317],
        [ 0.4705,  0.3694],
        [ 0.4851, -0.4427],
        [-0.3875,  0.2747],
        [-0.5389,  0.5706],
        [ 0.1229,  0.0746],
        [-0.4937,  0.1840],
        [ 0.2483,  0.0916],
        [ 0.5553,  0.1734],
        [-0.5120,  0.4851],
        [ 0.1960, -0.2754],
        [-0.5303,  0.3291]])), ('lstm.weight_hh_l0', tensor([[ 0.5487, -0.4730,  0.0316],
        [ 0.2071, -0.2726, -0.1263],
        [-0.3855, -0.2730, -0.5264],
        [-0.0134,  0.3423,  0.2808],
        [ 0.5424, -0.5071, -0.0710],
        [ 0.5621,  0.0945, -0.1628],
        [-0.5200,  0.2687,  0.4383],
        [ 0.4630,  0.4833,  0.1130],
        [ 0.4115, -0.1453,  0.4689],
        [-0.0494, -0.1191, -0.2870],
        [ 0.3074,  0.2336,  0.3672],
        [-0.3690, -0.3070,  0.5464]])), ('lstm.bias_ih_l0', tensor([-0.3205, -0.3293, -0.1545, -0.1866, -0.3926,  0.4666,  0.0644,  0.2632,
         0.4282, -0.3741,  0.4407, -0.2892])), ('lstm.bia

The PyTorch documentation explains all we need to break this down:
* The weights for each gate in are in this order: ignore, forget, learn, output
* keys with ‘ih’ in the name are the weights/biases for the input, or Wx_ and Bx_
* keys with ‘hh’ in the name are the weights/biases for the hidden state, or Wh_ and Bh_

Given the parameters we chose, we can therefore extract the weights for the NumPy LSTM to use in this way:

In [84]:
#Event (x) Weights and Biases for all gates
Weights_xi = state['lstm.weight_ih_l0'][0:3].numpy() # shape [h, x]
Weights_xf = state['lstm.weight_ih_l0'][3:6].numpy() # shape [h, x]
Weights_xl = state['lstm.weight_ih_l0'][6:9].numpy() # shape [h, x]
Weights_xo = state['lstm.weight_ih_l0'][9:12].numpy() # shape [h, x]
 
Bias_xi = state['lstm.bias_ih_l0'][0:3].numpy() #shape is [h, 1]
Bias_xf = state['lstm.bias_ih_l0'][3:6].numpy() #shape is [h, 1]
Bias_xl = state['lstm.bias_ih_l0'][6:9].numpy() #shape is [h, 1]
Bias_xo = state['lstm.bias_ih_l0'][9:12].numpy() #shape is [h, 1]
 
#Hidden state (h) Weights and Biases for all gates
Weights_hi = state['lstm.weight_hh_l0'][0:3].numpy() #shape is [h, h]
Weights_hf = state['lstm.weight_hh_l0'][3:6].numpy() #shape is [h, h]
Weights_hl = state['lstm.weight_hh_l0'][6:9].numpy() #shape is [h, h]
Weights_ho = state['lstm.weight_hh_l0'][9:12].numpy() #shape is [h, h]
 
Bias_hi = state['lstm.bias_hh_l0'][0:3].numpy() #shape is [h, 1]
Bias_hf = state['lstm.bias_hh_l0'][3:6].numpy() #shape is [h, 1]
Bias_hl = state['lstm.bias_hh_l0'][6:9].numpy() #shape is [h, 1]
Bias_ho = state['lstm.bias_hh_l0'][9:12].numpy() #shape is [h, 1]
 
#--------------------------------------------------------------------
# Final, fully connected layer Weights and Bias
fc_Weight = state['fc.weight'][0].numpy() #shape is [h, output_size]
fc_Bias = state['fc.bias'][0].numpy() #shape is [,output_size]

Now, we have two networks — one in PyTorch, one in NumPy — with access to the same starting weights. We’ll put some time series data through each to ensure they are identical. To do a forward pass with our network, we’ll pass the data into the LSTM gates in sequence, and print the output after each event:

In [86]:
#Simple Time Series Data
data = np.array(
[[1,1],
[2,2],
[3,3]])
 
#Initialize cell and hidden states with zeroes
h = np.zeros(hidden_dim)
c = np.zeros(hidden_dim)
 
#Loop through data, updating the hidden and cell states after each pass
for eventx in data:
    f = forget_gate(eventx, h, Weights_hf, Bias_hf, Weights_xf, Bias_xf, c)
    i = input_gate(eventx, h, Weights_hi, Bias_hi, Weights_xi, Bias_xi, Weights_hl, Bias_hl, Weights_xl, Bias_xl)
    c = cell_state(f,i)
    h = output_gate(eventx, h, Weights_ho, Bias_ho, Weights_xo, Bias_xo, c)
    print(model_output(h, fc_Weight, fc_Bias))

-0.3479427319173535
-0.47396493597023465
-0.5263107365187176


In [87]:
#rPyTorch expects an extra dimension for batch size:
torch_batch = torch.Tensor(data).unsqueeze(0)
 
torch_output, (torch_hidden, torch_cell) = torch_lstm(torch_batch, None)
print(torch_output)

tensor([[-0.3479],
        [-0.4740],
        [-0.5263]], grad_fn=<AddmmBackward>)


We can additionally verify that after the data has gone through the LSTM cells, the two models have the same hidden and cell states:

In [88]:
print('\n','-'*40)
print(f'Torch Hidden State: {torch_hidden}')
print(f'Torch Cell State: {torch_cell}\n')
print(f'np Hidden State: {h}')
print(f'np Cell State: {c}')


 ----------------------------------------
Torch Hidden State: tensor([[[-0.1190,  0.4759,  0.3252]]], grad_fn=<StackBackward>)
Torch Cell State: tensor([[[-0.3556,  1.1789,  1.3026]]], grad_fn=<StackBackward>)

np Hidden State: [-0.11898849  0.47585365  0.32522364]
np Cell State: [-0.3555854   1.17887101  1.3025983 ]


### Transformer Model

### Q-Learning / Reinforcement Learning

## Popular Architectures

### ELMo

### Universal Sentence Encoder (USE)

### BERT

### XLNet

## Specialized Networks

### Autoencoders

### Self-Organizing Models

### Conditional Random Field