# In-class exercise 7: Deep Learning 1 (Part B)
In this notebook we will gain some hands-on experience with backpropagation

In [295]:
import numpy as np
import matplotlib.pyplot as plt
import time

%matplotlib inline

from sklearn.datasets import load_digits
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import label_binarize

In [296]:
X, y = load_digits(return_X_y=True)
# Convert labels into one-hot format
Y = label_binarize(y, classes=np.unique(y))
K = Y.shape[1]  # number of classes
D = X.shape[1]  # number of features

np.random.seed(123)
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.3)

Check shapes

In [297]:
print(X_train.shape, Y_train.shape, X_test.shape, Y_test.shape)
print(X_train[0])

(1257, 64) (1257, 10) (540, 64) (540, 10)
[ 0.  1. 10. 16. 16.  8.  0.  0.  0. 10. 16. 13. 16. 12.  0.  0.  0.  1.
  3.  3. 16.  9.  0.  0.  0.  0.  0. 13. 14.  1.  0.  0.  0.  0.  2. 16.
 16. 12.  3.  0.  0.  0.  0.  5. 11. 16. 11.  0.  0.  0.  2.  7. 14. 16.
  6.  0.  0.  0. 11. 16. 13.  5.  0.  0.]


# 1. Simple backpropagation example

Addition of two vectors

In [298]:
## the d_out means the gradient of the loss function with respect to the output of the layer

class Add:
    def forward(self, x, y):
        # TODO
        z = x + y
        return z
    def backward(self, d_out):
        # TODO
        dx = d_out
        dy = d_out
        return dx, dy

Element-wise multiplication of two vectors

In [299]:
class Multiply:
    def forward(self, x, y):
        # TODO
        self.x = x
        self.y = y
        z = x * y
        return z
      
    def backward(self, d_out):
        # TODO
        dx = d_out * self.y
        dy = d_out * self.x
        return dx, dy

Sum of a vector

In [300]:
class Sum:
    def forward(self, x):
        # TODO
        z = np.sum(x)    
        return z
    def backward(self, d_out):
        # TODO
        return d_out

### Dot product of two vectors as composition of multiplication and summation

Dot product:
$$\mathbf{x}\cdot\mathbf{y} = \sum_{i=0}^{n}x_i y_i$$

In [301]:
x = np.arange(1, 5, dtype=np.float32)
y = np.arange(-1, 3, dtype=np.float32)

In [302]:
print(x)
print(y)

[1. 2. 3. 4.]
[-1.  0.  1.  2.]


In [303]:
mult = Multiply()
vec_sum = Sum()

w = mult.forward(x, y)
z = vec_sum.forward(w)

d_w = vec_sum.backward(1.0)
d_x, d_y = mult.backward(d_w)

In [304]:
z, d_x, d_y

(10.0,
 array([-1.,  0.,  1.,  2.], dtype=float32),
 array([1., 2., 3., 4.], dtype=float32))

### Dot product of two vectors as one operation

Dot product of two vectors

In [305]:
class DotProduct:
    def forward(self, x, y):
        # TODO
        self.x = x
        self.y = y
        z = np.dot(x, y)
        return z
    def backward(self, d_out):
        # TODO
        dx = d_out * self.y
        dy = d_out * self.x
        return dx, dy

In [306]:
x = np.arange(1, 5, dtype=np.float32)
y = np.arange(-1, 3, dtype=np.float32)

In [307]:
dp = DotProduct()
z = dp.forward(x, y)
d_x, d_y = dp.backward(1.0)

In [308]:
z, d_x, d_y

(10.0,
 array([-1.,  0.,  1.,  2.], dtype=float32),
 array([1., 2., 3., 4.], dtype=float32))

**Lessons:**
1. By implementing `forward` and `backward` method we can compute gradients of an arbitrary composition of functions
2. Use `cache` to store values that will be needed in the backward 
3. Multiple operations can be combined into a single module (i.e. function)
4.  `1.0` as `d_out` for the terminal node in our computational graph

# 2. Multi-class logistic regression (without backprop)

Multi-class logistic regression model

Data:
* Data matrix $\mathbf{X} \in \mathbb{R}^{N \times D}$.
* Target labels in one-hot format $\mathbf{Y} \in \mathbb{R}^{N \times K}$.
$Y_{nk} = 1$ if sample $n$ belongs to class $k$, $Y_{nk} = 0$ otherwise.

Model parameters:
* Weight matrix $\mathbf{W} \in \mathbb{R}^{D \times K}$.
* Bias vector $\mathbf{b} \in \mathbb{R}^{K}$.

Making predictions with the model:
* Logits 
$$\mathbf{a}_n = \mathbf{W x}_n + \mathbf{b}$$
* Denote the matrix of logits as 
$$\mathbf{A} = \mathbf{XW} +  \mathbf{1}_N \mathbf{b}^T \in \mathbb{R}^{N \times K}$$
* Convert logits to probabilities using softmax function
$$p(Y_{nk} = 1 \mid \mathbf{x}_n, \mathbf{W}, \mathbf{b}) = \frac{\exp(A_{nk})}{\sum_{c = 1}^{K} \exp(A_{nc})}$$

Negative log-likelihood


\begin{align}
-\log p(\mathbf{Y} \mid \mathbf{X}, \mathbf{W}, \mathbf{b}) &= - \frac{1}{N}\sum_{n=1}^{N} \sum_{k=1}^{K} Y_{nk} \log p(Y_{nk} = 1 \mid \mathbf{x}_n, \mathbf{W}, \mathbf{b})\\
&= \frac{1}{N} \sum_{n=1}^{N} \sum_{k=1}^{K} Y_{nk} \left(-A_{nk} + \log \left( \sum_{c=1}^{C} \exp(A_{nc}) \right) \right)\\
%&= \frac{1}{N} \sum_{n=1}^{N} \left(\sum_{k=1}^{K} -Y_{nk} A_{nk} \right) + \log \left( \sum_{c=1}^{C} \exp(A_{nc}) \right)
\end{align}


In [309]:
from scipy.special import softmax

In [310]:
def predict(X, W, b):
    """Generate predictions for a multi-class logistic regression model.

    Args:
        X: data matrix, shape (N, D)
        W: weight matrix, shape (D, K)
        b: bias vector, shape (K)

    Returns:
        Y_pred: Predicted class probabilities, shape (N, K).
            Y_pred[n, k] = probability that sample n belongs to class k.
    """
    # TODO
    logist = np.dot(X, W) + b
    Y_pred = softmax(logist, axis=1)
   
    return Y_pred
    

Negative log-likelihood of multiclass logistic regression 

In [311]:
def nll_loss(X, W, b, Y):
    """Compute negative log-likelihood of a logistic regression model.

    Also known as categorical cross entropy loss.

    Args:
        X: data matrix, shape (N, D)
        W: weight matrix, shape (D, K)
        b: bias vector, shape (K)
        Y: true labels in one-hot format, shape (N, K)

    Returns:
        loss: loss of the logistic regression model, shape ()
    """
    e = 1e-9 # for numerical stability
    loss = -np.sum(Y * np.log(predict(X, W, b) + e)) / X.shape[0]
    
    return loss

In [312]:
def nll_grad(X, W, b, Y):
    """Compute gradient of the NLL loss w.r.t. W and b.

    Args:
        X: data matrix, shape (N, D)
        W: weight matrix, shape (D, K)
        b: bias vector, shape (K)
        Y: true labels in one-hot format, shape (N, K)

    Returns:
        d_W: gradient of the los w.r.t. W, shape (D, K)
        d_b: gradient of the los w.r.t. b, shape (K)
    """
    # TODO
    
    p = predict(X, W, b)
    
    d_W = np.dot(X.T, p - Y) / x.shape[0]
    d_b = np.sum(p - Y, axis=0) / x.shape[0]
    
    return d_W, d_b
    
    

In [313]:
# Initialize learnable model parameters
W = np.zeros([D, K])
b = np.zeros([K])

In [314]:
# Specify optimization parameters
learning_rate = 1e-2
max_epochs = 250
report_frequency = 25

In [315]:
for epoch in range(max_epochs):
    # Compute train loss
    # TODO
    train_loss = nll_loss(X_train, W, b, Y_train)
    
    # Print train loss every `report_frequency` epochs
    # TODO
    
    if epoch % report_frequency == 0:
        print(f'Epoch {epoch}, loss: {train_loss}')

    # Perform the update
    # TODO
    d_W, d_b = nll_grad(X_train, W, b, Y_train)
    W -= learning_rate * d_W
    b -= learning_rate * d_b
    

Epoch 0, loss: 2.302585082994045
Epoch 25, loss: 1.4892111715536795
Epoch 50, loss: 0.6537324967243424
Epoch 75, loss: 0.34199151086644314
Epoch 100, loss: 0.20548972104264995


Epoch 125, loss: 0.16044226924290544
Epoch 150, loss: 0.1108155967836138
Epoch 175, loss: 0.06649041011266495
Epoch 200, loss: 0.03642813028326075
Epoch 225, loss: 1.7924880261156382e-05


In [316]:
# Compute test loss
# TODO
test_loss = nll_loss(X_test, W, b, Y_test)

# Compute test accuracy
# TODO
test_accuracy = accuracy_score(Y_test.argmax(axis=1), predict(X_test, W, b).argmax(axis=1))

# Print test loss and accuracy
# TODO
print(f'Test loss: {test_loss}')
print(f'Test accuracy: {test_accuracy}')

Test loss: 0.9024807872890447
Test accuracy: 0.9537037037037037


# 3. Multi-class logistic regression (with backprop)

In [318]:
import numpy as np

In [319]:
class Linear:
    def __init__(self, input_dim, output_dim):
        self.W = np.random.randn(input_dim, output_dim)
        self.b = np.random.randn(output_dim)
        
    def forward(self, x):
        self.x = x
        z = np.dot(x, self.W) + self.b
        return z
    
    def backward(self, d_out):
        d_x = np.dot(d_out, self.W.T)
        d_W = np.dot(self.x.T, d_out)
        d_b = np.sum(d_out, axis=0)
        return d_x, d_W, d_b
    

In [320]:
class CategoricalCrossEntropy:
    def forward(self, x, y):
        self.x = x
        self.y = y
        e = 1e-9
        loss = -np.sum(y * np.log(x + e)) / x.shape[0]
        return loss
    
    def backward(self):
        d_x = -self.y / (self.x + 1e-9) / self.x.shape[0]
        return d_x

In [321]:
class LogisticRegression:
    """Logistic regression model.

    Gradients are computed with backpropagation.
    """

    def __init__(self, num_features, num_classes, learning_rate=1e-2):
        # Initialize hyperparameters
        # TODO
        self.learning_rate = learning_rate
        self.num_features = num_features
        self.num_classes = num_classes

        # Initialize the model parameters
        # TODO
        self.W = np.zeros([num_features, num_classes])
        self.b = np.zeros([num_classes])
        self.losses = []
            
        # Define layers
        # TODO
        self.dot_product = DotProduct()
        self.linear = Linear(num_features, num_classes)
        
        # Define loss
        # TODO
        self.loss = CategoricalCrossEntropy()

    def predict(self, X):
        """Generate predictions for one minibatch.

        Args:
            X: data matrix, shape (N, D)

        Returns:
            Y_pred: predicted class probabilities, shape (N, D)
            Y_pred[n, k] = probability that sample n belongs to class k
        """
        # TODO        
        logist = self.linear.forward(X)
        Y_pred = softmax(logist, axis=1)
        return Y_pred
        
    def step(self, X, Y):
        """Perform one step of gradient descent on the minibatch of data."""
        # Forward  - compute the loss on training data
        # TODO
        loss = self.loss.forward(self.predict(X), Y)
        self.losses.append(loss)
        
        # Backward  - compute the gradients of loss w.r.t. all the model parameters
        # TODO
        d_x, d_W, d_b = self.linear.backward(d_x)
        
        # Apply the gradients
        # TODO
        
        self.W -= self.learning_rate * d_W
        self.b -= self.learning_rate * d_b
    

In [322]:
# Specify optimization parameters
learning_rate = 1e-2
max_epochs = 301
report_frequency = 25

In [323]:
log_reg = LogisticRegression(num_features=D, num_classes=K, learning_rate=learning_rate)

In [325]:
for epoch in range(max_epochs):
    # Perform one step of gradient descent
    # TODO
    log_reg.step(X_train, Y_train)

    # Print train loss every `report_frequency` epochs
    # TODO
    if epoch % report_frequency == 0:
        print(f'Epoch {epoch}, loss: {train_loss}')

UnboundLocalError: cannot access local variable 'd_x' where it is not associated with a value

In [None]:
# Compute test loss
# TODO

# Compute test accuracy
# TODO

# Print test loss and accuracy
# TODO