# Gated recurrent unit (GRU)

Gated recurrent units (GRUs) are a gating mechanism in recurrent neural networks, introduced in 2014 by Kyunghyun Cho et al.

The GRU is similar to a long short-term memory (LSTM) unit: it uses gates to decide which features to let in and which to forget, but it has no separate context vector and no output gate, so it has fewer parameters than an LSTM.

Further reading:
- [Wikipedia - GRUs](https://en.wikipedia.org/wiki/Gated_recurrent_unit)
- [PyTorch - RNN](https://pytorch.org/docs/stable/generated/torch.nn.RNN.html)
- [PyTorch - GRU](https://pytorch.org/docs/stable/generated/torch.nn.GRU.html)


# Imports

In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import DataLoader
import torchvision.datasets as datasets
import torchvision.transforms as transforms

# Device: run on the GPU when CUDA is usable, otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

# Model
In this application we treat each image of the MNIST dataset (28x28) as a sequence of 28 time steps with 28 features each. Of course, normally we wouldn't use an RNN for images; there are much better architectures for image-related tasks.

In [2]:
# Hyperparameters

# Data geometry: each 28x28 image is read as 28 time steps of 28 features.
sequence_length = 28
input_size = 28

# Network size.
hidden_size = 256   # width of the GRU hidden state
num_layers = 2      # number of stacked GRU layers
num_classes = 10    # size of the final linear layer's output

# Optimisation.
batch_size = 64
learning_rate = 1e-3
num_epochs = 2
    
# Create the model
class RNN(nn.Module):
    """GRU classifier that uses the hidden state of every time step.

    The GRU output for all time steps is flattened and passed through a
    single linear layer, so the model only accepts sequences of exactly
    ``sequence_length`` steps.

    Args:
        input_size: Number of features per time step.
        hidden_size: Width of the GRU hidden state.
        num_layers: Number of stacked GRU layers.
        num_classes: Number of output scores per example.
        sequence_length: Number of time steps per example. Defaults to 28
            (one step per row of a 28x28 image) so existing four-argument
            calls keep working.
    """

    def __init__(self, input_size, hidden_size, num_layers, num_classes,
                 sequence_length=28):
        super().__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.sequence_length = sequence_length
        self.gru = nn.GRU(input_size, hidden_size, num_layers, batch_first=True)
        # The linear layer sees all time steps concatenated, hence the
        # hidden_size * sequence_length input width.
        self.fc = nn.Linear(hidden_size * sequence_length, num_classes)

    def forward(self, x):
        """Return class scores of shape (batch, num_classes).

        ``x`` must have shape (batch, sequence_length, input_size) because the
        GRU is built with ``batch_first=True``.
        """
        # Initial hidden state: (num_layers, batch, hidden_size). It is
        # created directly on the input's device/dtype instead of relying on
        # a module-level ``device`` variable, so the model works wherever the
        # input lives.
        h0 = torch.zeros(
            self.num_layers, x.size(0), self.hidden_size,
            device=x.device, dtype=x.dtype,
        )
        # The final hidden state is discarded: every example starts from its
        # own zero state, so nothing is carried over between calls.
        out, _ = self.gru(x, h0)
        # (batch, seq, hidden) -> (batch, seq * hidden)
        out = out.reshape(out.shape[0], -1)
        return self.fc(out)

# Load Data
- MNIST: 28x28 pixels
- Each batch produced by the data loader has shape (batch_size, 1, 28, 28)

In [7]:
# Datasets: MNIST is downloaded into 'dataset/' if it is not there yet, and
# every image is converted to a tensor when it is read.
train_dataset = datasets.MNIST(
    'dataset/', train=True, transform=transforms.ToTensor(), download=True
)
test_dataset = datasets.MNIST(
    'dataset/', train=False, transform=transforms.ToTensor(), download=True
)

# Loaders: both splits are served in shuffled mini-batches of `batch_size`.
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=True)

# Training

In [4]:
# Initialize the network and move it to the selected device
model = RNN(input_size, hidden_size, num_layers, num_classes)
model = model.to(device)

# Loss function and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

# Train the network
for epoch in range(num_epochs):
    for batch_idx, (data, targets) in enumerate(train_loader):
        # Drop the channel dimension, (N, 1, 28, 28) -> (N, 28, 28), so each
        # image becomes a sequence of rows, then move the batch to the device.
        data = data.squeeze(1).to(device)
        targets = targets.to(device)

        # Clear the gradients left over from the previous batch.
        optimizer.zero_grad()

        # Forward pass and loss
        scores = model(data)
        loss = criterion(scores, targets)

        # Backward pass followed by one Adam update
        loss.backward()
        optimizer.step()

# Performance

In [5]:
# Check accuracy on training and test sets
def check_accuracy(loader, model):
    """Print how many examples in ``loader`` the model classifies correctly.

    Args:
        loader: Iterable of ``(x, y)`` batches. Dimension 1 of ``x`` is
            squeezed away before the batch is passed to the model. If
            ``loader.dataset.train`` exists it selects the "training" or
            "test" header line; otherwise a generic header is printed.
        model: ``nn.Module`` returning one row of class scores per example.

    Returns:
        None. The result is only printed.

    The model is evaluated in eval mode and then put back into whatever mode
    (train or eval) it was in before the call. An empty loader is reported
    with an accuracy of 0.00 instead of raising ``ZeroDivisionError``.
    """
    is_train = getattr(getattr(loader, "dataset", None), "train", None)
    if is_train is None:
        print("Checking accuracy")
    elif is_train:
        print("Checking accuracy on training data")
    else:
        print("Checking accuracy on test data")

    # Evaluate on the device the model lives on rather than on a global
    # variable; a model without parameters leaves the batches where they are.
    first_param = next(model.parameters(), None)
    target_device = None if first_param is None else first_param.device

    num_correct = 0
    num_samples = 0
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            for x, y in loader:
                x = x.squeeze(1)
                if target_device is not None:
                    x = x.to(device=target_device)
                    y = y.to(device=target_device)

                scores = model(x)
                # scores is (batch, num_classes); the index of the largest
                # score along dim 1 is the predicted class.
                _, predictions = scores.max(1)
                num_correct += (predictions == y).sum()
                num_samples += predictions.size(0)
    finally:
        # Restore the caller's mode even if evaluation fails part-way.
        model.train(was_training)

    num_correct = int(num_correct)
    accuracy = float(num_correct) / float(num_samples) * 100 if num_samples else 0.0
    print(f'got {num_correct} / {num_samples} with accuracy {accuracy:.2f}')

In [6]:
# Report accuracy on the training split first, then on the test split.
for _loader in (train_loader, test_loader):
    check_accuracy(_loader, model)

Checking accuracy on training data
got 59305 / 60000 with accuracy 98.84
Checking accuracy on test data
got 9863 / 10000 with accuracy 98.63
