# DataLoader

In [4]:
import torch
import torch.nn as nn
import torchvision
import torchvision.transforms as transforms
import time
import numpy as np
from npRnn import npRnn
import cpp as cpp

seq_length = 28
input_size = 28
hidden_size = 128
num_layers = 1
num_classes = 10
batch_size = 1
num_epochs = 1
learning_rate = 0.01
start_time = 0

data_path = "../../data/mnist"
train_dataset = torchvision.datasets.MNIST(root=data_path, train=True, transform=transforms.ToTensor(), download=True)
test_dataset = torchvision.datasets.MNIST(root=data_path, train=False, transform=transforms.ToTensor())

train_loader = torch.utils.data.DataLoader(dataset=train_dataset, batch_size=batch_size, shuffle=True)
test_loader = torch.utils.data.DataLoader(dataset=test_dataset, batch_size=batch_size, shuffle=False)

# Initialize

In [5]:
def xavier_init(c1, c2, w=1, h=1, fc=False):
    fan_1 = c2 * w * h
    fan_2 = c1 * w * h
    ratio = np.sqrt(6.0 / (fan_1 + fan_2))
    params = ratio * (2 * np.random.random((c1, c2, w, h)) - 1)
    if fc:
        params = params.reshape(c1, c2)
    return params
    
np_U = xavier_init(hidden_size, input_size, fc=True)
np_W = xavier_init(hidden_size, hidden_size, fc=True)
np_V = xavier_init(hidden_size, hidden_size, fc=True)
np_FC_W = xavier_init(num_classes, hidden_size, fc=True)
U = cpp.cppTensor(np_U.reshape(hidden_size, input_size))
W = cpp.cppTensor(np_W.reshape(hidden_size, hidden_size))
V = cpp.cppTensor(np_V.reshape(hidden_size, hidden_size))
FC_W = cpp.cppTensor(np_FC_W)

total_step = len(train_loader)
interval = 5000

np_iter_loss = 0
cpu_iter_loss = 0
gpu_iter_loss = 0

# Numpy Rnn Train

In [6]:
print("start train numpy hidden_size {}".format(hidden_size))

np_model = npRnn(learning_rate, seq_length, hidden_size, num_classes, np_U, np_W, np_V, np_FC_W)
start_time = time.time()
for epoch in range(num_epochs):
    for i, (train_images, train_labels) in enumerate(train_loader):
        np_images = train_images.reshape(seq_length, batch_size, input_size).detach().numpy()
        np_labels = train_labels.detach().numpy()
        np_hprev = np.zeros((hidden_size, 1))

        np_outputs = np_model.forward(np_images, np_hprev)
        np_Y, np_loss = np_model.cross_entropy_loss(np_outputs, np_labels)
        np_dY = np_model.deriv_softmax(np_Y, np_labels)
        np_gradients = np_model.backward(np_dY)
        np_model.optimizer_step(np_gradients)
        np_iter_loss += np.sum(np_loss)

        if (i + 1) % interval == 0:
            print("numpy epoch {}/{} iter {}/{} loss {:.4f}".format(epoch + 1, num_epochs, i + 1, total_step, np_iter_loss / interval))
            print("elased time {}".format(time.time() - start_time))
            start_time = time.time()
            np_iter_loss = 0

start train numpy hidden_size 128
numpy epoch 1/1 iter 5000/60000 loss 1.5735
elased time 12.266170740127563
numpy epoch 1/1 iter 10000/60000 loss 1.1819
elased time 13.246779441833496
numpy epoch 1/1 iter 15000/60000 loss 0.9659
elased time 13.423335552215576
numpy epoch 1/1 iter 20000/60000 loss 0.7735
elased time 13.36319351196289
numpy epoch 1/1 iter 25000/60000 loss 0.7409
elased time 13.136044263839722
numpy epoch 1/1 iter 30000/60000 loss 0.6307
elased time 13.6442551612854
numpy epoch 1/1 iter 35000/60000 loss 0.6360
elased time 13.36297082901001
numpy epoch 1/1 iter 40000/60000 loss 0.5868
elased time 13.253770112991333
numpy epoch 1/1 iter 45000/60000 loss 0.6033
elased time 13.687143087387085
numpy epoch 1/1 iter 50000/60000 loss 0.5741
elased time 13.667681694030762
numpy epoch 1/1 iter 55000/60000 loss 0.5442
elased time 13.810617685317993
numpy epoch 1/1 iter 60000/60000 loss 0.5481
elased time 13.560202598571777


# CPU Rnn Train

In [7]:
print("start train numpy hidden_size {}".format(hidden_size))

cpu_model = cpp.cppRnn(learning_rate, U, W, V, FC_W, seq_length, input_size, hidden_size, num_classes)
start_time = time.time()
for epoch in range(num_epochs):
    for i, (train_images, train_labels) in enumerate(train_loader):
        train_images = train_images.reshape(seq_length, batch_size, input_size).detach().numpy()
        train_labels = train_labels.detach().numpy()
        train_hprev = np.zeros((hidden_size, 1))

        cpu_images = [cpp.cppTensor(train_images[j]) for j in range(len(train_images))]
        cpu_hprev = cpp.cppTensor(train_hprev)
        cpu_labels = cpp.cppTensor(train_labels)

        cpu_outputs = cpp.cppTensor(np.zeros((num_classes, 1)))
        cpu_Y = cpp.cppTensor(np.zeros((num_classes, 1)))
        cpu_dY = cpp.cppTensor(np.zeros((num_classes, 1)))
        cpu_loss = cpp.cppTensor(np.zeros((num_classes, 1)))

        cpu_model.forward(cpu_outputs, cpu_images, cpu_hprev)
        cpu_model.cross_entropy_loss(cpu_dY, cpu_Y, cpu_loss, cpu_outputs, cpu_labels)
        cpu_model.backward(cpu_dY)
        cpu_model.optimizer()

        cpu_iter_loss += np.sum(cpu_loss.numpy())

        if (i + 1) % interval == 0:
            print("cpu epoch {}/{} iter {}/{} loss {:.4f}".format(epoch + 1, num_epochs, i + 1, total_step, cpu_iter_loss / interval))
            print("elased time {}".format(time.time() - start_time))
            start_time = time.time()
            cpu_iter_loss = 0


start train numpy hidden_size 128
cpu epoch 1/1 iter 5000/60000 loss 1.5981
elased time 23.97615647315979
cpu epoch 1/1 iter 10000/60000 loss 0.9997
elased time 23.99291706085205
cpu epoch 1/1 iter 15000/60000 loss 0.7095
elased time 24.202476978302002
cpu epoch 1/1 iter 20000/60000 loss 0.6657
elased time 24.817394256591797
cpu epoch 1/1 iter 25000/60000 loss 0.6062
elased time 24.52069640159607
cpu epoch 1/1 iter 30000/60000 loss 0.5609
elased time 23.417757511138916
cpu epoch 1/1 iter 35000/60000 loss 0.5040
elased time 23.6754629611969
cpu epoch 1/1 iter 40000/60000 loss 0.5008
elased time 23.760572910308838
cpu epoch 1/1 iter 45000/60000 loss 0.5100
elased time 23.97665572166443
cpu epoch 1/1 iter 50000/60000 loss 0.4457
elased time 24.099708557128906
cpu epoch 1/1 iter 55000/60000 loss 0.4729
elased time 23.84420609474182
cpu epoch 1/1 iter 60000/60000 loss 0.4747
elased time 23.813793182373047


# GPU Rnn Train

In [8]:
gpu_model = cpp.cppRnn(learning_rate, U, W, V, FC_W, seq_length, input_size, hidden_size, num_classes)
gpu_model.cuda()
print("start train gpu hidden_size {}".format(hidden_size))
start_time = time.time()
for epoch in range(num_epochs):
    for i, (train_images, train_labels) in enumerate(train_loader):
        train_images = train_images.reshape(seq_length, batch_size, input_size).detach().numpy()
        train_labels = train_labels.detach().numpy()
        train_hprev = np.zeros((hidden_size, 1))

        gpu_images = [cpp.cppTensor(train_images[j]) for j in range(len(train_images))]
        gpu_hprev = cpp.cppTensor(train_hprev)
        gpu_labels = cpp.cppTensor(train_labels)

        gpu_outputs = cpp.cppTensor(np.zeros((num_classes, 1)))
        gpu_Y = cpp.cppTensor(np.zeros((num_classes, 1)))
        gpu_dY = cpp.cppTensor(np.zeros((num_classes, 1)))
        gpu_loss = cpp.cppTensor(np.zeros((num_classes, 1)))

        [gpu_images[j].cuda() for j in range(len(gpu_images))]
        gpu_hprev.cuda()
        gpu_labels.cuda()
        
        gpu_outputs.cuda()
        gpu_Y.cuda()
        gpu_dY.cuda()
        gpu_loss.cuda()

        gpu_model.forward(gpu_outputs, gpu_images, gpu_hprev)
        gpu_model.cross_entropy_loss(gpu_dY, gpu_Y, gpu_loss, gpu_outputs, gpu_labels)
        gpu_model.backward(gpu_dY)
        gpu_model.optimizer()
        
        gpu_loss.cpu()
        gpu_iter_loss += np.sum(gpu_loss.numpy())

        if (i + 1) % interval == 0:
            print("gpu epoch {}/{} iter {}/{} loss {:.4f}".format(epoch + 1, num_epochs, i + 1, total_step, gpu_iter_loss / interval))
            print("elased time {}".format(time.time() - start_time))
            start_time = time.time()
            gpu_iter_loss = 0

start train gpu hidden_size 128
gpu epoch 1/1 iter 5000/60000 loss 1.2363
elased time 29.537625312805176
gpu epoch 1/1 iter 10000/60000 loss 0.8144
elased time 29.47009825706482
gpu epoch 1/1 iter 15000/60000 loss 0.6604
elased time 29.434409141540527
gpu epoch 1/1 iter 20000/60000 loss 0.5663
elased time 29.59179663658142
gpu epoch 1/1 iter 25000/60000 loss 0.5250
elased time 29.469311475753784
gpu epoch 1/1 iter 30000/60000 loss 0.5004
elased time 29.455142736434937
gpu epoch 1/1 iter 35000/60000 loss 0.4616
elased time 29.459095239639282
gpu epoch 1/1 iter 40000/60000 loss 0.4368
elased time 29.667620182037354
gpu epoch 1/1 iter 45000/60000 loss 0.4261
elased time 29.827187299728394
gpu epoch 1/1 iter 50000/60000 loss 0.4058
elased time 29.469065189361572
gpu epoch 1/1 iter 55000/60000 loss 0.3760
elased time 29.450764417648315
gpu epoch 1/1 iter 60000/60000 loss 0.3743
elased time 29.455721139907837


# Validation

In [9]:
np_correct = 0
np_total = 0
cpu_correct = 0
cpu_total = 0
gpu_correct = 0
gpu_total = 0

def softmax(x):
    e = np.exp(x)
    return e / np.sum(e)

def predict(outputs):
    return np.argmax(softmax(outputs), 0)

gpu_model.cpu()

for test_images, test_labels in test_loader:
    np_images = test_images.reshape(seq_length, batch_size, input_size).detach().numpy()
    np_hprev = np.zeros((hidden_size, 1))
    labels = test_labels.detach().numpy()

    cpu_images = [cpp.cppTensor(np_images[j]) for j in range(len(np_images))]
    cpu_hprev = cpp.cppTensor(np_hprev)
    cpu_outputs = cpp.cppTensor(np.zeros((num_classes, 1)))

    gpu_images = [cpp.cppTensor(np_images[j]) for j in range(len(np_images))]
    gpu_hprev = cpp.cppTensor(np_hprev)
    gpu_outputs = cpp.cppTensor(np.zeros((num_classes, 1)))

    np_outputs = np_model.forward(np_images, np_hprev)
    np_pred = predict(np_outputs)

    np_total += labels.shape[0]
    np_correct += (np_pred == labels).sum().item()

    cpu_model.forward(cpu_outputs, cpu_images, cpu_hprev)
    cpu_pred = predict(cpu_outputs.numpy())

    cpu_total += labels.shape[0]
    cpu_correct += (cpu_pred == labels).sum().item()
    
    gpu_model.forward(gpu_outputs, gpu_images, gpu_hprev)
    gpu_pred = predict(gpu_outputs.numpy())

    gpu_total += labels.shape[0]
    gpu_correct += (gpu_pred == labels).sum().item()

print('np Accuracy of the model on the 10000 test images: {} %'.format(100 * np_correct / np_total))
print('cpu Accuracy of the model on the 10000 test images: {} %'.format(100 * cpu_correct / cpu_total))
print('gpu Accuracy of the model on the 10000 test images: {} %'.format(100 * gpu_correct / gpu_total))

np Accuracy of the model on the 10000 test images: 84.17 %
cpu Accuracy of the model on the 10000 test images: 88.21 %
gpu Accuracy of the model on the 10000 test images: 88.96 %


# Numpy LSTM

In [5]:
class My_LSTM(object):
    def __init__(self, x_size, hidden_size, num_classes):
        self.lr = learning_rate
        self.seq_length = seq_length
        self.input_size = x_size + hidden_size
        
        self.W_f = xavier_init(hidden_size, self.input_size, fc=True)
        self.b_f = np.zeros((hidden_size, 1))
        
        self.W_i = xavier_init(hidden_size, self.input_size, fc=True)
        self.b_i = np.zeros((hidden_size, 1))
        
        self.W_g = xavier_init(hidden_size, self.input_size, fc=True)
        self.b_g = np.zeros((hidden_size, 1))
        
        self.W_o = xavier_init(hidden_size, self.input_size, fc=True)
        self.b_o = np.zeros((hidden_size, 1))
        
        self.W_fc = xavier_init(num_classes, hidden_size, fc=True)
        self.b_fc = np.zeros((num_classes, 1))
        
        self.mW_f = np.zeros_like(self.W_f)
        self.mb_f = np.zeros_like(self.b_f)
        
        self.mW_i = np.zeros_like(self.W_i)
        self.mb_i = np.zeros_like(self.b_i)
        
        self.mW_g = np.zeros_like(self.W_g)
        self.mb_g = np.zeros_like(self.b_g)
        
        self.mW_o = np.zeros_like(self.W_o)
        self.mb_o = np.zeros_like(self.b_o)
        
        self.mW_fc = np.zeros_like(self.W_fc)
        self.mb_fc = np.zeros_like(self.b_fc)
        
        self.X = {}
        self.F = {}
        self.F_A = {}
        
        self.I = {}
        self.I_A = {}
        
        self.G = {}
        self.G_A = {}
        
        self.O = {}
        self.O_A = {}
        
        self.C = {}
        self.C_A = {}
        self.H = {}
        
    def forward(self, x, hprev, cprev):
        self.X = {}
        self.F = {}
        self.F_A = {}
        
        self.I = {}
        self.I_A = {}
        
        self.G = {}
        self.G_A = {}
        
        self.O = {}
        self.O_A = {}
        
        self.C = {}
        self.C_A = {}
        self.H = {}
        
        self.H[-1] = np.copy(hprev)
        self.C[-1] = np.copy(cprev)
        
        for t in range(self.seq_length):
            self.X[t] = np.concatenate((self.H[t-1], x[t].T), axis = 0)
            
            self.F[t] = self.W_f @ self.X[t] + self.b_f
            self.F_A[t] = self.sigmoid(self.F[t])
            
            self.I[t] = self.W_i @ self.X[t] + self.b_i
            self.I_A[t] = self.sigmoid(self.I[t])
            
            self.G[t] = self.W_g @ self.X[t] + self.b_g
            self.G_A[t] = np.tanh(self.G[t])
            
            self.C[t] = self.F_A[t] * self.C[t - 1] + self.I_A[t] * self.G_A[t]
            self.C_A[t] = np.tanh(self.C[t])
            
            self.O[t] = self.W_o @ self.X[t] + self.b_o
            self.O_A[t] = self.sigmoid(self.O[t])
            
            self.H[t] = self.O_A[t] * self.C_A[t]
            
        output = self.W_fc @ self.H[self.seq_length - 1] + self.b_fc
        
        return output
    
    def backward(self, dY):
        dW_f, db_f = np.zeros_like(self.W_f), np.zeros_like(self.b_f)
        dW_i, db_i = np.zeros_like(self.W_i), np.zeros_like(self.b_i)
        dW_g, db_g = np.zeros_like(self.W_g), np.zeros_like(self.b_g)
        dW_o, db_o = np.zeros_like(self.W_o), np.zeros_like(self.b_o)
        dW_fc, db_fc = np.zeros_like(self.W_fc), np.zeros_like(self.b_fc)
        
        dH_next = np.zeros_like(self.H[0])
        dC_next = np.zeros_like(self.C[0])
        
        dW_fc = dY @ self.H[self.seq_length - 1].T
        db_fc = dY
        
        for t in reversed(range(self.seq_length)):
            dh = self.W_fc.T @ dY + dH_next
            
            dO_A = dh * self.C_A[t]
            dO = dO_A * (self.O_A[t] * (1 - self.O_A[t]))
            dW_o += dO @ self.X[t].T
            db_o += dO
            
            dC_A = self.O_A[t] * dh
            dC = dC_A * (1 - self.C_A[t] ** 2) + dC_next
            
            dF_A = dC * self.C[t - 1]
            dI_A = dC * self.G_A[t]
            dG_A = self.I_A[t] * dC
            dC_next = self.F_A[t] * dC
            
            dF = dF_A * (self.F_A[t] * (1 - self.F_A[t]))
            dW_f += dF @ self.X[t].T
            db_f += dF
            
            dI = dI_A * (self.I_A[t] * (1 - self.I_A[t]))
            dW_i += dI @ self.X[t].T
            db_i += dI
            
            dG = dG_A * (1 - self.G_A[t] ** 2)
            dW_g += dG @ self.X[t].T
            db_g += dG
            
            dX = self.W_f.T @ dF + self.W_i.T @ dI + self.W_g.T @ dG + self.W_o.T @ dO
            dH_next = dX[:hidden_size, :]
        
        gradients = [dW_f, db_f, dW_i, db_i, dW_g, db_g, dW_o, db_o, dW_fc, db_fc]
        
        return gradients
    
    def optimizer_step(self, gradients):
        for dparam in gradients:
            np.clip(dparam, -5, 5, out=dparam)
        
        for param, dparam, mem in zip(
            [self.W_f, self.b_f, self.W_i, self.b_i, self.W_g, self.b_g, self.W_o, self.b_o, self.W_fc, self.b_fc],
            gradients,
            [self.mW_f, self.mb_f, self.mW_i, self.mb_i, self.mW_g, self.mb_g, self.mW_o, self.mb_o, self.mW_fc, self.mb_fc]):
            mem += dparam * dparam
            param += -self.lr * dparam / np.sqrt(mem + 1e-8)
            
    def cross_entropy_loss(self, outputs, labels):
        Y = self.softmax(outputs)
        loss = -np.log(Y) * self.one_hot_vector(Y, labels)
        return Y, loss
    
    def sigmoid(self, x):
        return 1 / (1 + np.exp(-x))
    
    def softmax(self, x):
        e = np.exp(x)
        return e / np.sum(e)
    
    def deriv_softmax(self, Y, labels):
        dY = np.copy(Y)
        for i in range(len(labels)):
            dY[labels[i]][i] -= 1
        return dY
    
    def one_hot_vector(self, Y, labels):
        out = np.zeros_like(Y)
        for i in range(len(labels)):
            out[labels[i]][i] = 1
        return out
    
    def predict(self, outputs):
        return np.argmax(self.softmax(outputs), 0)

In [6]:
model = My_LSTM(input_size, hidden_size, num_classes)

total_step = len(train_loader)
iter_loss = 0
interval = 10000
for epoch in range(num_epochs):
    for i, (images, labels) in enumerate(train_loader):
        images = images.reshape(seq_length, batch_size, input_size).detach().numpy()
        labels = labels.detach().numpy()
        
        hprev = np.zeros((hidden_size, 1))
        cprev = np.zeros((hidden_size, 1))
        outputs = model.forward(images, hprev, cprev)
        Y, loss = model.cross_entropy_loss(outputs, labels)
        gradients = model.backward(model.deriv_softmax(Y, labels))
        model.optimizer_step(gradients)
        iter_loss += np.sum(loss)
        if(i + 1) % interval == 0:
            print("epoch {}/{} iter {}/{} loss {:.4f}".format(epoch + 1, num_epochs, i + 1, total_step, iter_loss / interval))
            iter_loss = 0

epoch 1/2 iter 10000/60000 loss 0.8096
epoch 1/2 iter 20000/60000 loss 0.3663
epoch 1/2 iter 30000/60000 loss 0.3320
epoch 1/2 iter 40000/60000 loss 0.2763
epoch 1/2 iter 50000/60000 loss 0.2308
epoch 1/2 iter 60000/60000 loss 0.2385
epoch 2/2 iter 10000/60000 loss 0.2096
epoch 2/2 iter 20000/60000 loss 0.2044
epoch 2/2 iter 30000/60000 loss 0.1896
epoch 2/2 iter 40000/60000 loss 0.1874
epoch 2/2 iter 50000/60000 loss 0.1692
epoch 2/2 iter 60000/60000 loss 0.1658


In [7]:
correct = 0
total = 0
for images, labels in test_loader:
    images = images.reshape(seq_length, batch_size, input_size).detach().numpy()
    labels = labels.detach().numpy()
    
    hprev = np.zeros((hidden_size, 1))
    cprev = np.zeros((hidden_size, 1))
    outputs = model.forward(images, hprev, cprev)
    pred = model.predict(outputs)
    total += labels.shape[0]
    correct += (pred == labels).sum().item()

print('Test Accuracy of the model on the 10000 test images: {} %'.format(100 * correct / total)) 

Test Accuracy of the model on the 10000 test images: 94.93 %
