In [69]:
import numpy as np
class Tensor(object):
    def __init__(self, data, autograd=False, creators=None, creation_op=None, id=None):
        self.data = np.array(data)
        self.creation_op = creation_op
        self.creators = creators
        self.grad = None
        self.autograd = autograd
        self.children = {}
        if (id is None):
            id = np.random.randint(0, 100000)
        self.id = id
        
        if (creators is not None):
            for c in creators:
                if (self.id not in c.children):
                    c.children[self.id] = 1
                else:
                    c.children[self.id] += 1
    
    def all_children_grads_accounted_for(self):
        for id,cnt in self.children.items():
            if (cnt != 0):
                return False
        return True
        
    def backward(self, grad=None, grad_origin=None):
        if (self.autograd):
            if (grad is None):
                grad = Tensor(np.ones_like(self.data))
            
            if (grad_origin is not None):
                if (self.children[grad_origin.id] == 0):
                    return
                    print(self.id)
                    print(self.creation_op)
                    print(len(self.creators))
                    for c in self.creators:
                        print(c.creation_op)
                    raise Exception("cannot backprop more than once")
                else:
                    self.children[grad_origin.id] -= 1
            
            if (self.grad is None):
                self.grad = grad
            else:
                self.grad += grad
                
            assert grad.autograd == False
       
            if (self.creators is not None and (self.all_children_grads_accounted_for() or
                                               grad_origin is None)):
                if (self.creation_op == "add"):
                    self.creators[0].backward(grad, self)
                    self.creators[1].backward(grad, self)
                    
                if (self.creation_op == "neg"):
                    self.creators[0].backward(self.grad.__neg__())
                    
                if (self.creation_op == "sub"):
                    new = Tensor(self.grad.data)
                    self.creators[0].backward(new, self)
                    new = Tensor(self.grad.__neg__().data)
                    self.creators[1].backward(new, self)
                    
                if (self.creation_op == "mul"):
                    new = self.grad * self.creators[1]
                    self.creators[0].backward(new, self)
                    new = self.grad * self.creators[0]
                    self.creators[1].backward(new, self)
                    
                if (self.creation_op == "mm"):
                    act = self.creators[0]
                    weights = self.creators[1]
                    new = self.grad.mm(weights.transpose())
                    act.backward(new)
                    new = self.grad.transpose().mm(act).transpose()
                    weights.backward(new)
                    
                if (self.creation_op == "transpose"):
                    self.creators[0].backward(self.grad.transpose())
                    
                if (self.creation_op == "sigmoid"):
                    ones = Tensor(np.ones_like(self.grad.data))
                    self.creators[0].backward(self.grad * (self * (ones - self)))
                    
                if (self.creation_op == "tanh"):
                    ones = Tensor(np.ones_like(self.grad.data))
                    self.creators[0].backward(self.grad * (ones - (self * self)))
                    
                if (self.creation_op == "index_select"):
                    new_grad = np.zeros_like(self.creators[0].data)
                    indices_ = self.index_select_indices.data.flatten()
                    grad_ = grad.data.reshape(len(indices_), -1)
                    for i in range(len(indices_)):
                        new_grad[indices_[i]] += grad_[i]
                    self.creators[0].backward(Tensor(new_grad))
                    
                if (self.creation_op == "cross_entropy"):
                    dx = self.softmax_output - self.target_dist
                    self.creators[0].backward(Tensor(dx))
                    
                if ("sum" in self.creation_op):
                    dim = int(self.creation_op.split("_")[1])
                    ds = self.creators[0].data.shape[dim]
                    self.creators[0].backward(self.grad.expand(dim, ds))
                    
                if ("expand" in self.creation_op):
                    dim = int(self.creation_op.split("_")[1])
                    self.creators[0].backward(self.grad.sum(dim))
    
    def __add__(self, other):
        if (self.autograd and other.autograd):
            return Tensor(self.data + other.data, autograd=True, creators=[self,other],
                          creation_op="add")
        return Tensor(self.data + other.data)
    
    def __repr__(self):
        return str(self.data.__repr__())
    
    def __str__(self):
        return str(self.data.__str__())
    
    def __neg__(self):
        if (self.autograd):
            return Tensor(self.data * -1, autograd=True, creators=[self], creation_op="neg")
        return Tensor(self.data * -1)
    
    def __sub__(self, other):
        if (self.autograd and other.autograd):
            return Tensor(self.data - other.data, autograd=True, creators=[self,other], 
                          creation_op="sub")
        return Tensor(self.data - other.data)
    
    def __mul__(self, other):
        if (self.autograd and other.autograd):
            return Tensor(self.data * other.data, autograd=True, creators=[self,other],
                          creation_op="mul")
        return Tensor(self.data * other.data)
    
    def sum(self, dim):
        if (self.autograd):
            return Tensor(self.data.sum(dim), autograd=True, creators=[self],
                          creation_op="sum_"+str(dim))
        return Tensor(self.data.sum(dim))
    
    def expand(self, dim, copies):
        trans_cmd = list(range(0, len(self.data.shape)))
        trans_cmd.insert(dim, len(self.data.shape))
        new_shape = list(self.data.shape) + [copies]
        new_data = self.data.repeat(copies).reshape(new_shape)
        new_data = new_data.transpose(trans_cmd)
        
        if (self.autograd):
            return Tensor(new_data, autograd=True, creators=[self], creation_op="expand_"+str(dim))
        return Tensor(new_data)
    
    def transpose(self):
        if (self.autograd):
            return Tensor(self.data.transpose(), autograd=True, creators=[self],
                          creation_op="transpose")
        return Tensor(self.data.transpose())
    
    def mm(self, x):
        if (self.autograd):
            return Tensor(self.data.dot(x.data), autograd=True, creators=[self,x], creation_op="mm")
        return Tensor(self.data.dot(x.data))
    
    def sigmoid(self):
        if (self.autograd):
            return Tensor(1 / (1 + np.exp(-self.data)), autograd=True, creators=[self],
                          creation_op="sigmoid")
        return Tensor(1 / (1 + np.exp(-self.data)))
    
    def tanh(self):
        if (self.autograd):
            return Tensor(np.tanh(self.data), autograd=True, creators=[self], creation_op="tanh")
        return Tensor(np.tanh(self.data))
    
    ############################################################################
    def softmax(self):
        temp = np.exp(self.data)
        softmax_output = temp / np.sum(temp, axis=len(self.data.shape)-1, keepdims=True)
        return softmax_output
    ############################################################################
    
    def index_select(self, indices):
        if (self.autograd):
            new = Tensor(self.data[indices.data], autograd=True, creators=[self], 
                         creation_op="index_select")
            new.index_select_indices = indices
            return new
        return Tensor(self.data[indices.data])
    
    def cross_entropy(self, target_indices):
        temp = np.exp(self.data)
        softmax_output = temp / np.sum(temp, axis=len(self.data.shape)-1, keepdims=True)
        
        t = target_indices.data.flatten()
        p = softmax_output.reshape(len(t), -1)
        target_dist = np.eye(p.shape[1])[t]
        loss = -(np.log(p) * (target_dist)).sum(1).mean()
        
        if (self.autograd):
            out = Tensor(loss, autograd=True, creators=[self], creation_op="cross_entropy")
            out.softmax_output = softmax_output
            out.target_dist = target_dist
            return out
        
        return Tensor(loss)
    
    
class SGD(object):
    
    def __init__(self, parameters, alpha=0.1):
        self.parameters = parameters
        self.alpha = alpha
        
    def zero(self):
        for p in self.parameters:
            p.grad.data *= 0
            
    def step(self, zero=True):
        for p in self.parameters:
            p.data -= p.grad.data * self.alpha
            
            if (zero):
                p.grad.data *= 0
           
        
class Layer(object):
    
    def __init__(self):
        self.parameters = list()
        
    def get_parameters(self):
        return self.parameters
    
    
class Linear(Layer):
    
    def __init__(self, n_inputs, n_outputs, bias=True):
        super().__init__()
        
        self.use_bias = bias
        
        W = np.random.randn(n_inputs, n_outputs)*np.sqrt(2.0/(n_inputs))
        self.weight = Tensor(W, autograd=True)
        if (self.use_bias):
            self.bias = Tensor(np.zeros(n_outputs), autograd=True)
        
        self.parameters.append(self.weight)
        
        if (self.use_bias):
            self.parameters.append(self.bias)
        
    def forward(self, inp):
        if (self.use_bias):
            return inp.mm(self.weight) + self.bias.expand(0, len(inp.data))
        return inp.mm(self.weight)
    
    
class Sequential(Layer):
    def __init__(self, layers=list()):
        super().__init__()
        
        self.layers = layers
    
    def add(self, layer):
        self.layers.append(layer)
    
    def forward(self, inp):
        for layer in self.layers:
            inp = layer.forward(inp)
        return inp
    
    def get_parameters(self):
        params = list()
        for l in self.layers:
            params += l.get_parameters()
        return params
    
    
class MSELoss(Layer):
    
    def __init__(self):
        super().__init__()
        
    def forward(self, pred, target):
        return ((pred - target) * (pred - target)).sum(0)
    
    
class Tanh(Layer):
    def __init__(self):
        super().__init__()
    
    def forward(self, inp):
        return inp.tanh()
    
    
class Sigmoid(Layer):
    def __init__(self):
        super().__init__()
        
    def forward(self, inp):
        return inp.sigmoid()

    
class Embedding(Layer):
    def __init__(self, vocab_size, dim):
        super().__init__()
        
        self.vocab_size = vocab_size
        self.dim = dim
        
        weight = (np.random.rand(vocab_size, dim) - 0.5) / dim
        
        # with index_select method created we can forward prop
        self.weight = Tensor(weight, autograd=True)
        self.parameters.append(self.weight)
        
    def forward(self, inp):
        return self.weight.index_select(inp)
    
    
class CrossEntropyLoss(object):
    def __init__(self):
        super().__init__()
        
    def forward(self, inp, target):
        return inp.cross_entropy(target)
    
    
class RNNCell(Layer):
    def __init__(self, n_inputs, n_hidden, n_output, activation='sigmoid'):
        super().__init__()
        
        self.n_inputs = n_inputs
        self.n_hidden = n_hidden
        self.n_output = n_output
        
        if (activation == 'sigmoid'):
            self.activation = Sigmoid()
        elif (activation == 'tanh'):
            self.activation = Tanh()
        else:
            raise Exception("Non-linearity not found")
            
        self.w_ih = Linear(n_inputs, n_hidden)
        self.w_hh = Linear(n_hidden, n_hidden)
        self.w_ho = Linear(n_hidden, n_output)
        
        self.parameters += self.w_ih.get_parameters()
        self.parameters += self.w_hh.get_parameters()
        self.parameters += self.w_ho.get_parameters()
        
    def forward(self, inp, hidden):
        from_prev_hidden = self.w_hh.forward(hidden)
        combined = self.w_ih.forward(inp) + from_prev_hidden
        new_hidden = self.activation.forward(combined)
        output = self.w_ho.forward(new_hidden)
        return output, new_hidden
    
    def init_hidden(self, batch_size=1):
        return Tensor(np.zeros((batch_size, self.n_hidden)), autograd=True)

In [37]:
import sys, random, math
from collections import Counter
import numpy as np

f = open('shakespeare.txt', 'r')
raw = f.read()
f.close()

vocab = list(set(raw))
word2index = {}
for i,word in enumerate(vocab):
    word2index[word] = i
indices = np.array(list(map(lambda x:word2index[x], raw)))

In [38]:
embed = Embedding(vocab_size=len(vocab), dim=512)
model = RNNCell(n_inputs=512, n_hidden=512, n_output=len(vocab))

criterion = CrossEntropyLoss()
optim = SGD(parameters=model.get_parameters() + embed.get_parameters(), alpha=0.05)

In [39]:
batch_size = 32
bptt = 16
n_batches = int((indices.shape[0] / (batch_size)))

In [40]:
trimmed_indices = indices[:n_batches*batch_size]
batched_indices = trimmed_indices.reshape(batch_size, n_batches)
batched_indices = batched_indices.transpose()

input_batched_indices = batched_indices[0:-1]
target_batched_indices = batched_indices[1:]

n_bptt = int(((n_batches-1) / bptt))
input_batches = input_batched_indices[:n_bptt*bptt]
input_batches = input_batches.reshape(n_bptt, bptt, batch_size)
target_batches = target_batched_indices[:n_bptt*bptt]
target_batches = target_batches.reshape(n_bptt, bptt, batch_size)

In [41]:
print(raw[0:5])
print(indices[0:5])

First
[29 10 50 23 60]


In [42]:
print(batched_indices[0:5])

[[29 63 30  5 48 35 30  3 30 30 23 38 16 27  3  3  8  8  3  8 58 16 16 30
  16  8  8 61 52  8 19 58]
 [10  3 32  3  0  2  7 52  5 63  8  3  0 24  7 37 30 59 25 25 60 16 23  7
  16 59 18 50 53  8 30 53]
 [50 50  3 23 30 46 58 30  3  3 58 50 27  5 14 48 63  0  0  8 30 30 30  3
  30 27 30  3  5 30 63  8]
 [23 30 50  8  3 13 25 38 38  3  0 23 31 58 10 41 16 30 30  7 10 37 14 50
   3 61 10 39 30 14 16 50]
 [60  2 10 30 50 17 30  8 30 60 27  8  3 60 25 30 58 63 63 19 23  3  8  8
  25  8 63  3 19  8 58  0]]


In [43]:
def train(iterations=100):
    for iter in range(iterations):
        total_loss = 0
        n_loss = 0
        
        hidden = model.init_hidden(batch_size=batch_size)
        for batch_i in range(len(input_batches)):
            
            hidden = Tensor(hidden.data, autograd=True)
            loss = None
            losses = list()
            for t in range(bptt):
                inp = Tensor(input_batches[batch_i][t], autograd=True)
                rnn_input = embed.forward(inp=inp)
                output, hidden = model.forward(inp=rnn_input, hidden=hidden)
                
                target = Tensor(target_batches[batch_i][t], autograd=True)
                batch_loss = criterion.forward(output, target)
                losses.append(batch_loss)
                if (t == 0):
                    loss = batch_loss
                else:
                    loss = loss + batch_loss
            for loss in losses:
                ""
            loss.backward()
            optim.step()
            total_loss += loss.data
            log = "\r Iter: " + str(iter)
            log += " - Batch " + str(batch_i+1) + "/" + str(len(input_batches))
            log += " - Loss: " + str(np.exp(total_loss / (batch_i+1)))
            if (batch_i == 0):
                log += " - " + generate_sample(70, '\n'.replace("\n", " "))
            if (batch_i % 10 == 0 or batch_i-1 == len(input_batches)):
                sys.stdout.write(log)
        optim.alpha *= 0.99
        print()
        
def generate_sample(n=30, init_char=' '):
    s = ""
    hidden = model.init_hidden(batch_size=1)
    inp = Tensor(np.array([word2index[init_char]]))
    for i in range(n):
        rnn_input = embed.forward(inp)
        output, hidden = model.forward(inp=rnn_input, hidden=hidden)
        output.data *= 10
        temp_dist = output.softmax()
        temp_dist /= temp_dist.sum()
        
        m = (temp_dist > np.random.rand()).argmax()
        c = vocab[m]
        inp = Tensor(np.array([m]))
        s += c
        
    return s

In [44]:
train()

 Iter: 0 - Batch 2171/2178 - Loss: 13.659292582387012 ,   ,              ,            ,    ,               ,  ,           
 Iter: 1 - Batch 2171/2178 - Loss: 8.849821110311202 ther and and and and and and and and and and and and and and and and a
 Iter: 2 - Batch 2171/2178 - Loss: 7.8622356204813715he so s,er and and and and and and and and and and and ,o h, and and 
 Iter: 3 - Batch 2171/2178 - Loss: 7.2725368280760465, word and and and and and and ,o ,o ,o steave and ,o ,o ,o ,o stea, 
 Iter: 4 - Batch 2171/2178 - Loss: 6.8490762174687035he somes and ,o ,o ,o s,eath the ,o so so ,o son ,o ,o s,eath the son
 Iter: 5 - Batch 2171/2178 - Loss: 6.5129394450884655onges and and ,o have and the so, and ,o st, and and the so, and the 
 Iter: 6 - Batch 2171/2178 - Loss: 6.2335223021547965nd and the souse ,o bear and so the so, and the so, and so ,o so the
 Iter: 7 - Batch 2171/2178 - Loss: 5.9953072949702145, and the souse ,o s,, and the souse ,o stear and the souse and the s
 Iter: 8 - Batch

 Iter: 60 - Batch 2171/2178 - Loss: 2.6511611700861915
 Iter: 61 - Batch 2171/2178 - Loss: 2.6268951387893665nswer his face is the worst cousing ,ing h, were it ,ishous far one i
 Iter: 62 - Batch 2171/2178 - Loss: 2.5955159215078845a souls ,o the wor, and then, not an old be conscor, and then, not be 
 Iter: 63 - Batch 2171/2178 - Loss: 2.5543273260791914ou best us the words, we have ep,o have a mean thou , all the sup ,o 
 Iter: 64 - Batch 1/2178 - Loss: 2.768538778381009 - a,
As I see his hangman:
 Iter: 64 - Batch 2171/2178 - Loss: 2.5455763033154697
 Iter: 65 - Batch 2171/2178 - Loss: 2.5400721002849923nswer, and all the sup the s, what I would ,ill's enterselves and wor
 Iter: 66 - Batch 2171/2178 - Loss: 2.5440454844255095an a ,ime, and whe, and whe, and w, and won awfixchise and wonder the 
 Iter: 67 - Batch 2171/2178 - Loss: 2.4928955805864996re bus not be rest me and words again. I know not be rest me and word
 Iter: 68 - Batch 2171/2178 - Loss: 2.5108663151053873 see his for

In [46]:
print(generate_sample(n=2000, init_char='\n'))

Answer and love shall p,
As I bes, thou hast unto himse would be courter o, the shep,
As I sent her and world,
I wo, the such she sent from and world,
I won, besul and I comes before but not ,well have , and ,, a, send t, there is the name is sements and my s, and my wi, her
being sir, work in the worst, how in the worst, how in the worst, how in the bed,
And I find are , as I su, the such she sent her and ,u, her
bentreat you ,
And there ,'st my grace.

ROMEO:
And his grucking bound ,'bould be courter of my see his , besued' the sheptern thou hast unto him w, and regens, and my wife, hereafer the such she sent from and world,
I won, besul and I comes before but not ,well hope hath past, on the name is , ,'Tis such , be, and my , are I love shall part with the ,using of , as I sent thou not
felth!

VOLUMNIAN:
Uldespy in the grance. What a death,
In all, awake! her
bring should be ,iture, the such she sent ,' is grace of could be courter o, the,, and my will be debent h, who do t, and ,

# Small backpropagation with RNN

In [47]:
(sigmoid, relu) = (lambda x:1/(1-np.exp(-x)), lambda x:(x>0).astype(float)*x)
weights = np.array([[1,4],[4,1]])
activation = sigmoid(np.array([1,0.01]))

print("Sigmoid Activations")
activations = list()
for iter in range(10):
    activation = sigmoid(activation.dot(weights))
    activations.append(activation)
    print(activation)
print("\nSigmoid Gradient")
gradient = np.ones_like(activation)
for activation in reversed(activations):
    gradient = (activation * (1 - activation) * gradient)
    gradient = gradient.dot(weights.transpose())
    print(gradient)

print("Activations")
activations = list()
for iter in range(10):
    activation = relu(activation.dot(weights))
    activations.append(activation)
    print(activation)
print("\nGradients")
gradient = np.ones_like(activation)
for activation in reversed(activations):
    gradient = ((activation > 0) * gradient).dot(weights.transpose())
    print(gradient)

Sigmoid Activations
[1. 1.]
[1.00678365 1.00678365]
[1.00655594 1.00655594]
[1.00656346 1.00656346]
[1.00656321 1.00656321]
[1.00656322 1.00656322]
[1.00656322 1.00656322]
[1.00656322 1.00656322]
[1.00656322 1.00656322]
[1.00656322 1.00656322]

Sigmoid Gradient
[-0.03303147 -0.03303147]
[0.00109108 0.00109108]
[-3.60399005e-05 -3.60399005e-05]
[1.19045078e-06 1.19045078e-06]
[-3.93223372e-08 -3.93223372e-08]
[1.2988729e-09 1.2988729e-09]
[-4.29052588e-11 -4.29052588e-11]
[1.41564198e-12 1.41564198e-12]
[-4.83418583e-14 -4.83418583e-14]
[0. 0.]
Activations
[5. 5.]
[25. 25.]
[125. 125.]
[625. 625.]
[3125. 3125.]
[15625. 15625.]
[78125. 78125.]
[390625. 390625.]
[1953125. 1953125.]
[9765625. 9765625.]

Gradients
[5. 5.]
[25. 25.]
[125. 125.]
[625. 625.]
[3125. 3125.]
[15625. 15625.]
[78125. 78125.]
[390625. 390625.]
[1953125. 1953125.]
[9765625. 9765625.]


# LSTM

In [48]:
def forward(self, inp, hidden):
    prev_hidden, prev_cell = (hidden[0], hidden[1])
    
    f = (self.xf.forward(inp) + self.hf.forward(prev_hidden)).sigmoid() # 'forget' gate
    i = (self.xi.forward(inp) + self.hi.forward(prev_hidden)).sigmoid() # input gate
    o = (self.xo.forward(inp) + self.ho.forward(prev_hidden)).sigmoid() # output gate
    u = (self.xc.forward(inp) + self.hc.forward(prev_hidden)).tanh()    # update gate
    cell = (f * prev_cell) + (i * u)
    h = o * cell.tanh()
    output = self.w_ho.forward(h)
    return output, (h, cell)

In [70]:
class LSTMCell(Layer):
    def __init__(self, n_inputs, n_hidden, n_output):
        super().__init__()
        
        self.n_inputs = n_inputs
        self.n_hidden = n_hidden
        self.n_output = n_output
        
        self.xf = Linear(n_inputs, n_hidden)
        self.xi = Linear(n_inputs, n_hidden)
        self.xo = Linear(n_inputs, n_hidden)
        self.xc = Linear(n_inputs, n_hidden)
        self.hf = Linear(n_inputs, n_hidden, bias=False)
        self.hi = Linear(n_inputs, n_hidden, bias=False)
        self.ho = Linear(n_inputs, n_hidden, bias=False)
        self.hc = Linear(n_inputs, n_hidden, bias=False)
        
        self.w_ho = Linear(n_hidden, n_output, bias=False)
        
        self.parameters += self.xf.get_parameters()
        self.parameters += self.xi.get_parameters()
        self.parameters += self.xo.get_parameters()
        self.parameters += self.xc.get_parameters()
        self.parameters += self.hf.get_parameters()
        self.parameters += self.hi.get_parameters()
        self.parameters += self.ho.get_parameters()
        self.parameters += self.hc.get_parameters()
        
        self.parameters += self.w_ho.get_parameters()
        
    def forward(self, inp, hidden):
        prev_hidden = hidden[0]
        prev_cell = hidden[1]
    
        f = (self.xf.forward(inp) + self.hf.forward(prev_hidden)).sigmoid() # 'forget' gate
        i = (self.xi.forward(inp) + self.hi.forward(prev_hidden)).sigmoid() # input gate
        o = (self.xo.forward(inp) + self.ho.forward(prev_hidden)).sigmoid() # output gate
        g = (self.xc.forward(inp) + self.hc.forward(prev_hidden)).tanh()
        c = (f * prev_cell) + (i * g)
        h = o * c.tanh()
        
        output = self.w_ho.forward(h)
        return output, (h, c)
    
    def init_hidden(self, batch_size=1):
        h = Tensor(np.zeros((batch_size, self.n_hidden)), autograd=True)
        c = Tensor(np.zeros((batch_size, self.n_hidden)), autograd=True)
        h.data[:,0] += 1
        c.data[:,0] += 1
        return (h, c)

In [79]:
import sys, random, math
from collections import Counter
import numpy as np
import sys

np.random.seed(0)

f = open('shakespeare.txt', 'r')
raw = f.read()
f.close()

vocab = list(set(raw))
word2index = {}
for i,word in enumerate(vocab):
    word2index[word] = i
indices = np.array(list(map(lambda x:word2index[x], raw)))

embed = Embedding(vocab_size=len(vocab), dim=512)
model = LSTMCell(n_inputs=512, n_hidden=512, n_output=len(vocab))
model.w_ho.weight.data *= 0

criterion = CrossEntropyLoss()
optim = SGD(parameters=model.get_parameters() + embed.get_parameters(), alpha=0.05)

batch_size = 16
bptt = 25
n_batches = int((indices.shape[0] / (batch_size)))

trimmed_indices = indices[:n_batches*batch_size]
batched_indices = trimmed_indices.reshape(batch_size, n_batches)
batched_indices = batched_indices.transpose()

input_batched_indices = batched_indices[0:-1]
target_batched_indices = batched_indices[1:]

n_bptt = int(((n_batches - 1) / bptt))
input_batches = input_batched_indices[:n_bptt*bptt]
input_batches = input_batches.reshape(n_bptt, bptt, batch_size)
target_batches = target_batched_indices[:n_bptt*bptt]
target_batches = target_batches.reshape(n_bptt, bptt, batch_size)
min_loss = 1000

In [80]:
def train(iterations=100):
    for iter in range(iterations):
        total_loss, n_loss = (0,0)

        hidden = model.init_hidden(batch_size=batch_size)
        batches_to_train = len(input_batches)

        for batch_i in range(batches_to_train):

            hidden = (Tensor(hidden[0].data, autograd=True), Tensor(hidden[1].data, autograd=True))
            losses = list()

            for t in range(bptt):
                inp = Tensor(input_batches[batch_i][t], autograd=True)
                rnn_input = embed.forward(inp=inp)
                output, hidden = model.forward(inp=rnn_input, hidden=hidden)

                target = Tensor(target_batches[batch_i][t], autograd=True)
                batch_loss = criterion.forward(output, target)

                if (t == 0):
                    losses.append(batch_loss)
                else:
                    losses.append(batch_loss + losses[-1])    
            loss = losses[-1]

            loss.backward()
            optim.step()

            total_loss += loss.data / bptt
            epoch_loss = np.exp(total_loss / (batch_i + 1))
            if (epoch_loss < min_loss):
                min_loss = epoch_loss
                print()
            log = "\r Iter: " + str(iter)
            log += " - Alpha: " + str(optim.alpha)[0:5]
            log += " - Batch " + str(batch_i+1) + "/" + str(len(input_batches))
            log += " - Min Loss: " + str(min_loss)[0:5]
            log += " - Loss: " + str(epoch_loss)
            if (batch_i == 0):
                s = generate_sample(n=70, init_char="T").replace("\n", " ")
                log += " - " + s
            if (batch_i % 1 == 0):
                sys.stdout.write(log)
        optim.alpha *= 0.99

In [81]:
train(10)

UnboundLocalError: local variable 'min_loss' referenced before assignment

In [None]:
def generate_sample(n=30, init_char=' '):
    s = ""
    hidden = model.init_hidden(batch_size=1)
    inp = Tensor(np.array([word2index[init_char]]))
    for i in range(n):
        rnn_input = embed.forward(inp)
        output, hidden = model.forward(inp=rnn_input, hidden=hidden)
        output.data *= 15
        temp_dist = output.softmax()
        temp_dist /= temp_dist.sum()
        
        m = output.data.argmax()
        c = vocab[m]
        inp = Tensor(np.array([m]))
        s += c
    return s