# Base class for layers

In [2]:
# Import
import numpy as np
import matplotlib.pyplot as plt

In [15]:
class Module:
    """
    Base class for character-level recurrent layers.

    Holds the bookkeeping shared by the concrete models: a dictionary of
    parameters (weights, biases, gradients and Adagrad memory), a dictionary
    of hidden states, the Adagrad update rule and the training loop.

    Subclasses are expected to provide ``forward(xs, targets)`` and
    ``backward(xs, targets)``; ``train`` calls both. ``train`` and ``predict``
    also rely on the module-level lookups ``char_to_idx`` / ``idx_to_char``.
    """

    def __init__(self, seq_len, hidden_sz, vocab_sz, weights, hidden):
        """
        Build the parameter and hidden-state dictionaries and initialize them.

        seq_len:   number of characters per training sequence
        hidden_sz: number of units in the hidden state
        vocab_sz:  number of distinct characters
        weights:   iterable with the names of the weights and biases to create
        hidden:    iterable with the names of the hidden (and cell) states
        """
        self.seq_len = seq_len
        self.hidden_sz = hidden_sz
        self.vocab_sz = vocab_sz

        self.params = {}  # Dictionary of parameters including weights and gradients
        self.hidden = {}  # Dictionary of hidden states
        self.sm_ps = {}   # Dictionary of softmax probability values

        self.make_hidden_dict(hidden)
        self.make_params_dict(weights)

        # Initialize weights, hidden and cell states.
        self.init_weights()
        self.reset_hidden()

        # Start with zero loss
        self.loss = 0

        # Start with no adagrad memory setup
        self.adagrad_mem = False

    @staticmethod
    def _is_bias(name):
        """
        Tell whether a parameter name denotes a bias.

        Mirrors the branch order of make_params_dict: 'Why' and every 'Wx*'
        name are weights, any other name containing 'B' is a bias.
        """
        return name != 'Why' and not name.startswith('Wx') and 'B' in name

    @staticmethod
    def _value_key(entry):
        """
        Return the key ('weight' or 'bias') under which an entry stores its values.
        """
        return 'weight' if 'weight' in entry else 'bias'

    def make_params_dict(self, weights):
        """
        Set up the dictionary to contain the model weights, biases and gradients.
        The shape of every parameter is derived from its name:

        'Why'          -> (vocab_sz, hidden_sz)
        'Wx...'        -> (hidden_sz, vocab_sz)
        'By'           -> (vocab_sz, 1)
        other biases   -> (hidden_sz, 1)
        anything else  -> (hidden_sz, hidden_sz)
        """
        for weight in weights:
            if weight == 'Why':
                size = (self.vocab_sz, self.hidden_sz)
            elif weight.startswith('Wx'):
                size = (self.hidden_sz, self.vocab_sz)
            elif weight == 'By':
                size = (self.vocab_sz, 1)
            elif self._is_bias(weight):
                size = (self.hidden_sz, 1)
            else:
                size = (self.hidden_sz, self.hidden_sz)

            self.params[weight] = {'size': size}

    def init_weights(self):
        """
        Initializes weights (small random values) and biases (zeros) using the
        sizes stored by make_params_dict.
        """
        for name, entry in self.params.items():
            rows, cols = entry['size']
            # Decide by name rather than by shape: with hidden_sz == 1 or
            # vocab_sz == 1 a weight matrix also has a single column and must
            # not be mistaken for a bias.
            if self._is_bias(name):
                entry['bias'] = np.zeros((rows, cols))
            else:
                entry['weight'] = np.random.randn(rows, cols) * 0.01

    def init_grads(self):
        """
        Initialize (or reset to zero) the gradients for biases and weights.
        """
        for entry in self.params.values():
            entry['grad'] = np.zeros_like(entry[self._value_key(entry)])

    def init_adagrad_mem(self):
        """
        Initialize memory matrices needed for Adagrad.
        """
        self.adagrad_mem = True

        for entry in self.params.values():
            entry['ada_mem'] = np.zeros_like(entry[self._value_key(entry)])

    def update_grads(self, optimizer, lr):
        """
        Apply one parameter update using the stored gradients.

        optimizer: name of the update rule; only 'Adagrad' is implemented
        lr:        learning rate

        Raises ValueError for any other optimizer name, so a misspelled name
        does not silently leave the model untrained. Gradients must already be
        present (see init_grads).
        """
        if optimizer != 'Adagrad':
            raise ValueError(
                f"Unsupported optimizer {optimizer!r}; only 'Adagrad' is implemented"
            )

        if not self.adagrad_mem:
            self.init_adagrad_mem()

        for entry in self.params.values():
            grad = entry['grad']
            mem = entry['ada_mem']
            # Accumulate the squared gradients in place
            mem += grad * grad
            # Scale the step of each element by its accumulated gradient history
            entry[self._value_key(entry)] += -lr * grad / np.sqrt(mem + 1e-8)

    def make_hidden_dict(self, hidden):
        """
        Create a dictionary with one (empty) time-step dictionary per hidden layer
        """
        for layer in hidden:
            self.hidden[layer] = {}

    def reset_hidden(self):
        """
        Reset hidden layers and possible cell state: the state at time step -1
        becomes a zero column vector.
        """
        for layer in self.hidden:
            self.hidden[layer][-1] = np.zeros((self.hidden_sz, 1))

    def plot_losses(self):
        """
        Plot the cross entropy loss against the number of training sequences
        """
        if hasattr(self, 'losses'):
            plt.plot(self.losses)
            plt.xlabel('Number of training sequences')
            plt.ylabel('Cross Entropy Loss')
            plt.show()
        else:
            print('Error: No losses recorded, train the model!')

    def train(self, data, optimizer, lr, epochs, progress=True):
        """
        Train the model by chopping the data in sequences followed by performing
        the forward pass, backward pass and update the gradients.

        data:      text to train on; every character must be a key of the
                   module-level char_to_idx
        optimizer: optimizer name passed on to update_grads
        lr:        learning rate
        epochs:    number of passes over the data
        progress:  print the smoothed loss every 1000 sequences when True

        The smoothed loss after every sequence is stored in self.losses.
        """
        self.losses = []
        smooth_loss = -np.log(1.0 / self.vocab_sz) * self.seq_len  # Loss at iteration 0

        # Every input character needs a successor as its target, so only
        # len(data) - 1 characters can serve as inputs. This keeps the last
        # sequence from ending up with fewer targets than inputs.
        sequences_amount = max((len(data) - 1) // self.seq_len, 0)

        # Loop over the amount of epochs
        for epoch in range(epochs):
            # Reset hidden state
            self.reset_hidden()

            # Loop over amount of sequences in the data
            for n in range(sequences_amount):
                start_pos = self.seq_len * n
                end_pos = start_pos + self.seq_len

                # Embed the inputs and targets (targets are the inputs shifted by one)
                xs = [char_to_idx[ch] for ch in data[start_pos:end_pos]]
                targets = [char_to_idx[ch] for ch in data[start_pos + 1:end_pos + 1]]

                # Forward pass
                self.forward(xs, targets)

                # Backward pass
                self.backward(xs, targets)

                # Update weight matrices
                self.update_grads(optimizer, lr)

                smooth_loss = smooth_loss * 0.999 + self.loss * 0.001

                if progress and n % 1000 == 0:
                    print(f'Epoch {epoch + 1}: {n} / {sequences_amount}: {smooth_loss}')

                self.losses.append(smooth_loss)

    def predict(self, start, n):
        """
        Predict a sequence of text based on a starting string.

        start: non-empty seed text; only its last character is fed to the model
        n:     number of characters to sample

        Returns a list with the characters of start followed by the n sampled
        characters.

        Performs a plain tanh recurrent step and reads the parameters 'Wxh',
        'Whh', 'Why', 'Bh', 'By' and the hidden state 'hs' from the
        dictionaries of this class.
        NOTE(review): these names follow the naming used by make_params_dict;
        confirm they match the names the RNN subclass registers.
        """
        seed_idx = char_to_idx[start[-1]]
        x = np.zeros((self.vocab_sz, 1))
        x[seed_idx] = 1

        w_xh = self.params['Wxh']['weight']
        w_hh = self.params['Whh']['weight']
        w_hy = self.params['Why']['weight']
        b_h = self.params['Bh']['bias']
        b_y = self.params['By']['bias']

        h = self.hidden['hs'][-1]

        idxes = []
        for _ in range(n):
            # Calculate the hidden
            h = np.tanh(np.dot(w_xh, x) + np.dot(w_hh, h) + b_h)
            # Calculate y
            y = np.dot(w_hy, h) + b_y

            # Softmax probability; subtracting the maximum leaves the result
            # unchanged but keeps np.exp from overflowing
            exp_y = np.exp(y - np.max(y))
            sm_p = exp_y / np.sum(exp_y)

            # Determine character based on weighted probability (is using the softmax probability)
            idx = np.random.choice(self.vocab_sz, p=sm_p.ravel())
            idxes.append(idx)

            # Save X for next iteration
            x = np.zeros((self.vocab_sz, 1))
            x[idx] = 1

        return list(start) + [idx_to_char[idx] for idx in idxes]

In [20]:
# The hidden-state names and the weight names are constants that will move into
# the __init__ of the LSTM and RNN classes.
hidden = ['hs', 'cs']
weights = [
    'Whf', 'Wxf',
    'Whi', 'Wxi',
    'Whc', 'Wxc',
    'Who', 'Wxo',
    'Why',
    'Bf', 'Bi', 'Bc', 'Bo', 'By',
]
model = Module(seq_len=5, hidden_sz=100, vocab_sz=20, weights=weights, hidden=hidden)

In [19]:
# Inspect the parameter dictionary the constructor built: a size plus an
# initial weight or bias array for every parameter name.
print(model.params)

{'Whf': {'size': (100, 20), 'weight': array([[-0.00304976, -0.00675038,  0.01596411, ..., -0.00702607,
        -0.00609263, -0.00417255],
       [-0.01264808, -0.00246294, -0.01693385, ..., -0.0001602 ,
        -0.00238271, -0.01022185],
       [ 0.00686833, -0.00011165,  0.01000822, ...,  0.01092285,
        -0.00114947, -0.0118128 ],
       ...,
       [-0.00264961,  0.00739856,  0.00607594, ...,  0.01108103,
         0.01103244,  0.00167077],
       [-0.01177754, -0.01574602,  0.00912002, ..., -0.0038583 ,
        -0.01295316,  0.01479103],
       [ 0.00436472, -0.00978523,  0.00086325, ..., -0.00534984,
         0.01138948,  0.0050954 ]])}, 'Wxf': {'size': (100, 20), 'weight': array([[ 0.0105937 , -0.00605487,  0.00479676, ...,  0.00859244,
         0.00859256, -0.00942189],
       [ 0.0025631 ,  0.0069594 , -0.00420536, ..., -0.0166504 ,
        -0.00068816, -0.00756637],
       [-0.00016842, -0.00254862, -0.00710777, ...,  0.00517503,
        -0.006004  ,  0.00841172],
       ...

In [18]:
# Add a zero-filled 'grad' array, shaped like the weight or bias, to every parameter.
model.init_grads()

In [19]:
# Inspect the parameters again; entries that went through init_grads also hold a 'grad' array.
print(model.params)

{'Whf': {'size': (100, 20), 'weight': array([[ 1.44446053e-02,  2.06660422e-02,  6.50326560e-03, ...,
        -4.03576532e-03,  6.67477618e-03, -2.25928479e-03],
       [-7.84570139e-05, -1.05403123e-02,  8.88499052e-03, ...,
         4.22230090e-03,  1.47117127e-03,  3.72120459e-03],
       [ 5.33094703e-03, -1.46263679e-03, -1.37004501e-02, ...,
        -1.51425213e-02, -8.34850376e-03, -9.70735545e-03],
       ...,
       [-1.10486107e-02,  4.21135878e-04, -8.77209253e-04, ...,
         2.03694610e-03,  1.05244597e-02,  2.36969916e-03],
       [-3.79504660e-03,  2.23617332e-02,  5.24531455e-03, ...,
        -3.91039904e-03, -2.67009575e-03,  2.05498165e-02],
       [ 1.71932254e-02, -2.83603611e-03, -1.44486653e-02, ...,
         1.57463340e-02,  6.99741730e-03, -4.46841916e-03]]), 'grad': array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0.,