In [3]:
import numpy as np
import pandas as pd
import re

class Vanilla_rnn:
    """
    Vanilla recurrent neural network for language modelling at word level
    Author: Manuel Plank @Manuel030
    loss function calculation as in https://gist.github.com/karpathy/d4dee566867f8291f086
    """
    
    
    def __init__(self, rate, units, seq_length, data):
        """define hyperparameters and data. Data should be string"""
        self.learning_rate = rate
        self.hidden_units = units
        self.seq_length = seq_length
        self.data = data
        self.feature_vec_size = 0
        
    def __preprocess(self):
        tokens = self.data.lower().split()
        pattern = re.compile(r'\w*\.') # identifies sentences separated by points
        data = [word[:-1] if bool(pattern.match(word)) else word for word in tokens] # words only

        # one-hot encode words
        vocabulary = list(set(data))
        self.feature_vec_size = len(vocabulary)
        voc_vec = pd.get_dummies(vocabulary).to_numpy()
        dic = dict(zip(vocabulary, voc_vec))
        # represent data as sequence of vectors
        xs = []
        for word in data:
            xs.append(dic[word])
        return xs
    
    def __feed_forward(self, xs, Wxh, Whh, Why, bh, by):
        
        hs, ys, ps = [], [], []
        loss = 0
        for t in range(len(xs)-1): # -1 because last word has no next word
            # initial hidden state
            if t == 0:
                h_init = np.zeros((self.hidden_units,1))
                hs.append(np.tanh(np.dot(Wxh, xs[t]) + np.dot(Whh, h_init) + bh))
            # hidden state > 0
            else:
                hs.append(np.tanh(np.dot(Wxh, xs[t]) + np.dot(Whh, hs[t-1]) + bh)) # previous hidden state influences actual hidden state
            ys.append(np.dot(Why, hs[t]) + by) # unnormalized log probabilities for next chars
            ps.append(np.exp(ys[t]) / np.sum(np.exp(ys[t]))) # probabilities for next chars
            loss += -np.log(ps[t][xs[t+1],0]) # softmax 
        return loss, hs, ys, ps
            
        
    def __backprop(self, xs, hs, ps, Wxh, Whh, Why, bh, by):
        dWxh, dWhh, dWhy = np.zeros_like(Wxh), np.zeros_like(Whh), np.zeros_like(Why)
        dbh, dby = np.zeros_like(bh), np.zeros_like(by)
        dhnext = np.zeros_like(hs[0])
        for t in reversed(range(len(xs)-1)):
            # compute how predicted scores should change to decrease loss: derieving gradient simplifies to:
            dy = np.array(ps[t], copy=True)
            dy[xs[t+1]] -= 1 # http://cs231n.github.io/neural-networks-case-study/#grad
            dWhy += np.dot(dy, hs[t].T)
            dby += dy
            dh = np.dot(Why.T, dy) + dhnext # backprop into h
            dhraw = (1 - hs[t] * hs[t]) * dh # backprop through tanh nonlinearity
            dbh += dhraw
            dWxh += np.dot(dhraw, xs[t].T)
            dWhh += np.dot(dhraw, hs[t-1].T)
            dhnext = np.dot(Whh.T, dhraw)
            
            for dparam in [dWxh, dWhh, dWhy, dbh, dby]:
                np.clip(dparam, -5, 5, out=dparam) # clip to mitigate exploding gradients

        return dWxh, dWhh, dWhy, dbh, dby
    
    def __param_update(self, )
        
    def train(self, echos):
        
        xs = self.__preprocess()
        
        # initially, set weight matrices randomly
        Wxh = np.random.randn(self.hidden_units, self.feature_vec_size) * 0.01
        Whh = np.random.randn(self.hidden_units, self.hidden_units) * 0.01
        Why = np.random.randn(self.feature_vec_size, self.hidden_units) * 0.01
        bh = np.zeros((self.hidden_units, 1)) # hidden bias
        by = np.zeros((self.feature_vec_size, 1)) # output bias
        
        loss, hs, ys, ps = self.__feed_forward(xs, Wxh, Whh, Why, bh, by)
        
        

In [7]:
help(np.clip)

Help on function clip in module numpy:

clip(a, a_min, a_max, out=None, **kwargs)
    Clip (limit) the values in an array.
    
    Given an interval, values outside the interval are clipped to
    the interval edges.  For example, if an interval of ``[0, 1]``
    is specified, values smaller than 0 become 0, and values larger
    than 1 become 1.
    
    Equivalent to but faster than ``np.maximum(a_min, np.minimum(a, a_max))``.
    No check is performed to ensure ``a_min < a_max``.
    
    Parameters
    ----------
    a : array_like
        Array containing elements to clip.
    a_min : scalar or array_like or `None`
        Minimum value. If `None`, clipping is not performed on lower
        interval edge. Not more than one of `a_min` and `a_max` may be
        `None`.
    a_max : scalar or array_like or `None`
        Maximum value. If `None`, clipping is not performed on upper
        interval edge. Not more than one of `a_min` and `a_max` may be
        `None`. If `a_min` or `a

In [29]:
sentence = 'Jane saw Marry. Marry saw James. James saw Doug.'
model = Vanilla_rnn(1e-1, 20, 3, sentence)
model.train(20)

[array([[0.0099956 , 0.01000111, 0.01000344, 0.00999573, 0.01000225,
         0.01000309, 0.00999874, 0.01000191, 0.00999946, 0.01000078,
         0.00999762, 0.00999755, 0.01000365, 0.00999956, 0.01000453,
         0.00999931, 0.01000181, 0.0099992 , 0.01000283, 0.01000259],
        [0.01001087, 0.00999615, 0.00998991, 0.01001052, 0.00999309,
         0.00999084, 0.01000246, 0.00999399, 0.01000053, 0.009997  ,
         0.01000545, 0.01000565, 0.00998935, 0.01000028, 0.00998701,
         0.01000095, 0.00999426, 0.01000124, 0.00999155, 0.00999219],
        [0.00999402, 0.01000162, 0.01000484, 0.00999421, 0.0100032 ,
         0.01000436, 0.00999836, 0.01000273, 0.00999935, 0.01000117,
         0.00999682, 0.00999671, 0.01000513, 0.00999948, 0.01000634,
         0.00999914, 0.01000259, 0.00999899, 0.01000399, 0.01000366],
        [0.00999261, 0.01000208, 0.0100061 , 0.00999284, 0.01000405,
         0.01000549, 0.00999801, 0.01000347, 0.00999925, 0.01000153,
         0.00999609, 0.00999596

In [33]:
help(np.copy)

Help on function copy in module numpy:

copy(a, order='K')
    Return an array copy of the given object.
    
    Parameters
    ----------
    a : array_like
        Input data.
    order : {'C', 'F', 'A', 'K'}, optional
        Controls the memory layout of the copy. 'C' means C-order,
        'F' means F-order, 'A' means 'F' if `a` is Fortran contiguous,
        'C' otherwise. 'K' means match the layout of `a` as closely
        as possible. (Note that this function and :meth:`ndarray.copy` are very
        similar, but have different default values for their order=
        arguments.)
    
    Returns
    -------
    arr : ndarray
        Array interpretation of `a`.
    
    Notes
    -----
    This is equivalent to:
    
    >>> np.array(a, copy=True)  #doctest: +SKIP
    
    Examples
    --------
    Create an array x, with a reference y and a copy z:
    
    >>> x = np.array([1, 2, 3])
    >>> y = x
    >>> z = np.copy(x)
    
    Note that, when we modify x, y changes, but n