## 1.2 Language Models
* Adapt your implementation of logistic regression from Assignment 2 to the implementation of a feedforward neural network (or create a new implementation from scratch). If you use an online implementation as a reference state that in your report and the modifications you have made.
* Your neural network should predict upcoming words from prior word context (see Section 7.5 of the reference book).

## Imports and installation

To use this notebook properly you need to Python, as well as numpy and tqdm.
To install numpy you can simply "pip install numpy", the same goes for tqdm with "pip install tqdm"

tqdm has not been used to perform any calculations, just enclosed the for loops that are training the models to easily track progress. If you do not wish to use tqdm but still want to run the code, then simply remove the function from the batch_training() function.

In [17]:
import numpy as np
from math import exp
from random import random
from tqdm import tqdm

In [23]:
class Neural_Network:
    """Neural Network with 3 layers, embedding, hidden and output"""
    
    class _Embedding_Layer:
        """Fully Connected Layer without bias.
           input x weights -> output"""

        def __init__(self, n_input, n_output, seed=1):
            ran = np.random
            ran.seed(seed)
            self.weights = ran.random_sample((n_output, n_input))

        def feedforward(self, x):
            return np.matmul(self.weights, x.T)
        
    class _Fully_Connected_Layer:
        """Fully Connected Layer with bias.
           input x weights + bias -> output"""

        def __init__(self, n_input, n_output, seed=1):
            ran = np.random
            ran.seed(seed)
            self.weights= ran.random_sample((n_output, n_input))
            self.bias = ran.random_sample(n_output)

        def feedforward(self, x):
            x_w = np.matmul(self.weights, x.T)
            x_w_b = x_w + self.bias
            return x_w_b
    
    # methods used to calculate the activation layers
    
    def _forward_ReLU(self, x):
        # found on stack overflow
        return np.maximum(0, x)

    def _backward_ReLU(self, x):
        # found on stack overflow
        return (x > 0)

    def _forward_softmax(self, x):
        maxx = np.max(x, axis=0, keepdims=True)
        exp = np.exp(x - maxx)
        summ = np.sum(exp, axis=0, keepdims=True)
        return (exp / summ)

    # initialize all necessary information
    def __init__(self, n_vocab, n_embeddings, learning_rate=0.01, n_prev_words=3):
        
        # information about the model
        self.embedding_layer = self._Embedding_Layer(n_vocab, n_embeddings)
        self.hidden_layer = self._Fully_Connected_Layer(n_embeddings * n_prev_words, n_embeddings)
        self.output_layer = self._Fully_Connected_Layer(n_embeddings, n_vocab)
        self.alpha = learning_rate
        
        # z-values representing the results of each layer
        # before activating them, generally means when
        # the input has benn mutiplied by its weights
        self.embedding_z = None
        self.hidden_z =  None
        self.output_z = None
        
        # activation of each layer
        self.embedding_activation = None
        self.hidden_activation = None
        self.output_activation = None
        
        # information about the data
        self.n_vocab = n_vocab
        self.n_embeddings = n_embeddings
        self.n_prev_words = n_prev_words

    # updates layers forward
    def _feedforward(self, x):
        """Method from 7.5.1 in Speech and Language Processing.
           Step by step commented for easy comparison."""
        
        # create embedding matrix
        e = np.array([self.embedding_layer.feedforward(x[i]) for i in range(self.n_prev_words)])
        
        # create embedding layer e
        self.embedding_z = np.concatenate(e, axis=0)
        self.embedding_activation = self.embedding_z
        # Multiply by W and pass through ReLU activation function
        self.hidden_z = self.hidden_layer.feedforward(self.embedding_activation)
        self.hidden_activation = self._forward_ReLU(self.hidden_z)
        
        # Multiply by U and apply softmax reshaping it into |Vocabulary| x 1
        self.output_z = self.output_layer.feedforward(self.hidden_activation)
        self.output_activation = self._forward_softmax(self.output_z).reshape(1, self.n_vocab)

    def _backprop(self, x, y):
        
        # Find the bias of the output layer
        # derivative of cross entropy: y_pred - y
        self.dL_db_output = (self.output_activation - y) * self.output_activation
        self.dL_dw_output = np.matmul(self.dL_db_output.T, self.hidden_activation.reshape(1, -1))

        # use the sum of output layer bias and weights and backwards relu to find hidden layer bias
        output_sum = np.sum(np.matmul(self.dL_db_output, self.output_layer.weights), axis=1, keepdims=True)
        self.hidden_bReLU = self._backward_ReLU(self.hidden_z)
        self.dL_db_hidden = np.matmul(output_sum.reshape(-1, 1), self.hidden_bReLU.reshape(1, -1))
        self.dL_dw_hidden = np.matmul(self.dL_db_hidden.T, self.embedding_activation.reshape(1, -1))
        
        # use the hidden layer weights and bias to calculate the weights of the embeddings layer
        hidden_sum = np.sum(np.matmul(self.dL_db_hidden, self.hidden_layer.weights), axis=1, keepdims=True)
        delta = np.matmul(hidden_sum.reshape(-1, 1), self.embedding_z.reshape(1, -1))
        q = int(delta.shape[1] / self.n_prev_words)
        self.dL_db_embedding = np.array([delta[:, i: i+q] for i in range(self.n_prev_words)]).reshape(self.n_prev_words, self.n_embeddings)
        self.dL_dw_embedding = np.array([np.matmul(self.dL_db_embedding[i].reshape(-1, 1), x[i].reshape(1, -1)) for i in range(self.n_prev_words)])
        
        # finally, update the weights with the new loss gradients
        self._update_weights()
    
    def _update_weights(self):
        # clip embeddings to avoid overflow due to normalizing not working
        
        # update all the embedding weights per previous word
        for i in range(self.n_prev_words):
            self.embedding_layer.weights -= self.alpha * np.clip(self.dL_dw_embedding[i], -1, 1)
            
        # update hidden layer weights and bias
        self.hidden_layer.weights -= self.alpha * np.clip(self.dL_dw_hidden, -1, 1)
        self.hidden_layer.bias -= self.alpha * np.clip(self.dL_db_hidden.flatten(), -1, 1)
        
        # update output layer weights and bias
        self.output_layer.weights -= self.alpha * np.clip(self.dL_dw_output, -1, 1)
        self.output_layer.bias -= self.alpha * np.clip(self.dL_db_output.flatten(), -1, 1)

    def fit(self, x, y, n_epochs=1):
        self._feedforward(x)
        self._backprop(x, y)
            
        return self.output_activation
        
    def predict(self, x):
        self._feedforward(x)
        return self.output_layer_activation


In [24]:
with open("metamorphosis.txt",'r',encoding='utf-8') as file:
    raw_metamorphosis = file.read()
    
replacements = ['\n', '\t', '\r', ';', ':', '.', ',', '"', "!", '?', '-', "'", '  ']

stripped_metamorphosis = raw_metamorphosis

for rep in replacements:
    stripped_metamorphosis = stripped_metamorphosis.replace(rep, ' ')

stripped_metamorphosis = stripped_metamorphosis.lower().split()

metamorphosis_vocab = list(set(stripped_metamorphosis))
meta_vocab_dic = {}
one_hots = {}
for i, word in enumerate(metamorphosis_vocab):
    one_hot_vector = np.zeros(len(metamorphosis_vocab))
    one_hot_vector[i] = 1
    meta_vocab_dic[word] = one_hot_vector

print(len(stripped_metamorphosis))
print(len(meta_vocab_dic))

22041
2805


## Create datasets and vocab dictionaries
As done in Word2Vec

In [25]:
with open("frankenstein.txt",'r',encoding='utf-8') as file:
    raw_frankenstein = file.read()
    
replacements = ['\n', '\t', '\r', ';', ':', '.', ',', '"', "!", '?', '-', "'", '  ']

stripped_frankenstein = raw_frankenstein

for rep in replacements:
    stripped_frankenstein = stripped_frankenstein.replace(rep, ' ')

stripped_frankenstein = stripped_frankenstein.lower().split()

frankenstein_vocab = list(set(stripped_frankenstein))
frank_vocab_dic = {}
one_hots = {}
for i, word in enumerate(frankenstein_vocab):
    one_hot_vector = np.zeros(len(frankenstein_vocab))
    one_hot_vector[i] = 1
    frank_vocab_dic[word] = one_hot_vector

print(len(stripped_frankenstein))
print(len(frank_vocab_dic))

75453
7360


## Train the data

In [26]:
def batch_training(model, dataset, vocab, batch_size=5000, n_prev_words=3):
    """Method that trains a model, its dataet and vocabulary in batches to
    avoid memory overflow and give feedback during the training"""
    n_vocab = len(vocab)
    n_dataset = len(dataset) - n_prev_words
    n_batches = n_dataset // batch_size
    
    for batch in range(n_batches):
        batch_len = batch_size - n_prev_words
        data = np.zeros((batch_len, n_prev_words, n_vocab), dtype='float16')
        labels = np.zeros((batch_len, n_vocab), dtype='int16')
        for i in range(batch_size - n_prev_words):
            x = np.zeros((n_prev_words, n_vocab))
            for j in range(n_prev_words):
                x[j] = vocab[dataset[(batch*batch_size):(batch*batch_size)+batch_size][i+j]]
            data[i] = x
            labels[i] = vocab[dataset[(batch*batch_size):(batch*batch_size)+batch_size][i+n_prev_words]]

        for x, y in tqdm(zip(data, labels)):
            result = model.fit(x, y)
        print(result)

In [27]:
metamorphosis_model = Neural_Network(len(meta_vocab_dic), 64)
batch_training(metamorphosis_model, stripped_metamorphosis, meta_vocab_dic)

frankenstein_model = Neural_Network(len(frank_vocab_dic), 64)
batch_training(frankenstein_model, stripped_frankenstein, frank_vocab_dic, batch_size=7000)


4997it [00:33, 149.10it/s]


[[0.00048461 0.00019245 0.00038576 ... 0.00031336 0.0002925  0.00039608]]


4997it [00:34, 145.13it/s]


[[0.00050185 0.00024333 0.00043264 ... 0.00023531 0.00036686 0.00045951]]


4997it [00:37, 134.55it/s]


[[0.00052805 0.00026207 0.00037034 ... 0.00022189 0.00039551 0.0004552 ]]


4997it [00:32, 155.47it/s]


[[0.00054611 0.00023173 0.00043494 ... 0.00023997 0.00034586 0.0004149 ]]


6997it [01:57, 59.59it/s]


[[9.51868655e-05 1.01136450e-04 7.73743662e-05 ... 1.45184293e-04
  1.36807658e-04 1.44104869e-04]]


6997it [01:58, 58.82it/s]


[[4.71578858e-05 1.71733248e-04 1.04465026e-04 ... 3.51690689e-04
  4.08450699e-05 1.05157456e-04]]


6997it [02:00, 57.93it/s]


[[1.13476230e-04 1.18323857e-04 9.53388106e-05 ... 1.76735250e-04
  1.28956548e-04 1.33306876e-04]]


6997it [01:59, 58.34it/s]


[[1.01195393e-04 7.96782763e-05 6.91902425e-05 ... 2.32026968e-04
  1.30209744e-04 1.39806600e-04]]


6997it [01:58, 59.19it/s]


[[1.10953858e-04 1.17285311e-04 9.59584451e-05 ... 1.73749217e-04
  1.29756092e-04 1.31393169e-04]]


6997it [02:01, 57.65it/s]


[[1.28951884e-04 1.20730306e-04 8.14509153e-05 ... 1.82884433e-04
  1.31674030e-04 1.28150496e-04]]


6997it [01:57, 59.36it/s]


[[0.00014938 0.0001984  0.00010205 ... 0.00016767 0.00011821 0.00011059]]


6997it [02:02, 56.91it/s]


[[0.00011006 0.00011831 0.00010282 ... 0.00016098 0.0001375  0.00012448]]


6997it [01:52, 62.36it/s]


[[1.13912930e-04 1.21513082e-04 9.00459325e-05 ... 1.63825036e-04
  1.22396469e-04 1.26048449e-04]]


6997it [02:22, 48.99it/s]

[[1.09850807e-04 1.18779548e-04 9.68671124e-05 ... 1.75219342e-04
  1.30749421e-04 1.32834846e-04]]





## Store the embeddings in TSV files
This is done so we get files that we can use in the Embedding Projector

In [28]:
out_metamorphosis_v = open('vectors_m.tsv', 'w', encoding='utf-8')
out_metamorphosis_m = open('metadata_m.tsv', 'w', encoding='utf-8')

for index, word in enumerate(metamorphosis_vocab):
    # get the index of all the dimensions to form a single vector
    out_metamorphosis_v.write('\t'.join([str(x) for x in metamorphosis_model.embedding_layer.weights[:, index]]) + "\n")
    out_metamorphosis_m.write(word + "\n")
out_metamorphosis_v.close()
out_metamorphosis_m.close()

In [29]:
out_frankenstein_v = open('vectors_f.tsv', 'w', encoding='utf-8')
out_frankenstein_m = open('metadata_f.tsv', 'w', encoding='utf-8')

for index, word in enumerate(frankenstein_vocab):
    out_frankenstein_v.write('\t'.join([str(x) for x in frankenstein_model.embedding_layer.weights[:, index]]) + "\n")
    out_frankenstein_m.write(word + "\n")
out_frankenstein_v.close()
out_frankenstein_m.close()