## 1.2 Language Models
* Adapt your implementation of logistic regression from Assignment 2 to the implementation of a feedforward neural network (or create a new implementation from scratch). If you use an online implementation as a reference state that in your report and the modifications you have made.
* Your neural network should predict upcoming words from prior word context (see Section 7.5 of the reference book).

## Imports and installation

To use this notebook properly you need to Python, as well as numpy and tqdm.
To install numpy you can simply "pip install numpy", the same goes for tqdm with "pip install tqdm"

tqdm has not been used to perform any calculations, just enclosed the for loops that are training the models to easily track progress. If you do not wish to use tqdm but still want to run the code, then simply remove the function from the batch_training() function.

In [24]:
import numpy as np
from math import exp
from random import random
from tqdm import tqdm

In [104]:
def forward_ReLU(x):
    # found on stack overflow
    return np.maximum(0, x)

def backward_ReLU(x):
    # found on stack overflow
    return (x > 0)

def forward_softmax(x):
    max = np.max(x, axis=0, keepdims=True)
    exp = np.exp(x - max)
    sum = np.sum(e_x, axis=0, keepdims=True)
    return exp / sum

In [26]:
class Embedding_Layer:
    """Fully Connected Layer without bias.
       input x weights -> output"""

    def __init__(self, n_input, n_output, seed=1):
        ran = np.random
        ran.seed(seed)
        self.weights = ran.random_sample((n_output, n_input))
        
    def feedforward(self, x):
        return np.matmul(self.weights, x.T)


In [105]:
class Fully_Connected_Layer:
    """Fully Connected Layer with bias.
       input x weights + bias -> output"""
    
    def __init__(self, n_input, n_output, seed=1):
        ran = np.random
        ran.seed(seed)
        self.weights= ran.random_sample((n_output, n_input))
        self.bias = ran.random_sample(n_output)
    
    def feedforward(self, x):
        x_w = np.matmul(self.weights, x.T)
        x_w_b = x_w + self.bias
        return x_w_b


In [107]:
class Neural_Network:
    """Neural Network with 3 layers, embedding, hidden and output"""

    # initialize all necessary informatio
    def __init__(self, n_vocab, n_embeddings, learning_rate=0.01, n_prev_words=3):
        
        # information about the model
        self.embedding_layer = Embedding_Layer(n_vocab, n_embeddings)
        self.hidden_layer = Fully_Connected_Layer(n_embeddings * n_prev_words, n_embeddings)
        self.output_layer = Fully_Connected_Layer(n_embeddings, n_vocab)
        self.alpha = learning_rate
        
        # z-values representing the results of each layer
        # before activating them, generally means when
        # the input has benn mutiplied by its weights
        self.embedding_z = None
        self.hidden_z =  None
        self.output_z = None
        
        # activation of each layer
        self.embedding_activation = None
        self.hidden_activation = None
        self.output_activation = None
        
        # information about the data
        self.n_vocab = n_vocab
        self.n_embeddings = n_embeddings
        self.n_prev_words = n_prev_words

    # updates layers forward
    def _feedforward(self, x):
        """Method from 7.5.1 in Speech and Language Processing.
           Step by step commented for easy comparison."""
        
        # create embedding matrix
        e = np.array([self.embedding_layer.feedforward(x[i]) for i in range(self.n_prev_words)])
        
        # create embedding layer e
        self.embedding_z = np.concatenate(e, axis=0)
        self.embedding_activation = self.embedding_z
        # Multiply by W and pass through ReLU activation function
        self.hidden_z = self.hidden_layer.feedforward(self.embedding_activation)
        self.hidden_activation = forward_ReLU(self.hidden_z)
        
        # Multiply by U and apply softmax reshaping it into |Vocabulary| x 1
        self.output_z = self.output_layer.feedforward(self.hidden_activation)
        self.output_activation = forward_softmax(self.output_z).reshape(1, self.n_vocab)

    def _backprop(self, x, y):
        
        # Find the bias of the output layer
        # derivative of cross entropy: y_pred - y
        self.dL_db_output = (self.output_activation - y) * self.output_activation
        self.dL_dw_output = np.matmul(self.dL_db_output.T, self.hidden_activation.reshape(1, -1))

        output_sum = np.sum(np.matmul(self.dL_db_output, self.output_layer.weights), axis=1, keepdims=True)
        self.hidden_bReLU = backward_ReLU(self.hidden_z)
        self.dL_db_hidden = np.matmul(output_sum.reshape(-1, 1), self.hidden_bReLU.reshape(1, -1))
        self.dL_dw_hidden = np.matmul(self.dL_db_hidden.T, self.embedding_activation.reshape(1, -1))
        
        hidden_sum = np.sum(np.matmul(self.dL_db_hidden, self.hidden_layer.weights), axis=1, keepdims=True)
        delta = np.matmul(hidden_sum.reshape(-1, 1), self.embedding_z.reshape(1, -1))
        q = int(delta.shape[1] / self.n_prev_words)
        self.dL_db_embedding = np.array([delta[:, i: i+q] for i in range(self.n_prev_words)]).reshape(self.n_prev_words, self.n_embeddings)
        self.dL_dw_embedding = np.array([np.matmul(self.dL_db_embedding[i].reshape(-1, 1), x[i].reshape(1, -1)) for i in range(self.n_prev_words)])
        
        # finally, update the weights with the new loss gradients
        self._update_weights()
    
    def _update_weights(self):
        # clip embeddings to avoid overflow due to normalizing not working
        
        # update all the embedding weights per previous word
        for i in range(self.n_prev_words):
            self.embedding_layer.weights -= self.alpha * np.clip(self.dL_dw_embedding[i], -1, 1)
            
        # update hidden layer weights and bias
        self.hidden_layer.weights -= self.alpha * np.clip(self.dL_dw_hidden, -1, 1)
        self.hidden_layer.bias -= self.alpha * np.clip(self.dL_db_hidden.flatten(), -1, 1)
        
        # update output layer weights and bias
        self.output_layer.weights -= self.alpha * np.clip(self.dL_dw_output, -1, 1)
        self.output_layer.bias -= self.alpha * np.clip(self.dL_db_output.flatten(), -1, 1)

    def fit(self, x, y, n_epochs=1):
        self._feedforward(x)
        self._backprop(x, y)
            
        return self.output_activation
        
    def predict(self, x):
        self._feedforward(x)
        return self.output_layer_activation


In [71]:
with open("metamorphosis.txt",'r',encoding='utf-8') as file:
    raw_metamorphosis = file.read()
    
replacements = ['\n', '\t', '\r', ';', ':', '.', ',', '"', "!", '?', '-', "'", '  ']

stripped_metamorphosis = raw_metamorphosis

for rep in replacements:
    stripped_metamorphosis = stripped_metamorphosis.replace(rep, ' ')

stripped_metamorphosis = stripped_metamorphosis.lower().split()

metamorphosis_vocab = list(set(stripped_metamorphosis))
meta_vocab_dic = {}
one_hots = {}
for i, word in enumerate(metamorphosis_vocab):
    one_hot_vector = np.zeros(len(metamorphosis_vocab))
    one_hot_vector[i] = 1
    meta_vocab_dic[word] = one_hot_vector

print(len(stripped_metamorphosis))
print(len(meta_vocab_dic))

21935
3787


In [72]:
with open("frankenstein.txt",'r',encoding='utf-8') as file:
    raw_frankenstein = file.read()
    
replacements = ['\n', '\t', '\r', ';', ':', '.', ',', '"', '  ']

stripped_frankenstein = raw_frankenstein

for rep in replacements:
    stripped_frankenstein = stripped_frankenstein.replace(rep, ' ')

stripped_frankenstein = stripped_frankenstein.lower().split()

frankenstein_vocab = list(set(stripped_frankenstein))
frank_vocab_dic = {}
one_hots = {}
for i, word in enumerate(frankenstein_vocab):
    one_hot_vector = np.zeros(len(frankenstein_vocab))
    one_hot_vector[i] = 1
    frank_vocab_dic[word] = one_hot_vector

print(len(stripped_frankenstein))
print(len(frank_vocab_dic))

75258
7731


In [73]:
def batch_training(model, dataset, vocab, batch_size=5000, n_prev_words=3):
    """Method that trains a model, its dataet and vocabulary in batches to
    avoid memory overflow and give feedback during the training"""
    n_vocab = len(vocab)
    n_dataset = len(dataset) - n_prev_words
    n_batches = n_dataset // batch_size
    
    for batch in range(n_batches):
        batch_len = batch_size - n_prev_words
        data = np.zeros((batch_len, n_prev_words, n_vocab), dtype='float16')
        labels = np.zeros((batch_len, n_vocab), dtype='int16')
        for i in range(batch_size - n_prev_words):
            x = np.zeros((n_prev_words, n_vocab))
            for j in range(n_prev_words):
                x[j] = vocab[dataset[(batch*batch_size):(batch*batch_size)+batch_size][i+j]]
            data[i] = x
            labels[i] = vocab[dataset[(batch*batch_size):(batch*batch_size)+batch_size][i+n_prev_words]]

        for x, y in tqdm(zip(data, labels)):
            result = model.fit(x, y)
        print(result)

In [75]:
metamorphosis_model = Neural_Network(len(meta_vocab), 64)
batch_training(metamorphosis_model, stripped_metamorphosis, meta_vocab_dic)

frankenstein_model = Neural_Network(len(frank_vocab), 64)
batch_training(frankenstein_model, stripped_frankenstein, frank_vocab_dic, batch_size=7000)


4997it [00:41, 121.56it/s]


[[3.71671400e-04 5.27986198e-04 2.19107774e-04 ... 7.90705511e-05
  2.25356894e-04 1.84983837e-04]]


4997it [00:40, 122.44it/s]


[[0.00016004 0.00037332 0.00025176 ... 0.00026948 0.00016239 0.00016003]]


4997it [00:40, 122.97it/s]


[[0.00016004 0.00037332 0.00025176 ... 0.00026948 0.00016239 0.00016003]]


4997it [00:40, 123.06it/s]


[[0.00018657 0.00018021 0.00023026 ... 0.00039038 0.00016607 0.00015376]]


4997it [01:22, 60.32it/s]


[[1.33815302e-04 1.51621135e-04 1.28574313e-04 ... 1.19572633e-04
  7.29615087e-05 1.39132604e-04]]


4997it [01:20, 62.00it/s]


[[1.46393285e-04 1.77424015e-04 1.12181759e-04 ... 8.16094752e-05
  8.91887916e-05 1.27934529e-04]]


4997it [01:22, 60.76it/s]


[[1.33965349e-04 2.25824097e-04 7.14055886e-05 ... 1.65125650e-04
  4.82831056e-05 2.09951696e-04]]


4997it [01:22, 60.65it/s]


[[1.13306035e-04 1.69470343e-04 1.37428630e-04 ... 9.39223239e-05
  8.39238739e-05 1.26451152e-04]]


4997it [01:25, 58.52it/s]


[[2.45335323e-04 1.81998995e-04 1.14288060e-04 ... 1.34716901e-04
  9.17479814e-05 1.51615043e-04]]


4997it [01:22, 60.22it/s]


[[1.74418380e-04 2.91943328e-04 1.95299861e-04 ... 1.02458548e-04
  9.44232257e-05 1.53354545e-04]]


4997it [01:26, 57.92it/s]


[[8.12610816e-05 1.94709924e-04 1.07509014e-04 ... 1.16132528e-04
  1.19979862e-04 9.79643081e-05]]


4997it [01:22, 60.58it/s]


[[1.34284751e-04 1.86593882e-04 1.22449251e-04 ... 8.94645863e-05
  9.21814890e-05 1.33461874e-04]]


4997it [02:21, 35.33it/s]


[[1.24531923e-04 2.08649219e-04 1.20693881e-04 ... 1.03644450e-04
  7.38578216e-05 1.26833850e-04]]


4997it [02:29, 33.52it/s]


[[1.35284453e-04 1.74703729e-04 1.24480507e-04 ... 9.51422783e-05
  8.90350539e-05 1.40367589e-04]]


4997it [02:30, 33.29it/s]


[[1.35861156e-04 1.83353042e-04 1.21952736e-04 ... 9.31779847e-05
  9.46728620e-05 1.37208885e-04]]


4997it [01:37, 51.48it/s]


[[1.35284270e-04 1.74703818e-04 1.24480369e-04 ... 9.51428480e-05
  8.90350206e-05 1.40367393e-04]]


4997it [01:22, 60.93it/s]


[[1.33216161e-04 1.52835155e-04 1.01937941e-04 ... 8.10761386e-05
  1.03796997e-04 1.21894438e-04]]


4997it [01:18, 63.91it/s]


[[1.18605101e-04 1.70497374e-04 1.10207821e-04 ... 1.14392056e-04
  9.16403199e-05 1.65899668e-04]]


4997it [01:20, 62.05it/s]

[[1.35284167e-04 1.74703925e-04 1.24480159e-04 ... 9.51430482e-05
  8.90350481e-05 1.40367649e-04]]





## Store the embeddings in TSV files
This is done so we get files that we can use in the Embedding Projector

In [102]:
out_metamorphosis_v = open('vectors_m.tsv', 'w', encoding='utf-8')
out_metamorphosis_m = open('metadata_m.tsv', 'w', encoding='utf-8')

for index, word in enumerate(metamorphosis_vocab):
    # get the index of all the dimensions to form a single vector
    out_metamorphosis_v.write('\t'.join([str(x) for x in metamorphosis_model.embedding_layer.weights[:, index]]) + "\n")
    out_metamorphosis_m.write(word + "\n")
out_metamorphosis_v.close()
out_metamorphosis_m.close()

(64, 3787)
3787 3787


In [103]:
out_frankenstein_v = open('vectors_f.tsv', 'w', encoding='utf-8')
out_frankenstein_m = open('metadata_f.tsv', 'w', encoding='utf-8')

for index, word in enumerate(frankenstein_vocab):
    out_frankenstein_v.write('\t'.join([str(x) for x in frankenstein_model.embedding_layer.weights[:, index]]) + "\n")
    out_frankenstein_m.write(word + "\n")
out_frankenstein_v.close()
out_frankenstein_m.close()