## 1.2 Language Models
* Adapt your implementation of logistic regression from Assignment 2 to the implementation of a feedforward neural network (or create a new implementation from scratch). If you use an online implementation as a reference state that in your report and the modifications you have made.
* Your neural network should predict upcoming words from prior word context (see Section 7.5 of the reference book).

In [19]:
import numpy as np
from math import exp
from random import random


In [20]:
def forward_ReLU(x):
    return np.maximum(0, x)

def backward_ReLU(x):
    # found on stack overflow
    return (x > 0) * 1

def forward_softmax(z):
    return np.array([(exp(zi)/sum([exp(zj) for zj in z])) for zi in z])

In [21]:
class Embedding_Layer:
    """Fully Connected Layer without bias.
       input x weights -> output"""

    def __init__(self, n_input, n_output, seed=1):
        ran = np.random
        ran.seed(seed)
        self.weights = ran.random_sample((n_output, n_input))
        
    def feedforward(self, x):
        return np.matmul(self.weights, x.T)


In [22]:
class Fully_Connected_Layer_Bias:
    """Fully Connected Layer with bias.
       input x weights + bias -> output"""
    
    def __init__(self, n_input, n_output, seed=1):
        ran = np.random
        ran.seed(seed)
        self.weights= ran.random_sample((n_output, n_input))
        self.bias = ran.random_sample(n_output)
    
    def feedforward(self, x):
        print(x.shape, self.weights.shape)
        x_w = np.matmul(self.weights, x.T)
        x_w_b = x_w + self.bias
        return x_w_b


In [54]:
# 2 layer Neural Network
class Neural_Network:

    # initialize all necessary informatio
    def __init__(self, n_vocab, n_embeddings, learning_rate=0.1, n_prev_words=3):
        
        # information about the model
        self.embedding_layer = Embedding_Layer(n_vocab, n_embeddings)
        self.hidden_layer = Fully_Connected_Layer_Bias(n_embeddings * n_prev_words, n_embeddings)
        self.output_layer = Fully_Connected_Layer_Bias(n_embeddings, n_vocab)
        self.gradients= {}
        self.alpha = learning_rate
        
        # z-values representing the results of each layer
        # before activating them, generally means when
        # the input has benn mutiplied by its weights
        self.embedding_z = None
        self.hidden_z =  None
        self.output_z = None
        
        self.embedding_activation = None
        self.hidden_activation = None
        self.output_activation = None
        
        # information about the data
        # is updated when 
        self.vocab = None
        self.one_hot_vectors = None
        self.n_vocab = n_vocab
        self.n_embeddings = n_embeddings
        self.n_prev_words = n_prev_words

    # Maybe use for prediction and encoding data
    def _create_vocab(self, x):
        new_vocab = []
        for word in x:
            if word not in new_vocab:
                new_vocab.append(x)
        self.vocab = new_vocab
        self.one_hot_vectors = [np.zeros(len(self.vocab)) for _ in range(len(self.vocab))]
        for i in range(len(self.vocab)):
            self.one_hot_vectors[i][i] = 1  

    # updates layers forward
    def _feedforward(self, x):
        
        self.x = x
        """Method from 7.5.1 in Speech and Language Processing.
           Step by step commented for easy comparison."""
        e = np.array([self.embedding_layer.feedforward(x[i]) for i in range(self.n_prev_words)])
        # create embedding layer e
        self.embedding_z = np.concatenate(e, axis=0)
        self.embedding_activation = self.embedding_z
        # Multiply by W and pass through ReLU activation function
        self.hidden_z = self.hidden_layer.feedforward(self.embedding_activation)
        self.hidden_activation = forward_ReLU(self.hidden_z)
        
        # Multiply by U and apply softmax reshaping it into |Vocabulary| x 1
        self.output_z = self.output_layer.feedforward(self.hidden_activation)
        self.output_activation = forward_softmax(self.output_z).reshape(1, self.n_vocab)

    def _backprop(self, y):
        # step by step what to dodo
        
        # Find the bias of the output layer
        # derivative of cross entropy: y_pred - y
        self.dL_db_output = (self.output_activation - y) * self.output_activation
        self.dL_dw_output = np.matmul(self.dL_db_output.T, self.hidden_activation.reshape(1, -1))

        output_sum = np.sum(np.matmul(self.dL_db_output, self.output_layer.weights), axis=1, keepdims=True)
        self.hidden_bReLU = backward_ReLU(self.hidden_z)
        self.dL_db_hidden = np.matmul(output_sum.reshape(-1, 1), self.hidden_bReLU.reshape(1, -1))
        self.dL_dw_hidden = np.matmul(self.dL_db_hidden.T, self.embedding_activation.reshape(1, -1))
        
        hidden_sum = np.sum(np.matmul(self.dL_db_hidden, self.hidden_layer.weights), axis=1, keepdims=True)
        delta = np.matmul(hidden_sum.reshape(-1, 1), self.embedding_z.reshape(1, -1))
        q = int(delta.shape[1] / self.n_prev_words)
        self.dL_db_embedding = np.array([delta[:, i: i+q] for i in range(self.n_prev_words)]).reshape(self.n_prev_words, self.n_embeddings)
        self.dL_dw_embedding = np.array([np.matmul(self.dL_db_embedding.reshape(-1, 1), self.x[i].reshape(1, -1)) for i in range(self.n_prev_words)])
        
        # finally, update the weights with the new loss gradients
        self._update_weights()
    
    def _update_weights(self):
        # update all the embedding weights
        for i in range(self.n_prev_words):
            self.embedding_layer.weights -= self.alpha * self.dL_dw_embedding[i]
        # update hidden layer
        self.hidden_layer.weights -= self.alpha * self.dL_dw_hidden
        self.hidden_layer.bias -= self.alpha * self.dL_db_hidden
        # update output layer
        self.output_layer.weights -= self.alpha * self.dL_dw_output
        self.output_layer.bias -= self.alpha * self.dL_db_output

    def fit(self, x, y, n_epochs=1):
        for x, y in zip(x, y):
            self._feedforward(x)
            y_one_hot = np.zeros(self.n_vocab)
            y_one_hot[y] = 1
            self._backprop(y_one_hot)
            
        return self.output_activation
        
    def predict(self, x):
        # Use logistic regression
        print("TODO")


In [55]:
data = """I've seen what I become, and I cannot let that happen. And for this, you join him? Your destiny can
        change just as quickly as the love in one's heart can fade. Nothing is set in stone. But I will cause
        so much pain. If there is to be balance, what you have seen must be forgotten. If you're not with me,
        then you're my enemy. Only a Sith deals in absolutes. I will do what I must. You will try. It's over,
        Anakin! I have the high ground. You underestimate my power. Don't try it!"""

x = np.array([[[0, 1, 0, 0, 0, 0], [0, 0, 1, 0, 0, 0], [0, 0, 0, 0, 0, 1]],
                      [[0, 0, 0, 1, 0, 0], [0, 0, 0, 0, 0, 1], [0, 0, 0, 0, 0, 1]],
                      [[1, 0, 0, 0, 0, 0], [0, 0, 0, 0, 1, 0], [0, 0, 0, 0, 0, 1]],
                      [[0, 0, 0, 1, 0, 0], [0, 0, 0, 0, 0, 1], [0, 0, 0, 0, 0, 1]]])

y = [0, 1, 2, 3, 4, 5]

model = Neural_Network(6, 2)
result = model.fit(x, y)
print(result)


(6,) (2, 6)
(2,) (6, 2)


ValueError: operands could not be broadcast together with shapes (2,6) (6,6) (2,6) 