## 1.2 Language Models
* Adapt your implementation of logistic regression from Assignment 2 to the implementation of a feedforward neural network (or create a new implementation from scratch). If you use an online implementation as a reference state that in your report and the modifications you have made.
* Your neural network should predict upcoming words from prior word context (see Section 7.5 of the reference book).

In [11]:
import numpy as np
from math import exp
from random import random


In [123]:
def cross_entropy(self, y, y_pred):
    return -y * math.log(y_pred) + (1 - y) * math.log(y_pred)

def forward_ReLU(x):
    return np.maximum(0, x)

def backward_ReLU(x):
    return max(np.ones(x.shape), 0)

def forward_sigmoid(x):
    return 1 / (1 + np.exp(-x))

# derivitive of sigmoid
def backward_sigmoid(x):
    return forward_sigmoid(x) * (1 - forward_sigmoid(x))

def forward_softmax(z):
    return [(exp(zi)/sum([exp(zj) for zj in z])) for zi in z]

In [139]:
class Embedding_Layer:
    """Fully Connected Layer without bias.
       input x weights -> output"""

    def __init__(self, n_input, n_output, seed=1):
        ran = np.random
        ran.seed(seed)
        self.weights = ran.random_sample((n_output, n_input))
        
    def feedforward(self, x):
        return np.matmul(self.weights, x.T)


In [140]:
class Linear_Layer:
    """Connected layer with bias.
       input x weights + bias -> output"""
    
    def __init__(self, n_input, n_output, seed=1):
        ran = np.random
        ran.seed(seed)
        self.weights= ran.random_sample((n_output, n_input))
        self.bias = ran.random_sample(n_output)
    
    def feedforward(self, x):
        print(x.shape, self.weights.shape)
        x_w = np.matmul(self.weights, x.T)
        x_w_b = x_w + self.bias
        return x_w_b


In [141]:
# 2 layer Neural Network
class Neural_Network:

    # initialize all necessary informatio
    def __init__(self, n_vocab, n_embeddings, learning_rate=0.1, n_prev_words=3):
        
        # information about the model
        self.embedding_layer = Embedding_Layer(n_vocab, n_embeddings)
        self.hidden_layer = Linear_Layer(n_embeddings * n_prev_words, n_embeddings)
        self.output_layer = Linear_Layer(n_embeddings, n_vocab)
        self.gradients= {}
        self.alpha = learning_rate
        
        # z-values representing the results of each layer
        # before activating them, generally means when
        # the input has benn mutiplied by its weights
        self.embedding_z = None
        self.hidden_z =  None
        self.output_z = None
        
        self.embedding_activation = None
        self.hidden_activation = None
        self.output_activation = None
        
        # information about the data
        # is updated when 
        self.vocab = None
        self.one_hot_vectors = None
        self.n_vocab = n_vocab
        self.n_prev_words = n_prev_words

    # Maybe use for prediction and encoding data
    def _create_vocab(self, x):
        new_vocab = []
        for word in x:
            if word not in new_vocab:
                new_vocab.append(x)
        self.vocab = new_vocab
        self.one_hot_vectors = [np.zeros(len(self.vocab)) for _ in range(len(self.vocab))]
        for i in range(len(self.vocab)):
            self.one_hot_vectors[i][i] = 1  

    # updates layers forward
    def feedforward(self, x):
        """Method from 7.5.1 in Speech and Language Processing.
           Step by step commented for easy comparison."""
        e = np.array([self.embedding_layer.feedforward(x[i, :]) for i in range(self.n_prev_words)])
        print("E shape: ", e.shape)
        # create embedding layer e
        self.embedding_z = np.concatenate(e, axis=0)
        print("E concat shape: ", self.embedding_z)
        self.embedding_activation = self.embedding_z
        # Multiply by W and pass through ReLU activation function
        self.hidden_z = self.hidden_layer.feedforward(self.embedding_activation)
        self.hidden_activation = forward_ReLU(self.hidden_z)
        
        # Multiply by U and apply softmax reshaping it into |Vocabulary| x 1
        self.output_z = self.output_layer.feedforward(self.hidden_activation)
        self.output_activation = forward_softmax(self.output_z).reshape(1, self.n_vocab)

    def backprop(self, y):
        # step by step what to dodo
        
        #Find the cross entropy loss of the output layer
        output_delta = cross_entropy(y, self.output_activation)
        
        # Reshape this to fit the hidden layer results
        self.output_loss_weights = np.matmul(output_delta, self.hidden_activation.reshape(1, -1))
        
        
        # finally, update the weights with the new loss gradients
        self._update_weights()
        
        
    def _gradient(self, x, y):
        y_pred = self.predict(x)
        loss = y_pred - y
        gradients = [x[i] * loss for i in range(len(x))]
        return gradients, loss
    
    def _update_weights(self, x, y):
        for i in range(self.n_prev_words):
            self.embedding_layer.weights -= self.alpha * self.embedding_loss_weights[i]
        self.hidden_layer.weights -= self.alpha * self.hidden_loss_weights
        self.hidden_layer.bias -= self.alpha * self.hidden_loss_bias
        self.output_layer.weights -= self.alpha * self.output_loss_weights
        self.output_layer.bias -= self.alpha * self.output_loss_bias

    def fit(self, x, n_epochs=1):
        #self._create_vocab(x)
        
        y_pred = self.feedforward(x)
            
        return self.output_activation
        
    def predict(self, x):
        # Use logistic regression
        print("TODO")


In [145]:
data = """I've seen what I become, and I cannot let that happen. And for this, you join him? Your destiny can
        change just as quickly as the love in one's heart can fade. Nothing is set in stone. But I will cause
        so much pain. If there is to be balance, what you have seen must be forgotten. If you're not with me,
        then you're my enemy. Only a Sith deals in absolutes. I will do what I must. You will try. It's over,
        Anakin! I have the high ground. You underestimate my power. Don't try it!"""

x = np.array([[[0, 1, 0, 0, 0, 0], [0, 0, 1, 0, 0, 0], [0, 0, 0, 0, 0, 1]],
                      [[0, 0, 0, 1, 0, 0], [0, 0, 0, 0, 0, 1], [0, 0, 0, 0, 0, 1]],
                      [[1, 0, 0, 0, 0, 0], [0, 0, 0, 0, 1, 0], [0, 0, 0, 0, 0, 1]],
                      [[0, 0, 0, 1, 0, 0], [0, 0, 0, 0, 0, 1], [0, 0, 0, 0, 0, 1]]])

model = Neural_Network(6, 2)
result = model.fit(x)
print(result)


E shape:  (3, 2, 3)
E concat shape:  [[7.20324493e-01 1.14374817e-04 9.23385948e-02]
 [3.45560727e-01 3.96767474e-01 6.85219500e-01]
 [3.02332573e-01 9.23385948e-02 9.23385948e-02]
 [5.38816734e-01 6.85219500e-01 6.85219500e-01]
 [4.17022005e-01 1.46755891e-01 9.23385948e-02]
 [1.86260211e-01 4.19194514e-01 6.85219500e-01]]
(6, 3) (2, 6)


ValueError: matmul: Input operand 1 has a mismatch in its core dimension 0, with gufunc signature (n?,k),(k,m?)->(n?,m?) (size 3 is different from 6)