In [21]:
import numpy as np
import nltk
import re

nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\HP\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.


True

# Text Preprocessing

In [73]:
# Define the tokenized corpus
with open('./data/shakespeare.txt') as f:
    data = f.read()
data = re.sub(r'[,!?;-]', '.',data)
data = nltk.word_tokenize(data)
data = [ch.lower() for ch in data if ch.isalpha() or ch == '.']    #  Lower case and drop non-alphabetical tokens

print("Number of tokens:", len(data),'\n', data[:15])

Number of tokens: 60976 
 ['o', 'for', 'a', 'muse', 'of', 'fire', '.', 'that', 'would', 'ascend', 'the', 'brightest', 'heaven', 'of', 'invention']


In [74]:
# Compute frequency distribution of words
fdist = nltk.FreqDist(word for word in data)

print("Size of vocabulary: ",len(fdist) )
print("Most frequent tokens: ",fdist.most_common(20) ) # print the 20 most frequent words and their freq.

Size of vocabulary:  5775
Most frequent tokens:  [('.', 9630), ('the', 1521), ('and', 1394), ('i', 1257), ('to', 1159), ('of', 1093), ('my', 857), ('that', 781), ('in', 770), ('a', 752), ('you', 748), ('is', 630), ('not', 559), ('for', 467), ('it', 460), ('with', 441), ('his', 434), ('but', 417), ('me', 417), ('your', 397)]


In [76]:
# Get 'word2Ind' and 'Ind2word' dictionaries for the tokenized corpus
def get_dict(tokenized_corpus):
    word_set = set(tokenized_corpus)
    word2Ind = {}
    Ind2word = {}

    for i, word in enumerate(word_set):
        word2Ind[word] = i
        Ind2word[i] = word
    
    return word2Ind, Ind2word

word2Ind, Ind2word = get_dict(data)
V = len(word2Ind)
print("Size of vocabulary: ", V, "\n")

# example of word to index mapping
print("Index of the word 'king' :  ",word2Ind['king'] )
print("Word which has index 2743:  ",Ind2word[2743] )

Size of vocabulary:  5775 

Index of the word 'king' :   1516
Word which has index 2743:   bare


In [135]:
# Define the 'get_windows' function for context words
def get_windows(words, C):
    i = C
    while i < len(words) - C:
        center_word = words[i]
        context_words = words[(i - C):i] + words[(i+1):(i+C+1)]
        yield context_words, center_word
        i += 1

# Define the 'word_to_one_hot_vector' function - returns 1D array
def word_to_one_hot_vector(word, word2Ind, V):
    one_hot_vector = np.zeros(V)
    one_hot_vector[word2Ind[word]] = 1
    return one_hot_vector

# Define the 'context_words_to_vector' function - returns 1D array
def context_words_to_vector(context_words, word2Ind, V):
    context_words_vector = [word_to_one_hot_vector(w, word2Ind, V) for w in context_words]
    context_words_vector = np.mean(context_words_vector, axis=0)
    return context_words_vector
    
# Define the generator function 'get_batches' - returns 2D array
def get_batches(words, C, word2Ind, V, batch_size):
    context_words_vectors = np.zeros((V, batch_size))
    one_hot_vectors = np.zeros((V, batch_size))

    size = 0
    for context_words, center_word in get_windows(words, C):
        context_words_vectors[:, size] = context_words_to_vector(context_words, word2Ind, V)
        one_hot_vectors[:, size] = word_to_one_hot_vector(center_word, word2Ind, V)
        
        size += 1
        if size == batch_size:
            size = 0
            yield context_words_vectors, one_hot_vectors

# Initialization of Weights & Biases

In [144]:
def initialize_model(N,V, random_seed=1):
    '''
    Inputs: 
        N:  dimension of hidden vector 
        V:  dimension of vocabulary
        random_seed: random seed for consistent results in the unit tests
    Outputs: 
        W1, W2, b1, b2: initialized weights and biases
    '''
    if random_seed!=None: np.random.seed(random_seed)

    # W1 has shape (N,V)
    W1 = np.random.rand(N,V)
    
    # W2 has shape (V,N)
    W2 = np.random.rand(V,N)
    
    # b1 has shape (N,1)
    b1 = np.random.rand(N,1)
    
    # b2 has shape (V,1)
    b2 = np.random.rand(V,1)
    
    return W1, W2, b1, b2

# Activation Functions

In [81]:
def relu(z1):
    h = np.maximum(z1, 0)
    return h

def softmax(z2):
    e = np.exp(z2)
    yhat = e / np.sum(e, axis=0)
    return yhat

# Forward Propagation

In [79]:
def forward_prop(x, W1, W2, b1, b2):
    '''
    Inputs: 
        x:  average one hot vector for the context 
        W1, W2, b1, b2:  matrices and biases to be learned
    Outputs:
        h
        yhat: final prediction
    '''
    # Calculate h
    z1 = np.matmul(W1,x) + b1
    h = relu(z1)

    # Calculate yhat with h
    z2 = np.matmul(W2,h) + b2
    yhat = softmax(z2)

    return yhat, h

# Cross-Entropy Loss

In [83]:
def compute_cost(y, yhat, batch_size):
    cost = - 1/batch_size * np.sum(y * np.log(yhat))
    cost = np.squeeze(cost)
    return cost

# Backward Propagation

In [87]:
def backward_prop(x, yhat, y, h, W1, W2, b1, b2, batch_size):
    '''
    Inputs: 
        x:  average one hot vector for the context 
        yhat: prediction (estimate of y)
        y:  target vector
        h:  hidden vector (see eq. 1)
        W1, W2, b1, b2:  matrices and biases  
        batch_size: batch size 
    Outputs: 
        grad_W1, grad_W2, grad_b1, grad_b2:  gradients of matrices and biases   
    '''

    yhat, h = forward_prop(x, W1, W2, b1, b2)
    
    # Compute l1 as ReLU(W2^T (Yhat - Y))
    l1 = relu(np.matmul(W2.T, yhat - y))
    
    # compute the gradient for W1
    grad_W1 = np.matmul(l1, x.T) / batch_size

    # Compute gradient of W2
    grad_W2 = np.matmul(yhat-y, h.T) / batch_size
    
    # compute gradient for b1
    grad_b1 = np.matmul(l1, np.ones((batch_size,1))) / batch_size

    # compute gradient for b2
    grad_b2 = np.matmul(yhat-y, np.ones((batch_size,1))) / batch_size
    
    return grad_W1, grad_W2, grad_b1, grad_b2

# Gradient Descent

In [150]:
def gradient_descent(data, word2Ind, N, num_iters, C=2, alpha=0.03, batch_size=128,
                    V=len(word2Ind), random_seed=None, initialize_model=initialize_model, 
                    get_batches=get_batches, forward_prop=forward_prop, 
                    compute_cost=compute_cost, backward_prop=backward_prop):

    '''
    Inputs: 
        data:      text
        word2Ind:  words to Indices
        N:         dimension of hidden vector  
        num_iters: number of iterations
        V:         dimension of vocabulary 
        random_seed: random seed to initialize the model's matrices and vectors
        initialize_model: your implementation of the function to initialize the model
        get_batches: function to get the data in batches
        forward_prop: your implementation of the function to perform forward propagation
        compute_cost: cost function (Cross entropy)
        back_prop: your implementation of the function to perform backward propagation
    Outputs: 
        W1, W2, b1, b2:  updated matrices and biases after num_iters iterations

    '''
    W1, W2, b1, b2 = initialize_model(N,V, random_seed=random_seed) # W1=(N,V) W2=(V,N)

    # To keep track of which iteration we're in
    iters = 0
    
    for x, y in get_batches(data, C, word2Ind, V, batch_size):
        # get yhat and h via forward_prop
        yhat, h = forward_prop(x, W1, W2, b1, b2)
        
        # get cost
        cost = compute_cost(y, yhat, batch_size)
        if ( (iters+1) % 10 == 0):
            print(f"iters: {iters + 1} cost: {cost:.6f}")
            
        # get gradients via backward_prop
        grad_W1, grad_W2, grad_b1, grad_b2 = backward_prop(x, yhat, y, h, W1, W2, b1, b2, batch_size)
        
        # update weights and biases
        W1 = W1 - alpha*grad_W1
        W2 = W2 - alpha*grad_W2
        b1 = b1 - alpha*grad_b1
        b2 = b2 - alpha*grad_b2

        iters +=1 
        if iters == num_iters: 
            break
        if iters % 100 == 0:
            alpha *= 0.66
            
    return W1, W2, b1, b2

In [152]:
print("Call gradient_descent")
W1, W2, b1, b2 = gradient_descent(data, word2Ind, N=50, num_iters=150) 
# Changing num_iters will change how long the function takes to run

Call gradient_descent
iters: 10 cost: 9.519696
iters: 20 cost: 9.425947
iters: 30 cost: 9.323007
iters: 40 cost: 9.285315
iters: 50 cost: 9.157654
iters: 60 cost: 8.687861
iters: 70 cost: 8.602355
iters: 80 cost: 8.309799
iters: 90 cost: 8.355451
iters: 100 cost: 8.125669
iters: 110 cost: 8.315308
iters: 120 cost: 8.014888
iters: 130 cost: 7.956922
iters: 140 cost: 8.250067
iters: 150 cost: 8.212340
