
## The implementation of Ngram Language Model with Logistic Regression


In [1]:
from google.colab import drive
drive.mount('/content/gdrive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3Aietf%3Awg%3Aoauth%3A2.0%3Aoob&scope=email%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdocs.test%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive.photos.readonly%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fpeopleapi.readonly&response_type=code

Enter your authorization code:
··········
Mounted at /content/gdrive


In [6]:
!ls /content/gdrive/Shared\ drives/CISC6210NLPFall19/ExampleCode/bigram_LR/*.py

'/content/gdrive/Shared drives/CISC6210NLPFall19/ExampleCode/bigram_LR/brown.py'
'/content/gdrive/Shared drives/CISC6210NLPFall19/ExampleCode/bigram_LR/LoR_bigram.py'


In [0]:
import sys
sys.path.append('/content/gdrive/Shared drives/CISC6210NLPFall19/ExampleCode/bigram_LR')

In [9]:
import numpy as np
import matplotlib.pyplot as plt
import random
from datetime import datetime

import brown

NLTK Downloader
---------------------------------------------------------------------------
    d) Download   l) List    u) Update   c) Config   h) Help   q) Quit
---------------------------------------------------------------------------
Downloader> d

Download which package (l=list; x=cancel)?
  Identifier> brown
    Downloading package brown to /root/nltk_data...
      Unzipping corpora/brown.zip.

---------------------------------------------------------------------------
    d) Download   l) List    u) Update   c) Config   h) Help   q) Quit
---------------------------------------------------------------------------
Downloader> q


In [0]:
def softmax(a):
    a = a-a.max() # to avoid numerical overflow
    exp_a = np.exp(a)
    return exp_a/exp_a.sum(axis=1, keepdims=True)

In [0]:
#plot a smoothed losses line to reduce variability
def smoothed_loss(x, decay=0.99):
    y = np.zeros(len(x))
    last=0
    for t in range(len(x)):
      z = decay*last+ (1-decay)*x[t]
      y[t]= z/(1-decay**(t+1))
      last = z
    return y
        

In [0]:
 def main():
    
    #training set: 2000 words, 2000 sentences.
    sentences, word2idx = brown.get_sentences_with_word2idx_limit_vocab(2000,2000)
    #for testing
    #sentences, word2idx = brown.get_sentences_with_word2idx_limit_vocab(10,10)
    V = len(word2idx)
    print(f"word total: {V}")
    start_idx= word2idx['START']
    end_idx = word2idx['END']
    print(f'Start index={start_idx} and End index = {end_idx}')
    
   
    #train a logistic model
    
    for lr in [0.1, 0.05, 0.02, 0.01, 0.001]:
      
      W = np.random.randn(V,V)/np.sqrt(V) 
      #initial random values to W of shape V x V
      #print(f'W\n{W}')
    
      losses=[]
      epochs = 10
      #lr = 1e-2
      print(f"Learing Rate is {lr}:")
      t0=datetime.now()
      for epoch in range(epochs):
        print(f"In iteration NO.{epoch}")
        #suffle sentences each epoch
        random.shuffle(sentences) 
        
        j=0 #sentence counter
        for sentence in sentences:
            #convert sentence into one-hot coded inputs and targets
            sentence=[start_idx]+sentence+[end_idx] #pad with start and end tag
            #print(sentence)
            n = len(sentence)
            #print(f"Length of sentence {n}")
            
            # for each sentence of length n, there are n-1 bigrams
            inputs = np.zeros((n-1,V))
            # all first words of bigrams in the sentence
            targets = np.zeros((n-1,V))
            # all second words of bigrams in the sentence
            inputs[np.arange(n-1), sentence[:n-1]]=1
            #the sentence itself, ignoring the end index, shape n-1 x V
            targets[np.arange(n-1), sentence[1:]]=1
            #the next word of the target, shape n-1 x V
            # one-hot encoding of word vectors         
            #print (f'inputs:\n{inputs.shape}')
            #print(f'targets:\n{targets.shape}')
            
            #get output prediction
            #since we are using one-hot encoding, bias term is ignored.
            #p(y|x)
            predictions = softmax(inputs.dot(W)) #shape n-1 x V
            #print(f"Shape of predictions after softmax {predictions.shape}")#one for each word in the sentence
            #print(f"predictions:\n{predictions}")
            #do a gradient descent step
            #we perform a Mini-batch Gradient Descent algorithm
            d = inputs.T.dot(predictions-targets)
            W = W - lr*d 
            
            #keep track of the loss - cross entropy cost function, average loss for each sample            
            loss = -np.sum(targets*np.log(predictions))/(n-1)#array multiplication
            losses.append(loss)
            
            
                                   
            if j%500==0:
                print(f"epoch: {epoch}, sentence: {j}/{len(sentences)}, loss: {loss}")
            
            j+=1
            
        
        print(f"Elapsed time training: {datetime.now()-t0}")
      plt.plot(losses)       
      plt.plot(smoothed_loss(losses))
      plt.show()

In [0]:
if __name__ == '__main__':
    main()

Finish reading brown sentences
finished all sentences and build index
word total: 2001
Start index=0 and End index = 1
Learing Rate is 0.1:
In iteration NO.0
epoch: 0, sentence: 0/2000, loss: 7.599008997760336
epoch: 0, sentence: 500/2000, loss: 6.171957576435965
epoch: 0, sentence: 1000/2000, loss: 4.892173366817531
epoch: 0, sentence: 1500/2000, loss: 7.794433794352203
Elapsed time training: 0:01:50.730097
In iteration NO.1
epoch: 1, sentence: 0/2000, loss: 6.025639437386274
epoch: 1, sentence: 500/2000, loss: 5.628708206556994
epoch: 1, sentence: 1000/2000, loss: 6.155335745686917
epoch: 1, sentence: 1500/2000, loss: 4.025352857706961
Elapsed time training: 0:03:39.501018
In iteration NO.2
epoch: 2, sentence: 0/2000, loss: 5.357859139694355
epoch: 2, sentence: 500/2000, loss: 4.952859658221779
epoch: 2, sentence: 1000/2000, loss: 4.253707545673519
epoch: 2, sentence: 1500/2000, loss: 5.265011888659899
Elapsed time training: 0:05:27.210212
In iteration NO.3
epoch: 3, sentence: 0/2000