## General setting (data)

In [61]:
# import
import os
import json
try: 
  import jsonlines
except ImportError:
  print("Installing the package, RESTART THIS CELL")
  !pip install jsonlines
try:
  from tqdm import tqdm
except ImportError:
  print("Installing the package, RESTART THIS CELL")
  !pip install tqdm

import shutil
import numpy as np
from collections import defaultdict

from google.colab import drive
drive.mount("/content/drive")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [0]:
# data
def load_wikitext(filename="wikitext2-sentencized.json"):
  if not os.path.exists(filename):
    !wget "https://nyu.box.com/shared/static/9kb7l7ci30hb6uahhbssjlq0kctr5ii4.json" -O $filename
  
  datasets=json.load((open(filename,'r')))
  for name in datasets:
    datasets[name]=[x.split() for x in datasets[name]]
  vocab=list(set([t for ts in datasets["train"] for t in ts]))
  print("Vocab size: %d"%(len(vocab)))
  return datasets,vocab

In [63]:
datasets,vocab=load_wikitext()

Vocab size: 33175


In [64]:
datasets["train"][0]

['Valkyria',
 'of',
 'the',
 'Battlefield',
 '3',
 ')',
 ',',
 'commonly',
 'referred',
 'to',
 'as',
 'Valkyria',
 'Chronicles',
 'III',
 'outside',
 'Japan',
 ',',
 'is',
 'a',
 'tactical',
 'role',
 '@-@',
 'playing',
 'video',
 'game',
 'developed',
 'by',
 'Sega',
 'and',
 'Media.']

In [0]:
def perplexity(model,sequences):
  n_total=0
  logp_total=0
  for sequence in sequences:
    logp_total+=model.sequence_logp(sequence)
    n_total+=len(sequence)+1
  ppl=2**(-(1.0/n_total)*logp_total)
  return ppl

## Addictive smoothing

In [0]:
class NGramAddictive():
  def __init__(self,n,delta,vocab):
    self.n=n # n-gram
    self.delta=delta # pseudo-count
    self.count=defaultdict(lambda: defaultdict(float))
    self.total=defaultdict(float)
    self.vsize=len(vocab)+1 # +1 is for <eos>, but <bos> will not appear in word 

  def estimate(self,sequences):
    for sequence in sequences:
      padded_sequence=["<bos>"]*(self.n-1)+sequence+["<eos>"]
      for i in range(len(padded_sequence)-self.n+1):
        ngram= tuple(padded_sequence[i:i+self.n])
        prefix,word=ngram[:-1],ngram[-1]
        self.count[prefix][word]+=1
        self.total[prefix]+=1
  def sequence_logp(self,sequence):
    padded_sequence=["<bos>"]*(self.n-1)+sequence+["<eos>"]
    total_logp=0
    for i in range(len(padded_sequence)-self.n+1):
      ngram=tuple(padded_sequence[i:i+n])
      prefix,word=ngram[:-1],ngram[-1]
      logp=(self.count[prefix][word]+self.delta)/(self.total[prefix]+self.vsize*self.delta)
      total_logp+=np.log2(logp)
    return total_logp

In [0]:
delta=0.0005
for n in [2,3,4]:
  lm=NGramAddictive(n,delta,vocab)
  lm.estimate(datasets["train"])
  print("Baseline (Additive smoothing, n=%d, delta=%.4f)) Train Perplexity: %3.f"% (n,delta,perplexity(lm,datasets["train"])))
  print("Baseline (Additive smoothing, n=%d, delta=%.4f)) Valid Perplexity: %.3f" % (n, delta, perplexity(lm, datasets['valid'])))

Baseline (Additive smoothing, n=2, delta=0.0005)) Train Perplexity:  90
Baseline (Additive smoothing, n=2, delta=0.0005)) Valid Perplexity: 525.825
Baseline (Additive smoothing, n=3, delta=0.0005)) Train Perplexity:  27
Baseline (Additive smoothing, n=3, delta=0.0005)) Valid Perplexity: 2577.128
Baseline (Additive smoothing, n=4, delta=0.0005)) Train Perplexity:  20
Baseline (Additive smoothing, n=4, delta=0.0005)) Valid Perplexity: 9570.901


## Interpolation

In [0]:
 class NGramInterpolation:
    def __init__(self,n,vocab,la):
     self.n=n
     self.vsize=len(vocab)+1
     self.la=la
     self.count1=defaultdict(float)
     self.count=defaultdict(lambda: defaultdict(lambda: defaultdict(float)))
     self.total1=0
     self.total=defaultdict(lambda: defaultdict(float))

    def estimate(self,sequences):
      for j in range(1,self.n+1):
        for sequence in sequences:
          padded_sequence=["<bos>"]*(j-1)+sequence+["<eos>"]
          for i in range(len(padded_sequence)-j+1):
            ngram=tuple(padded_sequence[i:i+j])
            #if j==2:
             # print(j,ngram,i,len(padded_sequence))
            prefix,word=ngram[:-1],ngram[-1]
            if j==1: 
              self.count1[word]+=1
              self.total1+=1
            else:
              self.count[j][tuple(prefix)][word]+=1
              self.total[j][tuple(prefix)]+=1
    def sequence_logp(self,sequence):
      total_logp=0
      padded_sequence=["<bos>"]*(self.n-1)+sequence+["<eos>"]
      for i in range(len(padded_sequence) - self.n+1):
            ngram = tuple(padded_sequence[i:i+self.n])
            total_logp += np.log2(self.ngram_prob(ngram))
      return total_logp
  
    def ngram_prob(self,ngram):
      # p=la1*p(word|prefix)+la2*p(word|prefix[1:])
      p=(1/self.vsize)*self.la[0]
      for i in range(1,self.n+1): # i=2
        prefix,word=ngram[-i:-1],ngram[-1] # prefix=ngram[2,3]
        if i==1:
          p+=self.la[i]*self.count1[word]/self.total1
        else:
          p+=self.la[i]*self.count[i][tuple(prefix)][word]/max(self.total[i][tuple(prefix)],1) # p+=self.la[2]*self.count[2][]
      return p

In [0]:
def show_results(l,n):
  for la in l:
    print("\nlambda values are: ",la)
    lm=NGramInterpolation(n,vocab,la)
    lm.estimate(datasets["train"])

    print("Baseline (Interpolation, n=%d)) Train Perplexity: %.3f" % (n,  perplexity(lm, datasets['train'])))
    print("Baseline (Interpolation, n=%d)) Valid Perplexity: %.3f" % (n,  perplexity(lm, datasets['valid'])))

In [0]:
la2=[[1/3,1/3,1/3],
     [1/6,1/3,1/2],
     [1/6,1/2,1/3]]
la3=[[1/4,1/4,1/4,1/4],
     [1/10,1/5,3/10,2/5],
     [1/10,1/5,4/10,3/10]]
la4=[[1/5,1/5,1/5,1/5,1/5],
     [1/15,2/15,3/15,4/15,5/15],
     [1/15,2/15,5/15,4/15,3/15]]
show_results(la2,2)
show_results(la3,3)
show_results(la4,4)


lambda values are:  [0.3333333333333333, 0.3333333333333333, 0.3333333333333333]
Baseline (Interpolation, n=2)) Train Perplexity: 179.054
Baseline (Interpolation, n=2)) Valid Perplexity: 395.603

lambda values are:  [0.16666666666666666, 0.3333333333333333, 0.5]
Baseline (Interpolation, n=2)) Train Perplexity: 129.816
Baseline (Interpolation, n=2)) Valid Perplexity: 326.710

lambda values are:  [0.16666666666666666, 0.5, 0.3333333333333333]
Baseline (Interpolation, n=2)) Train Perplexity: 170.081
Baseline (Interpolation, n=2)) Valid Perplexity: 368.773

lambda values are:  [0.25, 0.25, 0.25, 0.25]
Baseline (Interpolation, n=3)) Train Perplexity: 23.326
Baseline (Interpolation, n=3)) Valid Perplexity: 326.292

lambda values are:  [0.1, 0.2, 0.3, 0.4]
Baseline (Interpolation, n=3)) Train Perplexity: 15.778
Baseline (Interpolation, n=3)) Valid Perplexity: 306.898

lambda values are:  [0.1, 0.2, 0.4, 0.3]
Baseline (Interpolation, n=3)) Train Perplexity: 18.541
Baseline (Interpolation, n=3

## Kenlm

In [0]:
from collections import Counter

In [0]:
class NGramkenlm:
  def __init__(self,n,vocab):
    self.n=n
    self.n_k=[0]*6
    self.D=[0]*5
    self.count=defaultdict(lambda: defaultdict(lambda: defaultdict(float)))
    self.total=defaultdict(lambda: defaultdict(float))
    self.count1=defaultdict(float)
    self.total1=len(vocab)+1
    self.vsize=len(vocab)+1
  def estimate(self,sequences):
    for i in range(1,self.n+1):
      for sequence in sequences:
        padded_sequence=["<bos>"]*(i-1)+sequence+["<eos>"]
        for j in range(len(padded_sequence)-i+1):
          ngram=tuple(padded_sequence[j:i+j])
          prefix,word=ngram[:-1],ngram[-1]
          if i==1:
            self.count1[word]+=1
            self.total1+=1
          else:
            self.count[i][tuple(prefix)][word]+=1
            self.total[i][tuple(prefix)]+=1
    for t in range(1,5):
      for k,v in zip(self.total[self.n].keys(),self.total[self.n].values()):
        if int(v)==t:
          self.n_k[t]+=1
    self.Y=self.n_k[1]/(self.n_k[1]+2*self.n_k[2])
    for t in range(1,4):
      self.D[t]=t-self.Y*(t+1)*(self.n_k[t+1]/self.n_k[t]) 
    print(self.D)

  def sequence_logp(self,sequence):
    total_logp=0
    padded_sequence=["<bos>"]*(self.n-1)+sequence+["<eos>"]
    for i in range(len(padded_sequence)-self.n+1):
      ngram=tuple(padded_sequence[i:i+self.n])
      total_logp+=np.log2(self.ngram_p(ngram))
    return total_logp
  def ngram_p(self,ngram):
    for i in range(1,self.n+1):
      prefix,word=ngram[-i:-1],ngram[-1]
      if i==1:
        prob=self.count1[word]/self.total1
        if self.count1[word]==0:
          prob=1/(self.vsize)
      else:
        prob=self.update_p(i,prefix,word,prob)
    return prob

  def update_p(self,i,prefix,word,prob):
    c=self.count[i][tuple(prefix)][word]
    t=self.total[i][tuple(prefix)]
    N=[0]*4
    if c>0:
      prob=(c-self.D[int(c) if c<3 else 3])/max(t,1)
      return prob
    else:
      for k,v in zip(self.count[i][tuple(prefix)].keys(),self.count[i][tuple(prefix)].values()):
        N[int(v) if v<3 else 3]+=1
      sigma=(self.D[1]*N[1]+self.D[2]*N[2]+self.D[3]*N[3])/max(t,1)
      if sigma>0:
        prob=sigma*prob
      return prob

In [76]:
for n in [2,3,4]:
    lm2 = NGramkenlm(n,vocab)  # +1 is for <eos>
    lm2.estimate(datasets['train'])

    # print("The value of lambda: {}".format(l[n]))
    print("Baseline (Interpolation, n=%d)) Train Perplexity: %.3f" % (n,  perplexity(lm2, datasets['train'])))
    print("Baseline (Interpolation, n=%d)) Valid Perplexity: %.3f" % (n,  perplexity(lm2, datasets['valid'])))

[0, 0.10222222222222221, 0.41008100810080994, 2.7384814814814815, 0]
Baseline (Interpolation, n=2)) Train Perplexity: 105.160
Baseline (Interpolation, n=2)) Valid Perplexity: 346.055
[0, 0.7030687588555738, 1.1273862307842455, 1.4884132492931772, 0]
Baseline (Interpolation, n=3)) Train Perplexity: 18.874
Baseline (Interpolation, n=3)) Valid Perplexity: 311.165
[0, 0.8391219675767698, 1.2510035732353122, 1.4337382971822832, 0]
Baseline (Interpolation, n=4)) Train Perplexity: 11.262
Baseline (Interpolation, n=4)) Valid Perplexity: 341.428
