# Language Modelling Lab (week 3)
This notebook provides a partial solution for week 3 of ANLE/P.


## 1 Getting Started

We need to get the names of files in the training directory and split them into training and testing 50:50.

In [1]:
import os,random,math
TRAINING_DIR="lab3resources/sentence-completion/Holmes_Training_Data"  #this needs to be the parent directory for the training corpus

def get_training_testing(training_dir=TRAINING_DIR,split=0.5):

    filenames=os.listdir(training_dir)
    n=len(filenames)
    print("There are {} files in the training directory: {}".format(n,training_dir))
    random.seed(35)  #if you want the same random split every time
    random.shuffle(filenames)
    index=int(n*split)
    return(filenames[:index],filenames[index:])

trainingfiles,heldoutfiles=get_training_testing()


There are 522 files in the training directory: lab3resources/sentence-completion/Holmes_Training_Data


In [2]:
len(trainingfiles)

261

In [3]:
trainingfiles[0]

'NDRTH10.TXT'

## 2  Building a unigram model

In [4]:
from nltk import word_tokenize as tokenize
import operator

class language_model():
    
    def __init__(self,trainingdir=TRAINING_DIR,files=[]):
        #store the names of the files containing training data and run the training method
        self.training_dir=trainingdir
        self.files=files
        
        self.train()
        
    def train(self):
        #initialise an empty dictionary which will be the unigram model {w:P(w)} when training is complete
        self.unigram={}
        #process all of the training data, accumulating counts of events
        self._processfiles()
        #convert the accumulated counts to probabilities
        print("Finalising probability distribution")
        self._convert_to_probs()
        
    def _processline(self,line):
        #process each line of a file
        #each line is tokenized and has a special start and end token added
        #counts of tokens are added to the self.unigram count model
        tokens=["__START"]+tokenize(line)+["__END"]
        for token in tokens:
            self.unigram[token]=self.unigram.get(token,0)+1
    
    
    def _processfiles(self):
        #process each file in turn
        for afile in self.files:
            print("Processing {}".format(afile))
            with open(os.path.join(self.training_dir,afile),errors='ignore') as instream:
                for i,line in enumerate(instream):
                    line=line.rstrip()
                    if len(line)>0:
                        self._processline(line)
      
            
    def _convert_to_probs(self):
        #self.unigram initially counts counts for each token {token:freq(token)}
        #sum all of the frequencies and divide each frequency by that sum to get probabilities
        
        total=sum(self.unigram.values())
        self.unigram={k:v/total for (k,v) in self.unigram.items()}
       
    def get_prob(self,token,method="unigram"):
        #simple look up method
        if method=="unigram":
            return self.unigram.get(token,0)
        else:
            print("Not implemented: {}".format(method))
            return 0
    

    
        
       

In [5]:
MAX_FILES=5
mylm=language_model(files=trainingfiles[:MAX_FILES])

Processing NDRTH10.TXT
Processing 3BOAT10.TXT
Processing DLANG10.TXT
Processing MRAMN10.TXT
Processing BLIXN10.TXT


In [None]:
mylm.get_prob("the",method="unigram")

In [None]:
mylm.get_prob("and")

In [None]:
mylm.get_prob("woman")

In [None]:
mylm.get_prob("car")

In [None]:
mylm.get_prob("dfgwrh")

In [None]:
mylm.get_prob("london")

### Generation

We can sort the probability distribution in order from high to low and look at the most likely unigrames

In [6]:
# this is some code to sort the probability distribution into order from high to low

mostlikely=sorted(list(mylm.unigram.items()),key=operator.itemgetter(1),reverse=True)

In [7]:
mostlikely[0]

('__START', 0.06813903104605357)

In [8]:
mostlikely[:10]

[('__START', 0.06813903104605357),
 ('__END', 0.06813903104605357),
 (',', 0.053578745249897565),
 ('the', 0.04002806543960708),
 ('.', 0.03598428549622),
 ('and', 0.025853411638052236),
 ('of', 0.01984665772214679),
 ('to', 0.01832287574347974),
 ('a', 0.01753554375450239),
 ('I', 0.011767131835260155)]

Given the k most likely unigrams, we can choose one at random using random.choice(mostlikely[:k]).  See code below

In [21]:
from nltk import word_tokenize as tokenize
import operator

class language_model():
    
    def __init__(self,trainingdir=TRAINING_DIR,files=[]):
        self.training_dir=trainingdir
        self.files=files
        
        self.train()
        
    def train(self):
        self.unigram={}         
        self._processfiles()
        print("Finalising probability distribution")

        self._convert_to_probs()
        
    def _processline(self,line):
        tokens=["__START"]+tokenize(line)+["__END"]
        for token in tokens:
            self.unigram[token]=self.unigram.get(token,0)+1
    
    
    def _processfiles(self):
        for afile in self.files:
            print("Processing {}".format(afile))
            with open(os.path.join(self.training_dir,afile),errors='ignore') as instream:
                for line in instream:
                    line=line.rstrip()
                    if len(line)>0:
                        self._processline(line)
      
            
    def _convert_to_probs(self):
        
        self.unigram={k:v/sum(self.unigram.values()) for (k,v) in self.unigram.items()}
       
    def get_prob(self,token,method="unigram"):
        if method=="unigram":
            return self.unigram.get(token,0)
        else:
            print("Not implemented: {}".format(method))
            return 0
    
    def nextlikely(self,k=1):
        #use probabilities according to method to generate a likely next sequence
        #choose random token from k best
        specialtokens=["__START"]
           
       
        #sort the tokens by unigram probability
        mostlikely=sorted(list(self.unigram.items()),key=operator.itemgetter(1),reverse=True)
        #filter out any undesirable tokens
        filtered=[w for (w,p) in mostlikely if w not in specialtokens]
        #choose one randomly from the top k
        res=random.choice(filtered[:k])
        return res
    
    def generate(self,k=1,end="__END",limit=20):
        #a very simplistic way of generating likely tokens according to the model
        
        current="__START"
        tokens=[]
        while current!=end and len(tokens)<limit:
            current=self.nextlikely(k=k)
            tokens.append(current)
        return " ".join(tokens[:-1])

In [22]:
MAX_FILES=5
mylm=language_model(files=trainingfiles[:MAX_FILES])

Processing NDRTH10.TXT
Processing 3BOAT10.TXT
Processing DLANG10.TXT
Processing MRAMN10.TXT
Processing BLIXN10.TXT
Finalising probability distribution


In [23]:
mylm.nextlikely(10)

','

In [24]:
mylm.generate()

''

In [25]:
mylm.generate(k=10)

'the'

In [26]:
mylm.generate(k=5)

'the . the'

In [27]:
mylm.generate(k=5)

'the ,'

In [28]:
mylm.generate(k=500)

'those sledge I seen I line wish ten looked and King THE `` little wind What country we each'

Of course, this isn't very satisfactory as it will only ever generate a token which is in the top k words in the probability distribution.  And all of this words are sampled / generated with a uniform probability

This is the more sophisticated version where we sample according to the unigram probability distribution.

random.choices() takes 2 arguments - the list of items and the list of associated probabilities.  Try out the code below, varying the distribution in argument 2

In [32]:
for i in range(10):
    print(random.choices(["the","a"],[0.9,0.1]))

['the']
['the']
['the']
['the']
['the']
['the']
['the']
['the']
['the']
['the']


In [33]:
# split the distribution into a list of words and a list of probabilities
mostlikely=sorted(list(mylm.unigram.items()),key=operator.itemgetter(1),reverse=True)
words,probs=zip(*mostlikely)

In [34]:
probs[:10]

(0.06813903104605357,
 0.06813903104605357,
 0.053578745249897565,
 0.04002806543960708,
 0.03598428549622,
 0.025853411638052236,
 0.01984665772214679,
 0.01832287574347974,
 0.01753554375450239,
 0.011767131835260155)

In [35]:
#sample from the list of words according to the probability distribution in probs (i.e., words with higher probabilties are more likely to be picked)

random.choices(words,probs)[0]

','

In [36]:
from nltk import word_tokenize as tokenize
import operator

class language_model():
    
    def __init__(self,trainingdir=TRAINING_DIR,files=[]):
        self.training_dir=trainingdir
        self.files=files
        
        self.train()
        
    def train(self):
        self.unigram={}         
        self._processfiles()
        print("Finalising probability distribution")

        self._convert_to_probs()
        
    def _processline(self,line):
        tokens=["__START"]+tokenize(line)+["__END"]
        for token in tokens:
            self.unigram[token]=self.unigram.get(token,0)+1
    
    
    def _processfiles(self):
        for afile in self.files:
            print("Processing {}".format(afile))
            
            with open(os.path.join(self.training_dir,afile),errors='ignore') as instream:
                for line in instream:
                    line=line.rstrip()
                    if len(line)>0:
                        self._processline(line)
            
      
            
    def _convert_to_probs(self):
        
        self.unigram={k:v/sum(self.unigram.values()) for (k,v) in self.unigram.items()}
       
    def get_prob(self,token,method="unigram"):
        if method=="unigram":
            return self.unigram.get(token,0)
        else:
            print("Not implemented: {}".format(method))
            return 0
    
    def nextlikely(self):
        #choose token randomly according to probability
        specialtokens=["__START"]
        
        #sort the tokens by unigram probability
        #mostlikely=sorted(list(self.unigram.items()),key=operator.itemgetter(1),reverse=True)
        mostlikely=list(self.unigram.items())
        #filter out any undesirable tokens
        filtered=[(w,p) for (w,p) in mostlikely if w not in specialtokens]
        words,probdist=zip(*filtered)
        
        res=random.choices(words,probdist)[0]  #choose one randomly according to prob

        return res
    
    def generate(self,k=1,end="__END",limit=20):
        #a very simplistic way of generating likely tokens according to the model
        
        current="__START"
        tokens=[]
        while current!=end and len(tokens)<limit:
            current=self.nextlikely()
            tokens.append(current)
        return " ".join(tokens[:-1])

In [37]:
MAX_FILES=5
mylm=language_model(files=trainingfiles[:MAX_FILES])

Processing NDRTH10.TXT
Processing 3BOAT10.TXT
Processing DLANG10.TXT
Processing MRAMN10.TXT
Processing BLIXN10.TXT
Finalising probability distribution


In [38]:
mylm.nextlikely()

'Hi'

In [40]:
mylm.generate()

"I be the . and with fruit long aunt he that own handkerchief or , never the ' to"

## Adding bigrams


In [41]:
class language_model():
    
    def __init__(self,trainingdir=TRAINING_DIR,files=[]):
        self.training_dir=trainingdir
        self.files=files
        self.unigram={}
        self.bigram={}
        self.train()
        
    def train(self):
        self._processfiles()
        print("Finalising probability distribution")

        self._convert_to_probs()
        
    
    def _processline(self,line):
        tokens=["__START"]+tokenize(line)+["__END"]
        previous="__END"
        for token in tokens:
            self.unigram[token]=self.unigram.get(token,0)+1
            ####
            current=self.bigram.get(previous,{})
            current[token]=current.get(token,0)+1
            self.bigram[previous]=current
            ###
            #self.bigram[previous][token]=self.bigram.get(previous,{}).get(token,0)+1
            ###
            previous=token
            
    
    def _processfiles(self):
        for afile in self.files:
            print("Processing {}".format(afile))
            with open(os.path.join(self.training_dir,afile),errors='ignore') as instream:
                for line in instream:
                    line=line.rstrip()
                    if len(line)>0:
                        self._processline(line)
           
      
            
    def _convert_to_probs(self):
        
        self.unigram={k:v/sum(self.unigram.values()) for (k,v) in self.unigram.items()}
        self.bigram={key:{k:v/sum(adict.values()) for (k,v) in adict.items()} for (key,adict) in self.bigram.items()}
    
        
    def get_prob(self,token,context=[],method="unigram"):
        if method=="unigram":
            return self.unigram.get(token,0)
        elif method=="bigram":
            return self.bigram.get(context[-1],{}).get(token,0)
    
    
    def nextlikely(self,k=1,current="",method="unigram"):
        #use probabilities according to method to generate a likely next sequence
        #choose random token from k best
        blacklist=["__START"]
       
        if method=="unigram":
            dist=self.unigram
        else:
            dist=self.bigram.get(current,{})
            
        mostlikely=list(dist.items())
        #filter out any undesirable tokens
        filtered=[(w,p) for (w,p) in mostlikely if w not in blacklist]
        words,probdist=zip(*filtered)
        res=random.choices(words,probdist)[0]
    
        return res
    
    def generate(self,end="__END",limit=20,method="bigram"):
        current="__START"
        tokens=[]
        while current!=end and len(tokens)<limit:
            current=self.nextlikely(current=current,method=method)
            tokens.append(current)
        return " ".join(tokens[:-1])
    
        

In [42]:
MAX_FILES=5
mylm=language_model(files=trainingfiles[:MAX_FILES])

Processing NDRTH10.TXT
Processing 3BOAT10.TXT
Processing DLANG10.TXT
Processing MRAMN10.TXT
Processing BLIXN10.TXT
Finalising probability distribution


In [43]:
mylm.bigram

{'__END': {'__START': 1.0},
 '__START': {'*': 0.00255462977519258,
  '#': 0.00011790598962427291,
  'Copyright': 0.00015720798616569723,
  'the': 0.03167740921238799,
  'Please': 0.00039301996541424305,
  'We': 0.0071529633705392235,
  'electronic': 0.00019650998270712153,
  'Information': 0.00039301996541424305,
  'further': 0.000510925955038516,
  'The': 0.01623172457160824,
  'Tales': 7.860399308284861e-05,
  'by': 0.0016899858512812451,
  'November': 7.860399308284861e-05,
  'Corrected': 0.00019650998270712153,
  'VERSIONS': 0.00019650998270712153,
  'of': 0.012576638893255778,
  'midnight': 0.00027511397578997015,
  'Midnight': 0.00019650998270712153,
  'preliminary': 0.00019650998270712153,
  'and': 0.021773306083949066,
  'up': 0.002318817795944034,
  'in': 0.006445527432793586,
  'a': 0.008253419273699105,
  'look': 0.0006681339412042132,
  'new': 0.000510925955038516,
  'fifty': 0.00023581197924854583,
  'to': 0.010454331080018866,
  'searched': 0.00023581197924854583,
  'proj

In [44]:
mylm.get_prob("man",context=["an"],method="bigram")

0

In [45]:
mylm.generate()

'CONDUCT OF'

In [46]:
mylm.nextlikely(current="__START",method="bigram")

'have'

In [47]:
mylm.generate(method="unigram")

'and Langeais , leaves not million Labonga had conception glimpses said astonished of and would the evening help all'

In [48]:
mylm.generate(method="bigram")

'of that THEY would lend themselves . The fury of the dainty with an'

In [49]:
#tokens=["__START"]+tokenize("The cat sat on the mat")+["__END"]

In [50]:
#tokens

['__START', 'The', 'cat', 'sat', 'on', 'the', 'mat', '__END']

In [None]:
#for i, token in enumerate(tokens[1:]):
#    print(i,token,tokens[:i+1])

###  4 Perplexity

In [65]:
class language_model():
    
    def __init__(self,trainingdir=TRAINING_DIR,files=[]):
        self.training_dir=trainingdir
        self.files=files
        
        self.train()
        
    def train(self):
        self.unigram={}
        self.bigram={}
        self._processfiles()
        print("Finalising probability distribution")

        self._convert_to_probs()
        
    
    def _processline(self,line):
        tokens=["__START"]+tokenize(line)+["__END"]
        previous="__END"
        for token in tokens:
            self.unigram[token]=self.unigram.get(token,0)+1
            current=self.bigram.get(previous,{})
            current[token]=current.get(token,0)+1
            self.bigram[previous]=current
            previous=token
            
    
    def _processfiles(self):
        for afile in self.files:
            print("Processing {}".format(afile))
            
            with open(os.path.join(self.training_dir,afile),errors='ignore') as instream:
                for line in instream:
                    line=line.rstrip()
                    if len(line)>0:
                    
                        self._processline(line)
           
      
            
    def _convert_to_probs(self):
        
        self.unigram={k:v/sum(self.unigram.values()) for (k,v) in self.unigram.items()}
        self.bigram={key:{k:v/sum(adict.values()) for (k,v) in adict.items()} for (key,adict) in self.bigram.items()}
    
        
    def get_prob(self,token,context=[],method="unigram"):
        if method=="unigram":
            return self.unigram.get(token,0)
        elif method=="bigram":
            return self.bigram.get(context[-1],{}).get(token,0)
    
    
    def nextlikely(self,current="",method="unigram"):
        #use probabilities according to method to generate a likely next sequence
        #choose random token from k best
        specialtokens=["__START"]
       
        if method=="unigram":
            dist=self.unigram
        else:
            dist=self.bigram.get(current,{})
    
        mostlikely=list(dist.items())
        #filter out any undesirable tokens
        filtered=[(w,p) for (w,p) in mostlikely if w not in specialtokens]
        #choose one randomly from the top k
        words,probdist=zip(*filtered)
        res=random.choices(words,probdist)[0]
        return res
    
    def generate(self,end="__END",limit=20,method="bigram"):
        current="__START"
        tokens=[]
        while current!=end and len(tokens)<limit:
            current=self.nextlikely(current=current,method=method)
            tokens.append(current)
        return " ".join(tokens[:-1])
    
    
    def compute_prob_line(self,line,method="unigram"):
        #this will add _start to the beginning of a line of text
        #compute the probability of the line according to the desired model
        #and returns log probability together with number of tokens
    
        tokens=["__START"]+tokenize(line)+["__END"]
        acc=0
        for i,token in enumerate(tokens[1:]):
            acc+=math.log(self.get_prob(token,tokens[:i+1],method))
        return acc,len(tokens[1:])
       
    
    def compute_probability(self,filenames=[],method="unigram"):
        #computes the log probability (and length) of a corpus contained in filenames
        if filenames==[]:
            filenames=self.files
        
        total_p=0
        total_N=0
        for i,afile in enumerate(filenames):
            print("Processing file {}:{}".format(i,afile))
            try:
                with open(os.path.join(self.training_dir,afile)) as instream:
                    for line in instream:
                        line=line.rstrip()
                        if len(line)>0:
                            p,N=self.compute_prob_line(line,method=method)
                            total_p+=p
                            total_N+=N
            except UnicodeDecodeError:
                print("UnicodeDecodeError processing file {}: ignoring rest of file".format(afile))
        return total_p,total_N
    
    def compute_perplexity(self,filenames=[],method="unigram"):
        
        #compute the log probability and length of the corpus
        #calculate perplexity
        #lower perplexity means that the model better explains the data
        
        p,N=self.compute_probability(filenames=filenames,method=method)
        #print(p,N)
        pp=math.exp(-p/N)
        return pp    

In [52]:
mylm=language_model(files=trainingfiles[:MAX_FILES])

Processing NDRTH10.TXT
Processing 3BOAT10.TXT
Processing DLANG10.TXT
Processing MRAMN10.TXT
Processing BLIXN10.TXT
Finalising probability distribution


In [53]:
mylm.generate(method='bigram')

'up a time , and visible . I brought them a medical point and turned to Kingston , the'

In [54]:
mylm.compute_perplexity()

Processing file 0:NDRTH10.TXT
Processing file 1:3BOAT10.TXT
Processing file 2:DLANG10.TXT
Processing file 3:MRAMN10.TXT
Processing file 4:BLIXN10.TXT


616.7373269768318

In [55]:
mylm.compute_perplexity(method='bigram')

Processing file 0:NDRTH10.TXT
Processing file 1:3BOAT10.TXT
Processing file 2:DLANG10.TXT
Processing file 3:MRAMN10.TXT
Processing file 4:BLIXN10.TXT


52.18346984478202

### 5 Dealing with Unknowns
What happens when you compute the perplexity of a corpus not used in training?
Zero probabilities ....

In [56]:
mylm.compute_perplexity(filenames=heldoutfiles[:MAX_FILES])

Processing file 0:PLGRM10.TXT


ValueError: math domain error

In [57]:
mylm.unigram

{'__START': 0.06813903104605357,
 '*': 0.0011622519837284722,
 'The': 0.0033421439532099846,
 'Project': 0.00032135999550096005,
 'Gutenberg': 0.00018745999737556003,
 'Etext': 6.427199910019202e-05,
 'of': 0.01984665772214679,
 'Moon': 1.8745999737556003e-05,
 'Endureth': 1.3389999812540003e-05,
 'by': 0.0026003379635952686,
 'Buchan': 1.6067999775048005e-05,
 '__END': 0.06813903104605357,
 '#': 2.1423999700064004e-05,
 '5': 8.033999887524002e-06,
 'in': 0.010939629846845182,
 'our': 0.0010899459847407563,
 'series': 2.6779999625080007e-05,
 'John': 5.891599917517601e-05,
 'Copyright': 1.0711999850032002e-05,
 'laws': 5.088199928765201e-05,
 'are': 0.0017728359751802963,
 'changing': 3.213599955009601e-05,
 'all': 0.0031520059558719166,
 'over': 0.0011354719841033922,
 'the': 0.04002806543960708,
 'world': 0.0005570239922016641,
 ',': 0.053578745249897565,
 'be': 0.0034037379523476686,
 'sure': 0.00016603599767549604,
 'to': 0.01832287574347974,
 'check': 3.4813999512604004e-05,
 'cop

In [58]:
rarewords=[]
for k,v in mylm.unigram.items():
    if v < 0.05:
        rarewords.append(k)
        
for word in rarewords:
    del mylm.unigram[word]

In [59]:
mylm.unigram

{'__START': 0.06813903104605357,
 '__END': 0.06813903104605357,
 ',': 0.053578745249897565}

In [60]:
class language_model():
    
    def __init__(self,trainingdir=TRAINING_DIR,files=[]):
        self.training_dir=trainingdir
        self.files=files
        self.train()
        
    def train(self):
        self.unigram={}
        self.bigram={}
         
        self.processfiles()
        print("Removing rare words")
        self.make_unknowns()
        print("Finalising probability distribution")

        self.convert_to_probs()
        
    
    def processline(self,line):
        tokens=["__START"]+tokenize(line)+["__END"]
        previous="__END"
        for token in tokens:
            self.unigram[token]=self.unigram.get(token,0)+1
            current=self.bigram.get(previous,{})
            current[token]=current.get(token,0)+1
            self.bigram[previous]=current
            previous=token
            
    
    def processfiles(self):
        for afile in self.files:
            print("Processing {}".format(afile))
            with open(os.path.join(self.training_dir,afile),errors='ignore') as instream:
                for line in instream:
                    line=line.rstrip()
                    if len(line)>0:
                        self.processline(line)
            
      
            
    def convert_to_probs(self):
        
        self.unigram={k:v/sum(self.unigram.values()) for (k,v) in self.unigram.items()}
        self.bigram={key:{k:v/sum(adict.values()) for (k,v) in adict.items()} for (key,adict) in self.bigram.items()}
    
        
    def get_prob(self,token,context="",method="unigram"):
        if method=="unigram":
            return self.unigram.get(token,self.unigram.get("__UNK",0))
        elif method=="bigram":
            bigram=self.bigram.get(context[-1],self.bigram.get("__UNK",{}))
            return bigram.get(token,bigram.get("__UNK",0))
    
    
    def nextlikely(self,current="",method="unigram"):
        blacklist=["__START"]
       
        if method=="unigram":
            dist=self.unigram
        else:
            dist=self.bigram.get(current,{})
    
        mostlikely=list(dist.items())
        #filter out any undesirable tokens
        filtered=[(w,p) for (w,p) in mostlikely if w not in blacklist]
        #choose one randomly from the top k
        words,probdist=zip(*filtered)
        res=random.choices(words,probdist)[0]
        return res
    
    def generate(self,end="__END",limit=20,method="bigram"):
        current="__START"
        tokens=[]
        while current!=end and len(tokens)<limit:
            current=self.nextlikely(current=current,method=method)
            tokens.append(current)
        return " ".join(tokens[:-1])
    
    
    
    def compute_prob_line(self,line,method="unigram"):
        #this will add _start to the beginning of a line of text
        #compute the probability of the line according to the desired model
        #and returns probability together with number of tokens
        
        
        tokens=["__START"]+tokenize(line)+["__END"]
        acc=0
        for i,token in enumerate(tokens[1:]):
            acc+=math.log(self.get_prob(token,tokens[:i+1],method))
        return acc,len(tokens[1:])
       
    
    def compute_probability(self,filenames=[],method="unigram"):
        #computes the probability (and length) of a corpus contained in filenames
        if filenames==[]:
            filenames=self.files
        
        total_p=0
        total_N=0
        for i,afile in enumerate(filenames):
            print("Processing file {}:{}".format(i,afile))
            try:
                with open(os.path.join(self.training_dir,afile)) as instream:
                    for line in instream:
                        line=line.rstrip()
                        if len(line)>0:
                            p,N=self.compute_prob_line(line,method=method)
                            total_p+=p
                            total_N+=N
            except UnicodeDecodeError:
                print("UnicodeDecodeError processing file {}: ignoring rest of file".format(afile))
        return total_p,total_N
    
    def compute_perplexity(self,filenames=[],method="unigram"):
        
        #compute the probability and length of the corpus
        #calculate perplexity
        #lower perplexity means that the model better explains the data
        
        p,N=self.compute_probability(filenames=filenames,method=method)
        #print(p,N)
        pp=math.exp(-p/N)
        return pp  
    
    def make_unknowns(self,known=2):
        unknown=0
        for (k,v) in list(self.unigram.items()):
            if v<known:
                del self.unigram[k]
                self.unigram["__UNK"]=self.unigram.get("__UNK",0)+v
        for (k,adict) in list(self.bigram.items()):
            for (kk,v) in list(adict.items()):
                isknown=self.unigram.get(kk,0)
                if isknown==0:
                    adict["__UNK"]=adict.get("__UNK",0)+v
                    del adict[kk]
            isknown=self.unigram.get(k,0)
            if isknown==0:
                del self.bigram[k]
                current=self.bigram.get("__UNK",{})
                current.update(adict)
                self.bigram["__UNK"]=current
                
            else:
                self.bigram[k]=adict

In [61]:
mylm=language_model(files=trainingfiles[:MAX_FILES])
p=mylm.compute_perplexity()
print("Training data unigram perplexity: {}".format(p))
p=mylm.compute_perplexity(filenames=heldoutfiles[:MAX_FILES])
print("Testing data unigram perplexity: {}".format(p))


Processing NDRTH10.TXT
Processing 3BOAT10.TXT
Processing DLANG10.TXT
Processing MRAMN10.TXT
Processing BLIXN10.TXT
Removing rare words
Finalising probability distribution
Processing file 0:NDRTH10.TXT
Processing file 1:3BOAT10.TXT
Processing file 2:DLANG10.TXT
Processing file 3:MRAMN10.TXT
Processing file 4:BLIXN10.TXT
Training data unigram perplexity: 475.7186614474083
Processing file 0:PLGRM10.TXT
Processing file 1:FALK10.TXT
Processing file 2:JNGLB10.TXT
Processing file 3:PTPED10.TXT
Processing file 4:ANNE11.TXT
Testing data unigram perplexity: 357.3697320922639


In [62]:
p=mylm.compute_perplexity(method="bigram")
print("Training data bigram perplexity: {}".format(p))
p=mylm.compute_perplexity(filenames=heldoutfiles[:MAX_FILES],method="bigram")
print("Testing data bigram perplexity: {}".format(p))


Processing file 0:NDRTH10.TXT
Processing file 1:3BOAT10.TXT
Processing file 2:DLANG10.TXT
Processing file 3:MRAMN10.TXT
Processing file 4:BLIXN10.TXT
Training data bigram perplexity: 56.93348358140645
Processing file 0:PLGRM10.TXT


ValueError: math domain error

In [63]:
mylm.generate(method='unigram')

'pleasant'

In [64]:
mylm.generate(method='bigram')

"__UNK curls , who squatted down there is belied . It can do n't going to love with basket"

## Smoothing / Discounting
As well as dealing with previously unseen tokens, we also need to deal with unseen token combinations.

One approach is Laplacian (add-k) smoothing.  However, this is slow (takes a lot of space in memory) and tends to assign too much probability mass to unseen combinations.

An alternative is discounting.  Take a small amount away from each observed co-occurrence count (e.g., 0.75) and reserve this for "unseen" co-occurrences.  This is distributed between all words according to their unigram (or some other) probability

In [66]:
class language_model():
    
    def __init__(self,trainingdir=TRAINING_DIR,files=[]):
        self.training_dir=trainingdir
        self.files=files
        self.train()
    
    def train(self):
        self.unigram={}
        self.bigram={}
         
        self._processfiles()
        print("Removing rare words")
        self._make_unknowns()
        print("Applying discounts")
        self._discount()
        print("Finalising probability distributions")
        self._convert_to_probs()
        
    
    def _processline(self,line):
        tokens=["__START"]+tokenize(line)+["__END"]
        previous="__END"
        for token in tokens:
            self.unigram[token]=self.unigram.get(token,0)+1
            current=self.bigram.get(previous,{})
            current[token]=current.get(token,0)+1
            self.bigram[previous]=current
            previous=token
            
    
    def _processfiles(self):
        for afile in self.files:
            print("Processing {}".format(afile))
            with open(os.path.join(self.training_dir,afile),errors='ignore') as instream:
                for line in instream:
                    line=line.rstrip()
                    if len(line)>0:
                        self._processline(line)
                  
            
    def _convert_to_probs(self):
        
        self.unigram={k:v/sum(self.unigram.values()) for (k,v) in self.unigram.items()}
        self.bigram={key:{k:v/sum(adict.values()) for (k,v) in adict.items()} for (key,adict) in self.bigram.items()}
    
        
    def get_prob(self,token,context=[],method="unigram"):
        if method=="unigram":
            return self.unigram.get(token,self.unigram.get("__UNK",0))
        elif method=="bigram":
            bigram=self.bigram.get(context[-1],self.bigram.get("__UNK",{}))
            big_p=bigram.get(token,bigram.get("__UNK",0))
            lmbda=bigram["__DISCOUNT"]
            uni_p=self.unigram.get(token,self.unigram.get("__UNK",0))
            #print(big_p,lmbda,uni_p)
            p=big_p+lmbda*uni_p            
            return p
    
    
    def nextlikely(self,current="",method="unigram"):
        blacklist=["__START","__DISCOUNT"]
       
        if method=="unigram":
            dist=self.unigram
        else:
            dist=self.bigram.get(current,{})
    
        mostlikely=list(dist.items())
        #filter out any undesirable tokens
        filtered=[(w,p) for (w,p) in mostlikely if w not in blacklist]
        #print(current,len(filtered))
        #choose one randomly from the top k
        words,probdist=zip(*filtered)
        res=random.choices(words,probdist)[0]
        return res
    
    def generate(self,end="__END",limit=20,method="bigram"):
        current="__START"
        tokens=[]
        while current!=end and len(tokens)<limit:
            current=self.nextlikely(current=current,method=method)
            tokens.append(current)
        return " ".join(tokens[:-1])
    
    
    
    def compute_prob_line(self,line,method="unigram"):
        #this will add _start to the beginning of a line of text
        #compute the probability of the line according to the desired model
        #and returns probability together with number of tokens
        
        tokens=["__START"]+tokenize(line)+["__END"]
        acc=0
        for i,token in enumerate(tokens[1:]):
            acc+=math.log(self.get_prob(token,tokens[:i+1],method))
        return acc,len(tokens[1:])
    
    def compute_probability(self,filenames=[],method="unigram"):
        #computes the probability (and length) of a corpus contained in filenames
        if filenames==[]:
            filenames=self.files
        
        total_p=0
        total_N=0
        for i,afile in enumerate(filenames):
            print("Processing file {}:{}".format(i,afile))
            try:
                with open(os.path.join(self.training_dir,afile)) as instream:
                    for line in instream:
                        line=line.rstrip()
                        if len(line)>0:
                            p,N=self.compute_prob_line(line,method=method)
                            total_p+=p
                            total_N+=N
            except UnicodeDecodeError:
                print("UnicodeDecodeError processing file {}: ignoring rest of file".format(afile))
        return total_p,total_N
    
    def compute_perplexity(self,filenames=[],method="unigram"):
        
        #compute the probability and length of the corpus
        #calculate perplexity
        #lower perplexity means that the model better explains the data
        
        p,N=self.compute_probability(filenames=filenames,method=method)
        #print(p,N)
        pp=math.exp(-p/N)
        return pp  
    
    def _make_unknowns(self,known=2):
        unknown=0
        for (k,v) in list(self.unigram.items()):
            if v<known:
                del self.unigram[k]
                self.unigram["__UNK"]=self.unigram.get("__UNK",0)+v
        for (k,adict) in list(self.bigram.items()):
            for (kk,v) in list(adict.items()):
                isknown=self.unigram.get(kk,0)
                if isknown==0:
                    adict["__UNK"]=adict.get("__UNK",0)+v
                    del adict[kk]
            isknown=self.unigram.get(k,0)
            if isknown==0:
                del self.bigram[k]
                current=self.bigram.get("__UNK",{})
                current.update(adict)
                self.bigram["__UNK"]=current
                
            else:
                self.bigram[k]=adict
                
    def _discount(self,discount=0.75):
        #discount each bigram count by a small fixed amount
        self.bigram={k:{kk:value-discount for (kk,value) in adict.items()}for (k,adict) in self.bigram.items()}
        
        #for each word, store the total amount of the discount so that the total is the same 
        #i.e., so we are reserving this as probability mass
        for k in self.bigram.keys():
            lamb=len(self.bigram[k])
            self.bigram[k]["__DISCOUNT"]=lamb*discount
     

In [67]:
MAX_FILES=5

filesets={"training":trainingfiles[:MAX_FILES],"testing":heldoutfiles[:MAX_FILES]}


mylm=language_model(files=filesets["training"])
methods=["unigram","bigram"]
#methods=["bigram"]

for f,names in list(filesets.items()):
    for m in methods:

        p=mylm.compute_perplexity(filenames=names,method=m)
        
        print("Perplexity on {} with {} method is {}".format(f,m,p))


Processing NDRTH10.TXT
Processing 3BOAT10.TXT
Processing DLANG10.TXT
Processing MRAMN10.TXT
Processing BLIXN10.TXT
Removing rare words
Applying discounts
Finalising probability distributions
Processing file 0:NDRTH10.TXT
Processing file 1:3BOAT10.TXT
Processing file 2:DLANG10.TXT
Processing file 3:MRAMN10.TXT
Processing file 4:BLIXN10.TXT
Perplexity on training with unigram method is 475.7186614474083
Processing file 0:NDRTH10.TXT
Processing file 1:3BOAT10.TXT
Processing file 2:DLANG10.TXT
Processing file 3:MRAMN10.TXT
Processing file 4:BLIXN10.TXT
Perplexity on training with bigram method is 71.91380452189523
Processing file 0:PLGRM10.TXT
Processing file 1:FALK10.TXT
Processing file 2:JNGLB10.TXT
Processing file 3:PTPED10.TXT
Processing file 4:ANNE11.TXT
Perplexity on testing with unigram method is 357.3697320922639
Processing file 0:PLGRM10.TXT
Processing file 1:FALK10.TXT
Processing file 2:JNGLB10.TXT
Processing file 3:PTPED10.TXT
Processing file 4:ANNE11.TXT
Perplexity on testing w

In [68]:
mylm.generate(method="unigram")

"'' , `` __UNK"

In [69]:
mylm.generate(method="bigram")

'connection with'