<a href="https://colab.research.google.com/github/LordLean/sharing-github/blob/master/anle1_ngram.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import os
import random, math
import numpy as np
import pandas as pd
from nltk import word_tokenize as tokenize

import operator
import nltk 
nltk.download("punkt")
nltk.download("wordnet")

import tqdm

# Download lab2 resources.
os.system("gdown --id 1H26pdLFh2cDxU-NkflQHzcNCYWUgHCbX")
os.system("unzip lab2resources.zip")

# Download scc resources.
os.system("gdown --id 155TLf2OdXtvfPD8VsWwI2YlHfjS8ph04")

from IPython.display import clear_output
clear_output()

In [None]:
def get_training_testing(training_dir,split=0.5):

    filenames=os.listdir(training_dir)
    n=len(filenames)
    print("There are {} files in the training directory: {}".format(n,training_dir))
    # random.seed(53)  #if you want the same random split every time
    random.shuffle(filenames)
    index=int(n*split)
    trainingfiles=filenames[:index]
    heldoutfiles=filenames[index:]
    return trainingfiles,heldoutfiles

In [None]:
parentdir="lab2resources/sentence-completion"
trainingdir=os.path.join(parentdir,"Holmes_Training_Data")
training,testing=get_training_testing(trainingdir)

There are 522 files in the training directory: lab2resources/sentence-completion/Holmes_Training_Data


In [119]:
class n_gram_language_model():
    
    def __init__(self,trainingdir,files=[], construct_params={}):
        self.training_dir=trainingdir
        self.files=files
        # Constructor Parameters.
        self.construct_params=construct_params
        self.train()
        
    def train(self):    
        self.unigram={}
        self.bigram={}
        self.trigram={}
        self.quad_gram={}
         
        self._processfiles()
        self._make_unknowns(known=self.construct_params.get("known",2))
        self._discount()
        self._convert_to_probs()

        self.magic_counter = {"trigram": 0, "quad_gram":0}
        self.super_counter = {"trigram": 0, "quad_gram":0}
        
    
    def _processline(self,line):
        tokens=["__START"]+tokenize(line)+["__END"]
        previous="__END"
        for i, token in enumerate(tokens):
            # Unigram
            self.unigram[token]=self.unigram.get(token,0)+1
            # Bigram
            current=self.bigram.get(previous,{})
            current[token]=current.get(token,0)+1
            self.bigram[previous]=current
            previous=token
            # Trigram
            if i < len(tokens)-2:
              # Next words.
              next = tokens[i+1] 
              next_next = tokens[i+2]
              # Get dictionaries.
              inner = self.trigram.get(token,{})
              innermost = inner.get(next,{})
              innermost[next_next] = innermost.get(token,0) + 1
              # Write frequencies to dictionaries.
              inner[next] = innermost
              self.trigram[token] = inner
            # 4-gram
            if i < len(tokens)-3:
              # Next words.
              next1 = tokens[i+1] 
              next2 = tokens[i+2]
              next3 = tokens[i+3]
              # Get dictionaries.
              inner1 = self.quad_gram.get(token,{})
              inner2 = inner1.get(next1,{})
              inner3 = inner2.get(next2,{})
              inner3[next3] = inner3.get(token,0) + 1
              # Write frequencies to dictionaries.
              inner2[next2] = inner3
              inner1[next1] = inner2
              self.quad_gram[token] = inner1

                      
            
    
    def _processfiles(self):
        for afile in tqdm.tqdm(self.files):
            # print("Processing {}".format(afile))
            try:
                with open(os.path.join(self.training_dir,afile)) as instream:
                    for line in instream:
                        line=line.rstrip()
                        if len(line)>0:
                            self._processline(line)
            except UnicodeDecodeError:
                print("UnicodeDecodeError processing {}: ignoring rest of file".format(afile))
      
            
    def _convert_to_probs(self):
        self.unigram={k:v/sum(self.unigram.values()) for (k,v) in self.unigram.items()}
        self.bigram={key:{k:v/sum(adict.values()) for (k,v) in adict.items()} for (key,adict) in self.bigram.items()}
        self.trigram={k1:{k2:{k3:v/sum(adict2.values()) for k3, v in adict2.items()} for k2, adict2 in adict1.items()} for k1, adict1 in self.trigram.items()}
        self.quad_gram={k1:{k2:{k3:{k4:v/sum(adict3.values()) for k4, v in adict3.items()} for k3, adict3 in adict2.items()} for k2, adict2 in adict1.items()} for k1, adict1 in self.quad_gram.items()}
        self.kn={k:v/sum(self.kn.values()) for (k,v) in self.kn.items()}
    

    def nextlikely(self,k=1,current="",method="unigram"):
        #use probabilities according to method to generate a likely next sequence
        #choose random token from k best
        blacklist=["__START","__UNK","__DISCOUNT"]
        most_likely = []
        if method=="unigram":
            dist=self.unigram
            #sort the tokens by unigram probability
            most_likely=sorted(list(dist.items()),key=operator.itemgetter(1),reverse=True)
        elif method == "bigram":
            dist=self.bigram.get(current,self.bigram.get("__UNK",{}))
            most_likely=sorted(list(dist.items()),key=operator.itemgetter(1),reverse=True)
        elif method == "trigram":
            # Split context string for first and second context words.
            context = current.split()
            c1, c2 = context[0], context[1]
            dist = self.trigram[c1][c2]
            # Get all words with maximum value.
            most_likely = [(k, _) for k, v in dist.items() if v == max(dist.values())]
        elif method == "quad_gram":
            context = current.split(" ")
            c1,c2,c3 = context[0], context[1], context[2]
            dist = self.quad_gram[c1][c2][c3]
            most_likely = [(k, _) for k, v in dist.items() if v == max(dist.values())]

        #filter out any undesirable tokens
        filtered=[w for (w,p) in most_likely if w not in blacklist]
        #choose one randomly from the top k
        res=random.choice(filtered[:k])
        return res
    
    def generate(self,k=3,end="__END",limit=20,method="bigram",methodparams={}):
        if method=="":
            method=methodparams.get("method","bigram")
        current="__START"
        tokens=[]
        try: 
          # Trigram
          if method=="trigram":
            context_1 = current
            context_2 = random.choice([key for key, adict in self.trigram[current].items()])
            while context_2 != end and len(tokens)<limit:
              current = " ".join([context_1, context_2])
              current = self.nextlikely(k=k, current=current, method=method)
              tokens.append(current)
              context_1 = context_2
              context_2 = current
            return " ".join(tokens[:-1])
          # Quad-Gram
          elif method == "quad_gram":
            context_1 = current
            context_2 = random.choice([key for key, adict in self.quad_gram[context_1].items()])
            context_3 = random.choice([key for key, adict in self.quad_gram[context_1][context_2].items()])
            while context_3 != end and len(tokens) < limit: 
              current = " ".join([context_1, context_2, context_3])
              current = self.nextlikely(k=k, current=current, method=method)
              tokens.append(current)
              context_1 = context_2
              context_2 = context_3
              context_3 = current
            return " ".join(tokens[:-1])
        except:
          return self.generate(k=k,end=end,limit=limit,method=method,methodparams=methodparams)
        while current!=end and len(tokens)<limit:
            current=self.nextlikely(k=k,current=current,method=method)
            tokens.append(current)
        return " ".join(tokens[:-1])
    
    
    def get_prob(self,token,context="",methodparams={}):
        if methodparams.get("method","unigram")=="unigram":
            return self.unigram.get(token,self.unigram.get("__UNK",0))
        else:
            if methodparams.get("smoothing","kneser-ney")=="kneser-ney":
                unidist=self.kn
            else:
                unidist=self.unigram
            bigram=self.bigram.get(context[-1],self.bigram.get("__UNK",{}))
            big_p=bigram.get(token,bigram.get("__UNK",0))
            lmbda=bigram["__DISCOUNT"]
            uni_p=unidist.get(token,unidist.get("__UNK",0))
            #print(big_p,lmbda,uni_p)
            p=big_p+lmbda*uni_p            
            return p
    
    
    def compute_prob_line(self,line,methodparams={}):
        #this will add _start to the beginning of a line of text
        #compute the probability of the line according to the desired model
        #and returns probability together with number of tokens
        
        tokens=["__START"]+tokenize(line)+["__END"]
        acc=0
        acc+=math.log(1.0001)
        if methodparams.get("method", "unigram") in ["unigram", "bigram"]:
          for i,token in enumerate(tokens[1:]):
            acc+=math.log(self.get_prob(token,tokens[:i+1],methodparams))
          return acc,len(tokens[1:])
        # Trigram - nice & clean, endless if else statements - ha. 
        if methodparams.get("method") == "trigram":
          self.super_counter["trigram"] +=1
          try:
            for i, token in enumerate(tokens[1:]):
              if i < len(tokens[1:]) - 3 and len(tokens[1:]) >= 3:
                word1, word2, word3 = tokens[i+1], tokens[i+1+1], tokens[i+1+2]
                if word1 in self.trigram:
                  if word2 in self.trigram[word1]:
                    if word3 in self.trigram[word1][word2]:
                      acc+=math.log(self.trigram[word1][word2][word3])
                    else:
                      acc+=math.log(self.trigram[word1][word2]["__UNK"])
                  else:
                    if word3 in self.trigram[word1]["__UNK"]:
                      acc+=math.log(self.trigram[word1]["__UNK"][word3])
                    else: 
                      acc+=math.log(self.trigram[word1]["__UNK"]["__UNK"])
                else:
                  if word2 in self.trigram["__UNK"]:
                    if word3 in self.trigram["__UNK"][word2]:
                      acc+=math.log(self.trigram["__UNK"][word2][word3])
                    else:
                      acc+=math.log(self.trigram["__UNK"][word2]["__UNK"])
                  else:
                    if word3 in self.trigram["__UNK"]["__UNK"]:
                      acc+=math.log(self.trigram["__UNK"]["__UNK"][word3])
                    else:
                      acc+=math.log(self.trigram["__UNK"]["__UNK"]["__UNK"])
            return acc, len(tokens[1:])
          except KeyError:
            self.magic_counter["trigram"] += 1
            return acc, len(tokens[1:]) 
        # Quad_gram - same as above. FYI - if else if statements are used rather than if elif to enhance readability.
        if methodparams.get("method") == "quad_gram":
          self.super_counter["quad_gram"] +=1
          try:
            for i, token in enumerate(tokens[1:]):
              if i < len(tokens[1:]) - 4 and len(tokens[1:]) >= 4:
                word1, word2, word3, word4 = tokens[i+1], tokens[i+1+1], tokens[i+1+2], tokens[i+1+3]
                if word1 in self.quad_gram:
                  if word2 in self.quad_gram[word1]:
                    if word3 in self.quad_gram[word1][word2]:
                      if word4 in self.quad_gram[word1][word2][word3]:
                        acc+=math.log(self.quad_gram[word1][word2][word3][word4])
                      elif "__UNK" in self.quad_gram[word1][word2][word3]:
                        acc+=math.log(self.quad_gram[word1][word2][word3]["__UNK"])
                    else:
                      if word4 in self.quad_gram[word1][word2]["__UNK"]:
                        acc+=math.log(self.quad_gram[word1][word2]["__UNK"][word4])
                      elif "__UNK" in self.quad_gram[word1][word2]["__UNK"]:
                        acc+=math.log(self.quad_gram[word1][word2]["__UNK"]["__UNK"])
                  else:
                    if "__UNK" in self.quad_gram[word1]:
                      if word3 in self.quad_gram[word1]["__UNK"]:
                        if word4 in self.quad_gram[word1]["__UNK"][word3]:
                          acc+=math.log(self.quad_gram[word1]["__UNK"][word3][word4])
                        elif "__UNK" in self.quad_gram[word1]["__UNK"][word3]:
                          acc+=math.log(self.quad_gram[word1]["__UNK"][word3]["__UNK"])
                      else:
                        if "__UNK" in self.quad_gram[word1]["__UNK"]:
                          if word4 in self.quad_gram[word1]["__UNK"]["__UNK"]:
                            acc+=math.log(self.quad_gram[word1]["__UNK"]["__UNK"][word4])
                          elif "__UNK" in self.quad_gram[word1]["__UNK"]["__UNK"]:
                            acc+=math.log(self.quad_gram[word1]["__UNK"]["__UNK"]["__UNK"]
                else:
                  if "__UNK" in self.quad_gram:
                    if word2 in self.quad_gram["__UNK"]:
                      if word3 in self.quad_gram["__UNK"][word2]:
                        if word4 in self.quad_gram["__UNK"][word2][word3]:
                          acc+=math.log(self.quad_gram["__UNK"][word2][word3][word4])
                        elif "__UNK" in self.quad_gram["__UNK"][word2][word3]:
                          acc+=math.log(self.quad_gram["__UNK"][word2][word3]["__UNK"])
                      else:
                        if word4 in self.quad_gram["__UNK"][word2]["__UNK"]:
                          acc+=math.log(self.quad_gram["__UNK"][word2]["__UNK"][word4])
                        elif "__UNK" in self.quad_gram["__UNK"][word2]["__UNK"]:
                          acc+=math.log(self.quad_gram["__UNK"][word2]["__UNK"]["__UNK"])
                    else:
                      if "__UNK" in self.quad_gram["__UNK"]:
                        if word3 in self.quad_gram["__UNK"]["__UNK"]:
                          if word4 in self.quad_gram["__UNK"]["__UNK"][word3]:
                            acc+=math.log(self.quad_gram["__UNK"]["__UNK"][word3][word4])
                          elif "__UNK" in self.quad_gram["__UNK"]["__UNK"][word3]:
                            acc+=math.log(self.quad_gram["__UNK"]["__UNK"][word3]["__UNK"])
                        else:
                          if "__UNK" in self.quad_gram["__UNK"]["__UNK"]:
                            if word4 in self.quad_gram["__UNK"]["__UNK"]["__UNK"]:
                              acc+=math.log(self.quad_gram["__UNK"]["__UNK"]["__UNK"][word4])
                            elif "__UNK" in self.quad_gram["__UNK"]["__UNK"]["__UNK"]:
                              acc+=math.log(self.quad_gram["__UNK"]["__UNK"]["__UNK"]["__UNK"]
            return acc, len(tokens[1:])
          except KeyError:
            self.magic_counter["quad_gram"] += 1
            return acc, len(tokens[1:]) 
            
    
    def compute_probability(self,filenames=[],methodparams={}):
        #computes the probability (and length) of a corpus contained in filenames
        if filenames==[]:
            filenames=self.files
        
        total_p=0
        total_N=0
        for i,afile in enumerate(filenames):
            print("Processing file {}:{}".format(i,afile))
            try:
                with open(os.path.join(self.training_dir,afile)) as instream:
                    for line in instream:
                        line=line.rstrip()
                        if len(line)>0:
                            p,N=self.compute_prob_line(line,methodparams=methodparams)
                            total_p+=p
                            total_N+=N
            except UnicodeDecodeError:
                print("UnicodeDecodeError processing file {}: ignoring rest of file".format(afile))
        return total_p,total_N
    
    def compute_perplexity(self,filenames=[],methodparams={"method":"bigram","smoothing":"kneser-ney"}):
        """
        compute the probability and length of the corpus
        calculate perplexity
        lower perplexity means that the model better explains the data
        """
        p,N=self.compute_probability(filenames=filenames,methodparams=methodparams)
        # print(p,N)
        if methodparams.get("method") in ["trigram", "quad_gram"]:
          rem = self.super_counter[methodparams.get("method")] - self.magic_counter[methodparams.get("method")]
          pp=math.exp(-p/N) * (self.super_counter[methodparams.get("method")]/rem)
          return pp
        pp=math.exp(-p/N)
        return pp  
    

    def _make_unknowns(self,known=2):
        # Unigram -----------------------------------
        for (k,v) in list(self.unigram.items()):
            if v<known:
                del self.unigram[k]
                self.unigram["__UNK"]=self.unigram.get("__UNK",0)+v
        # Bigram -----------------------------------
        for (k,adict) in list(self.bigram.items()):
            for (kk,v) in list(adict.items()):
                isknown=self.unigram.get(kk,0)
                if isknown <= known:
                    adict["__UNK"]=adict.get("__UNK",0)+v
                    del adict[kk]
            isknown=self.unigram.get(k,0)
            if isknown <= known:
                del self.bigram[k]
                current=self.bigram.get("__UNK",{})
                current.update(adict)
                self.bigram["__UNK"]=current
            else:
                self.bigram[k]=adict
        # Trigram -----------------------------------
        for (k1, dict1) in list(self.trigram.items()):
          for (k2, dict2) in list(dict1.items()):
            for (k3, val) in list(dict2.items()):
              isknown=self.unigram.get(k3,0)
              if isknown == 0:
                dict2["__UNK"] = dict2.get("__UNK",0) + val
                del dict2[k3]
            isknown=self.unigram.get(k2,0)
            if isknown <= known:
              del self.trigram[k1][k2]
              current=self.trigram[k1].get("__UNK",{})
              current.update(dict2)
              self.trigram[k1]["__UNK"] = current
            else:
              self.trigram[k1][k2] = dict2
          # For first token:
          isknown=self.unigram.get(k1,0)
          if isknown <= known:
            del self.trigram[k1]
            current = self.trigram.get("__UNK",{})
            current.update(dict1)
            self.trigram["__UNK"] = current 
          else:
            self.trigram[k1] = dict1
        # Quad Gram -----------------------------------
        for (k1, dict1) in list(self.quad_gram.items()):
          for (k2, dict2) in list(dict1.items()):
            for (k3, dict3) in list(dict2.items()):
              for (k4, val) in list(dict3.items()):
                # Next
                isknown = self.unigram.get(k4,0)
                if isknown <= known:
                  dict3["__UNK"] = dict3.get("__UNK",0) + val
                  del dict3[k4]
              # Next
              isknown=self.unigram.get(k3,0)
              if isknown <= known:
                del self.quad_gram[k1][k2][k3]
                current = self.quad_gram[k1][k2].get("__UNK", {})
                current.update(dict3)
                self.quad_gram[k1][k2]["__UNK"] = current
              else:
                self.quad_gram[k1][k2][k3] = dict3
            # Next
            isknown=self.unigram.get(k2,0)
            if isknown <= known:
              del self.quad_gram[k1][k2]
              current = self.quad_gram[k1].get("__UNK",{})
              current.update(dict2)
              self.quad_gram[k1]["__UNK"] = current
            else:
              self.quad_gram[k1][k2] = dict2
          # Next
          isknown=self.unigram.get(k1,0)
          if isknown <= known:
            del self.quad_gram[k1]
            current = self.quad_gram.get("__UNK", {})
            current.update(dict1)
            self.quad_gram["__UNK"] = current
          else:
            self.quad_gram[k1] = dict1



                
    def _discount(self,discount=0.75):
        #discount each bigram count by a small fixed amount
        self.bigram={k:{kk:value-discount for (kk,value) in adict.items()}for (k,adict) in self.bigram.items()}
        
        #for each word, store the total amount of the discount so that the total is the same 
        #i.e., so we are reserving this as probability mass
        for k in self.bigram.keys():
            lamb=len(self.bigram[k])
            self.bigram[k]["__DISCOUNT"]=lamb*discount
            
        #work out kneser-ney unigram probabilities
        #count the number of contexts each word has been seen in
        self.kn={}
        for (k,adict) in self.bigram.items():
            for kk in adict.keys():
                self.kn[kk]=self.kn.get(kk,0)+1
    

SyntaxError: ignored

In [None]:
MAX_FILES=5

construct_params = {
    "known" : 0
}

# Initialize n-gram language model.
lm=n_gram_language_model(trainingdir=trainingdir,files=training[:MAX_FILES], construct_params=construct_params)

100%|██████████| 5/5 [00:08<00:00,  1.68s/it]


In [None]:
def get_lengths(lm):
  len_dict = {}
  # Unigram
  len_dict["unigram"] = len(lm.unigram)
  # Bigram
  total = 0
  for k1, d1, in lm.bigram.items():
    total += len(d1)
  len_dict["bigram"] = total
  # Trigram
  total = 0
  for k1, d1, in lm.trigram.items():
    for k2, d2 in d1.items():
      total += len(d2)
  len_dict["trigram"] = total
  # Quad_gram
  total = 0
  for k1, d1, in lm.quad_gram.items():
    for k2, d2 in d1.items():
      for k3, d3 in d2.items():
        total += len(d3)
  len_dict["quad_gram"] = total

  return len_dict

In [None]:
get_lengths(lm)

{'bigram': 171398, 'quad_gram': 383331, 'trigram': 318582, 'unigram': 20184}

In [None]:
lm.files

['TOTAM10.TXT', 'PRESC10.TXT', 'WLDFL10.TXT', 'BCITY10.TXT', '39STP10.TXT']

In [None]:
for method in ["unigram", "bigram", "trigram"]:
  print(method)
  print(lm.compute_perplexity(["BABSU10.TXT"], methodparams={"method":method}))
  print()

unigram
Processing file 0:BABSU10.TXT
66.32139148438647

bigram
Processing file 0:BABSU10.TXT
32.41842930441104

trigram
Processing file 0:BABSU10.TXT
21.412328391670034



In [None]:
lm.super_counter, lm.magic_counter

({'quad_gram': 0, 'trigram': 0}, {'quad_gram': 0, 'trigram': 0})

In [None]:
s = "__START this is a test sentence hello"
s = s.split()
for i, token in enumerate(s):
  if i < len(s) - 2:
    print(token)

__START
this
is
a
test


In [None]:
[lm.generate(method="quad_gram", limit=30) for i in range(100)]

["how much of their neighbor 's affairs the new generations",
 "has a child will do very well . ''",
 'fast as they could . You',
 'and looked out over Shoshone Land , but before',
 'half so large',
 'I had bought ground and built me a house beside',
 'days , till they fell in the traces to jerk',
 'upon him with the coming of the night',
 ', eating their bit',
 'time Spitz went through , dragging the whole team after',
 'in great numbers',
 '',
 'to Las Uvas',
 'But the weather',
 ', the death of one or the',
 'far back in the canon tangles is more',
 'But if you',
 'little apart from his family that he might meet it as became',
 "''",
 'the wild almond passes into the',
 ', you can receive',
 'down the Yukon',
 'and Salt Water . Perrault was a',
 'rain is over and gone they are stirred by the',
 "''",
 'with the',
 'cunning brush shelters from which the Shoshones shot',
 'mesa one sees',
 ', though not half so large',
 'surprised , too ;',
 'he forgot the pain of',
 'had',
 'pines wh

In [None]:
s = "he went to his desk after breakfast , these remarks : `` __UNK about"
s = s.split()
word1, word2, word3 = s[-3],s[-2],s[-1]
lm.quad_gram[word1][word2][word3]

KeyError: ignored

repeated quad_grams are so infrequent its often equal probability for the next word.

# Questions and Answers


In [None]:
import pandas as pd, csv
questions=os.path.join(parentdir,"testing_data.csv")
answers=os.path.join(parentdir,"test_answer.csv")

with open(questions) as instream:
    csvreader=csv.reader(instream)
    lines=list(csvreader)
qs_df=pd.DataFrame(lines[1:],columns=lines[0])

##  Building and evaluating an SCC system


In [None]:
class question:
    

    def __init__(self,aline):
        self.fields=aline
        self.num2letter = {
            0:"a",
            1:"b",
            2:"c",
            3:"d",
            4:"e"
            }
        self.tokenized = tokenize(self.fields[1])
        self.options = self.fields[2:7]
          

    def get_field(self,field):
        return self.fields[question.colnames[field]]
    

    def add_answer(self,fields):
        self.answer=fields[1]
   

    def get_context(self,window,target="_____",method="left"):
      for i, token in enumerate(self.tokenized):
        if token == target:
          if method=="left":
            return self.tokenized[i-window:i]
          elif method=="right": 
            return self.tokenized[i+1:i+1+window]


    def chooseA(self):
        return("a")


    def random(self):
      """
      Retrun random choice of letter.
      """
      return random.choice(self.num2letter)


    def unigram(self):
      """
      Return position of word with greatest unigram probability. 0 otherwise.
      """
      option_probs = [lm.unigram[word] if word in lm.unigram else 0 for word in self.options]
      index = option_probs.index(max(option_probs))
      return self.num2letter[index]


    def bigram(self, context_dir="left"):
      """
      Return position of word-pair with greatest bigram probability. 0 otherwise. 
      """
      context = self.get_context(1, method=context_dir)[0] # [0] to delist context.
      if context_dir == "left":
        option_probs = [lm.bigram[context][word] if context in lm.bigram and word in lm.bigram[context] else 0 for word in self.options]
      elif context_dir == "right":
        option_probs = [lm.bigram[word][context] if word in lm.bigram and context in lm.bigram[word] else 0 for word in self.options]
      index = option_probs.index(max(option_probs))
      return self.num2letter[index]


    def trigram(self, context_dir="left"):
      """
      Return position of word-group with greatest trigram probability. 0 otherwise. 
      """
      option_probs = []
      context = self.get_context(2, method=context_dir)
      if context_dir == "left":
        for word in self.options:
          if context[0] in lm.trigram and context[1] in lm.trigram[context[0]] and word in lm.trigram[context[0]][context[1]]:
            option_probs.append(lm.trigram[context[0]][context[1]][word])
          # Back off to bigram.
          elif context[1] in lm.bigram and word in lm.bigram[context[1]]:
            option_probs.append(lm.bigram[context[1]][word])
          # Back off to unigram.
          elif word in lm.unigram:
            option_probs.append(lm.unigram[word])
          # Else 0.
          else:
            option_probs.append(0)
      index = option_probs.index(max(option_probs))
      return self.num2letter[index]


    def quad_gram(self, context_dir="left"):
      """
      Return position of word-group with greatest trigram probability. 0 otherwise. 
      """
      option_probs = []
      context = self.get_context(3, method=context_dir)
      con_len = len(context)
      for word in self.options:
        if con_len == 3 and context[0] in lm.quad_gram and context[1] in lm.quad_gram[context[0]] and context[2] in lm.quad_gram[context[0]][context[1]] and word in lm.quad_gram[context[0]][context[1]][context[2]]:
          option_probs.append(lm.quad_gram[context[0]][context[1]][context[2]][word])
        # Back off to trigram.
        elif con_len == 2 and context[1] in lm.trigram and context[2] in lm.trigram[context[1]] and word in lm.trigram[context[1]][context[2]]:
          option_probs.append(lm.trigram[context[1]][context[2]][word])
        # Back off to bigram.
        elif con_len == 1 and context[2] in lm.bigram and word in lm.bigram[context[2]]:
          option_probs.append(lm.bigram[context[2]][word])
        # Back off to unigram.
        elif con_len == 0 and word in lm.unigram:
          option_probs.append(lm.unigram[word])
        # Else 0.
        else:
          option_probs.append(0)
      index = option_probs.index(max(option_probs))
      return self.num2letter[index]
    
    
    def predict(self, method="chooseA", additional_args=None):
        if method=="chooseA":
          return self.chooseA()
        elif method=="random":
          return self.random()
        elif method=="unigram":
          return self.unigram()
        elif method=="bigram":
          return self.bigram()
        elif method=="trigram":
          return self.trigram()
        elif method=="quad_gram":
          return self.quad_gram()


    def predict_and_score(self, method="chooseA", additional_args=None):
        #compare prediction according to method with the correct answer
        #return 1 or 0 accordingly
        prediction=self.predict(method=method, additional_args=additional_args)
        if prediction == self.answer:
            return 1
        else:
            return 0

In [None]:
class scc_reader:
    

    def __init__(self,qs=questions,ans=answers):
        self.qs=qs
        self.ans=ans
        self.read_files()


    def read_files(self):
        
        #read in the question file
        with open(self.qs) as instream:
            csvreader=csv.reader(instream)
            qlines=list(csvreader)
        
        #store the column names as a reverse index so they can be used to reference parts of the question
        question.colnames={item:i for i,item in enumerate(qlines[0])}
        
        #create a question instance for each line of the file (other than heading line)
        self.questions=[question(qline) for qline in qlines[1:]]
        
        #read in the answer file
        with open(self.ans) as instream:
            csvreader=csv.reader(instream)
            alines=list(csvreader)
            
        #add answers to questions so predictions can be checked    
        for q,aline in zip(self.questions,alines[1:]):
            q.add_answer(aline)


    def get_field(self,field):
        return [q.get_field(field) for q in self.questions] 
    

    def predict(self,method="chooseA"):
        return [q.predict(method=method) for q in self.questions]
    
    def predict_and_score(self,method="chooseA", additional_args=None):
        scores=[q.predict_and_score(method=method, additional_args=additional_args) for q in self.questions]
        return sum(scores)/len(scores)

SCC = scc_reader(questions, answers)

## Evaluation

## No Augmentation

In [None]:
methods = [
           "unigram", "bigram", "trigram", "quad_gram",
           ]

skipped = ["GloVe_cos", "GloVe_euc",
           "spacy", "spacy_no_stopwords"
           ]

df = pd.DataFrame({method: SCC.predict_and_score(method) for method in methods if method not in skipped}.items(), columns=["Method", "Score"])
df

NameError: ignored