In [None]:
# Importing the required libraries
import string
import random
import nltk
import time
import json
from nltk.tokenize import RegexpTokenizer
import numpy as np

# Download NLTK's 'punkt' resource.
# NOTE(review): the cells visible here tokenize only with RegexpTokenizer —
# confirm whether this download is still required.
nltk.download('punkt')

In [None]:
# Mounting the gdrive
# Colab-only step: mounts Google Drive at /content/gdrive, the location the
# dataset paths in the next cell point into.
from google.colab import drive
drive.mount('/content/gdrive')

In [None]:
# Locations of the training corpus, the validation questions (JSON lines) and
# the file that receives the co-occurrence matrix dump.
file_path = "/content/gdrive/My Drive/CBT LM Dataset/train.txt"
json_path = "/content/gdrive/My Drive/CBT LM Dataset/validation.jsonl"
comat_path = "/content/blank.csv"

# Read the whole training corpus into memory. The context manager closes the
# handle even when the read raises, which the manual open/close pair did not.
with open(file_path, 'r') as file:
  text = file.read()


In [None]:
# Turn the corpus into one normalised sentence per input line:
# every punctuation character is removed and the rest is lowercased.
sent = [
  line.translate(str.maketrans('', '', string.punctuation)).lower()
  for line in text.split('\n')
]

In [None]:
# Bigram language model: counts, contexts and vocabulary
class bigramModel(object):
  """Bigram model that is built up one sentence at a time.

  Attributes:
    context: maps a previous word to the list of words observed right after
      it, one list entry per occurrence.
    bigram_counter: nested dict, previous word -> {(previous, target): count};
      counts are stored as floats.
    vocab: set of every distinct word seen, including the '<START>' marker.
  """

  def __init__(self):
    self.bigram_counter = {}
    self.context = {}
    self.vocab = set()

  def calculateBigrams(self, tokens):
    """Return the (previous, current) pairs of `tokens`, starting from '<START>'."""
    padded = ['<START>'] + tokens
    return list(zip(padded, padded[1:]))

  def update(self, sentence):
    """Tokenize `sentence` and add all of its bigrams to the model."""
    word_tokenizer = RegexpTokenizer(r"\w+")
    pairs = self.calculateBigrams(tokenize(sentence, word_tokenizer))

    for pair in pairs:
      prevWord, targetWord = pair

      # both ends of the bigram belong to the vocabulary
      self.vocab.update((prevWord, targetWord))

      # bump the (previous, target) count, creating the inner dict on demand
      per_prev = self.bigram_counter.setdefault(prevWord, {})
      per_prev[pair] = per_prev.get(pair, 0.0) + 1.0

      # remember every follower of the previous word, duplicates included
      self.context.setdefault(prevWord, []).append(targetWord)


In [None]:
# Unsmoothed (maximum likelihood) choice among the candidate words
def probabilitySimple(model, word, options):
  """Pick the option that most often follows `word` in the training data.

  Each option is scored as count(word, option) / total count of bigrams that
  start with `word`. Options never seen after `word` are ignored, and on a tie
  the earliest option wins.

  Returns:
    The best option, or "" when `word` was never a context or none of the
    options ever followed it.
  """
  best_prob, best_option = -1, ""

  # unknown context: nothing to score
  if word not in model.bigram_counter:
    return best_option

  followers = model.bigram_counter[word]
  total = sum(followers.values())

  for option in options:
    pair = (word, option)
    if pair not in followers:
      continue
    prob = followers[pair] / total
    if prob > best_prob:
      best_prob, best_option = prob, option

  return best_option

In [None]:
# Add 1 smoothing
def probibilityLaplace(model, context, token):
  """Laplace (add-one) estimate of P(token | context).

  Computes (count(context, token) + 1) / (count(context) + |vocab|), where
  count(context) is the number of followers recorded in model.context.

  Args:
    model: object exposing `context`, `bigram_counter` and `vocab`.
    context: the previous word.
    token: the candidate next word.

  Returns:
    The smoothed probability, or -1 when `context` never occurred as a
    previous word in the training data.
  """
  # Only the "unseen context" case maps to the -1 sentinel. Other errors (for
  # example a malformed model) now propagate instead of being swallowed by a
  # bare except.
  try:
    followers = model.context[context]
  except KeyError:
    return -1

  count_of_context = float(len(followers) + len(model.vocab))
  if count_of_context == 0:
    return -1

  # a bigram missing from the counter simply has a raw count of zero
  seen = model.bigram_counter.get(context, {}).get((context, token), 0)
  return (seen + 1) / count_of_context


In [None]:
# Add K smoothing. We need to provide k using the dev set (0 < k < 1)
def probibilityAddK(model, context, token, k):
  """Add-k estimate of P(token | context).

  Computes (count(context, token) + k) / (count(context) + k * |vocab|).
  The denominator used to carry an additional |vocab| term left over from the
  add-one version, so the estimates did not sum to one and k = 1 did not
  reproduce Laplace smoothing.

  Args:
    model: object exposing `context`, `bigram_counter` and `vocab`.
    context: the previous word.
    token: the candidate next word.
    k: pseudo-count added to every bigram.

  Returns:
    The smoothed probability, or -1 when `context` never occurred as a
    previous word in the training data.
  """
  # Only an unseen context yields the -1 sentinel; other errors propagate.
  try:
    followers = model.context[context]
  except KeyError:
    return -1

  count_of_context = float(len(followers)) + k * len(model.vocab)
  if count_of_context == 0:
    return -1

  # a bigram missing from the counter has a raw count of zero
  seen = model.bigram_counter.get(context, {}).get((context, token), 0)
  return (seen + k) / count_of_context

In [None]:
def callbigramModel(sentences):
  """Train a fresh bigramModel on every item of `sentences` and return it."""
  trained = bigramModel()
  for text_line in sentences:
    # each sentence contributes its bigrams to the shared model
    trained.update(text_line)
  return trained

# Lowercase a sentence, then split it with the supplied tokenizer.
def tokenize(sentence, tokenizer):
  """Return `tokenizer.tokenize` applied to the lowercased `sentence`."""
  return tokenizer.tokenize(sentence.lower())

In [None]:
# Predict the masked word from the word immediately before it
def returnMaskedWord(model, sentence, options, smoothing = "None"):
  """Fill the 'XXXXX' placeholder in `sentence` with one of `options`.

  The word right before the placeholder ('<START>' when the placeholder comes
  first) is used as the bigram context and handed to findMaskedWord.

  Args:
    model: trained bigram model.
    sentence: whitespace-separated text containing the 'XXXXX' placeholder.
    options: candidate words; must be non-empty.
    smoothing: "None", "Laplace" or "AddK".

  Returns:
    The chosen option. Falls back to the last option when no candidate could
    be scored or the sentence has no placeholder.
  """
  previous = '<START>'

  for word in sentence.split():
    if word == 'XXXXX':
      ans = findMaskedWord(model, previous, options, smoothing)
      return ans if ans != "" else options[-1]

    # The model is trained on lowercased text, so the context has to be
    # lowercased too; otherwise capitalised words never match any bigram.
    previous = word.lower()

  # no placeholder in the sentence
  return options[-1]

def findMaskedWord(mod, word, options, smoothing, k = 0.2):
  """Choose the option most likely to follow `word` under the given smoothing.

  Args:
    mod: trained bigram model. It is now the model actually queried; the
      previous code ignored it and read the global `model` instead.
    word: context word that precedes the blank.
    options: candidate words.
    smoothing: "AddK", "Laplace", or anything else for no smoothing.
    k: pseudo-count for "AddK" (tune on the dev set); defaults to the value
      that used to be hard-coded.

  Returns:
    The option with the highest positive probability, or "" if none has one.
  """
  # Without smoothing the whole candidate list is scored in a single call.
  if smoothing not in ("AddK", "Laplace"):
    return probabilitySimple(mod, word, options)

  maxi = 0
  maxwd = ""
  for option in options:
    if smoothing == "AddK":
      probX = probibilityAddK(mod, word, option, k)
    else:
      probX = probibilityLaplace(mod, word, option)

    # the -1 "unknown context" sentinel never beats the initial 0
    if probX > maxi:
      maxi = probX
      maxwd = option

  return maxwd

Calling the Model

In [None]:
# Train the bigram model on the preprocessed corpus and report the wall time.
start = time.time()

# Training the model
model = callbigramModel(sent)
print("Bigram model Trained")
# Print the elapsed seconds as a plain number; the braces that used to wrap
# the expression built a one-element set, so the output looked like "{12.3}".
print("Time taken to create the model:", time.time() - start)

In [None]:
# Load the validation questions: one JSON object per line of the file.
test_sents = []
with open(json_path, 'r') as json_file:  # closed automatically, even on error
  for line in json_file:
    # skip empty lines (e.g. a trailing newline) instead of failing to parse them
    if line.strip():
      test_sents.append(json.loads(line))

Accuracy of The Models

In [None]:
# Write one prediction record to an open text file
def printToFile(f, sentence, testops, pred):
  """Append the sentence, its options and the prediction to `f`, one per line.

  `testops` is converted with str(); `sentence` and `pred` are written as given.
  """
  f.writelines((sentence, "\n", str(testops), '\n', pred, '\n'))
  

In [None]:
# Accuracy of the unsmoothed bigram model on the validation questions.
# Uncomment the three file lines to dump every prediction to output1.txt.
# f = open("output1.txt", "w")
correctWord, totalWords = 0, 0

for test in test_sents:

  # punctuation is stripped from the question before predicting
  sentence = test['question'].translate(str.maketrans('', '', string.punctuation))
  pred = returnMaskedWord(model, sentence, test['options'], "None")

  # printToFile(f, sentence, test['options'], pred)

  totalWords += 1
  correctWord += int(pred == test['answer'])
# f.close()
print("The accuracy of the bigram model wihout smoothing is: ", correctWord * 100 / totalWords)

In [None]:
# Accuracy of the bigram model with Laplace smoothing on the validation questions.
# Uncomment the three file lines to dump every prediction to outputLaplace.txt.
# f = open("outputLaplace.txt", "w")
correctWord, totalWords = 0, 0

for test in test_sents:

  # punctuation is stripped from the question before predicting
  s = test['question'].translate(str.maketrans('', '', string.punctuation))
  pred = returnMaskedWord(model, s, test['options'], "Laplace")

  # printToFile(f, s, test['options'], pred)

  totalWords += 1
  correctWord += int(pred == test['answer'])
# f.close()
print("The accuracy of the bigram model with Laplace smoothing is: ", correctWord * 100 / totalWords)

In [None]:
# Accuracy of the bigram model with add-k smoothing on the validation questions.
# Uncomment the three file lines to dump every prediction to outputAddK.txt.
# f = open("outputAddK.txt", "w")

correctWord, totalWords = 0, 0
for test in test_sents:

  # punctuation is stripped from the question before predicting
  s = test['question'].translate(str.maketrans('', '', string.punctuation))
  pred = returnMaskedWord(model, s, test['options'], "AddK")

  # printToFile(f, s, test['options'], pred)

  totalWords += 1
  correctWord += int(pred == test['answer'])

# f.close()

print("The accuracy of the bigram model with Add k smoothing is: ", correctWord * 100 / totalWords)

### Bonus

In [None]:
# Unsmoothed choice using both the previous and the following word
def probabilitySimpleFuture(model, prevWord, nextWord, options):
  """Pick the option that best fits between `prevWord` and `nextWord`.

  Each option is scored as P(option | prevWord) * P(nextWord | option) using
  raw relative frequencies. When `nextWord` is the string "Null" only the
  first factor is used. An option is skipped when any bigram it needs was
  never observed; ties go to the earliest option.

  Returns:
    The best option, or "" when `prevWord` is an unknown context or every
    option was skipped.
  """
  counts = model.bigram_counter
  best_score, best_option = -1, ""

  # unknown left context: nothing can be scored
  if prevWord not in counts:
    return best_option

  prev_followers = counts[prevWord]
  prev_total = sum(prev_followers.values())

  for opt in options:
    try:
      # P(opt | prevWord)
      score = prev_followers[(prevWord, opt)] / prev_total

      if nextWord != "Null":
        # P(nextWord | opt); only applied when opt has recorded followers
        opt_followers = counts[opt]
        opt_total = sum(opt_followers.values())
        if opt_total > 0:
          score = score * (opt_followers[(opt, nextWord)] / opt_total)
    except KeyError:
      # some required bigram was never seen: this option is not a candidate
      continue

    if score > best_score:
      best_score, best_option = score, opt

  return best_option

In [None]:
# Return the masked word choice using the smoothing option
def findMaskedWordFuture(model, word, nextWord, options, smoothing, k = 0.00001):
  """Choose the option that best fits between `word` and `nextWord`.

  Every option is scored as P(option | word) * P(nextWord | option) with the
  requested smoothing.

  Args:
    model: trained bigram model.
    word: word preceding the blank.
    nextWord: word following the blank, or "Null" when the blank ends the
      sentence.
    options: candidate words.
    smoothing: "AddK", "Laplace", or anything else for no smoothing.
    k: pseudo-count for "AddK"; defaults to the value that used to be
      hard-coded.

  Returns:
    The option with the highest positive score, or "" if none has one.
  """
  # Without smoothing the whole candidate list is scored in a single call.
  if smoothing not in ("AddK", "Laplace"):
    return probabilitySimpleFuture(model, word, nextWord, options)

  maxi = 0
  maxwd = ""
  for opt in options:
    # P(opt | word)
    if smoothing == "AddK":
      x = probibilityAddK(model, word, opt, k)
    else:
      x = probibilityLaplace(model, word, opt)

    # P(nextWord | opt). "Null" means there is no following word, so the
    # factor is neutral, matching the unsmoothed path, instead of scoring
    # "Null" as if it were a real token.
    if nextWord == "Null":
      y = 1
    elif smoothing == "AddK":
      y = probibilityAddK(model, opt, nextWord, k)
    else:
      y = probibilityLaplace(model, opt, nextWord)

    # -1 flags an unknown context. Two such flags used to multiply to +1 and
    # win the comparison, so flagged options are now rejected explicitly.
    if x < 0 or y < 0:
      continue

    if x*y > maxi:
      maxi = x*y
      maxwd = opt

  # Return the word
  return maxwd


In [None]:
def returnMaskedWordFuture(model, sentence, options, smoothing):
  """Fill the 'XXXXX' placeholder using the words on both sides of it.

  The word before the placeholder ('<START>' if there is none) and the word
  after it ("Null" if the placeholder is last) are passed to
  findMaskedWordFuture.

  Args:
    model: trained bigram model.
    sentence: whitespace-separated text containing the 'XXXXX' placeholder.
    options: candidate words; must be non-empty.
    smoothing: "None", "Laplace" or "AddK".

  Returns:
    The chosen option. Falls back to the last option when no candidate could
    be scored or the sentence has no placeholder.
  """
  tokens = sentence.split()
  previous = '<START>'

  for position, word in enumerate(tokens):
    if word == 'XXXXX':
      # The model is trained on lowercased text, so both context words are
      # lowercased before lookup. "Null" stays as the no-next-word marker.
      if position + 1 < len(tokens):
        futureContext = tokens[position + 1].lower()
      else:
        futureContext = "Null"

      ans = findMaskedWordFuture(model, previous, futureContext, options, smoothing)
      return ans if ans != "" else options[-1]

    previous = word.lower()

  # Return the last word in case of no solution
  return options[-1]


In [None]:
# Two-sided context, no smoothing: accuracy on the validation questions.
# Uncomment the three file lines to dump predictions to NoSmoothingBothSides.txt.
# f = open("NoSmoothingBothSides.txt", "w")
correctWord, totalWords = 0, 0

for test in test_sents:

  # punctuation is stripped from the question before predicting
  s = test['question'].translate(str.maketrans('', '', string.punctuation))
  pred = returnMaskedWordFuture(model, s, test['options'], "None")

  # printToFile(f, s, test['options'], pred)

  totalWords += 1
  correctWord += int(pred == test['answer'])

# f.close()

print("The accuracy of model without smoothing: ", correctWord * 100 / totalWords)

In [None]:
# Two-sided context, Laplace smoothing: accuracy on the validation questions.
# Uncomment the three file lines to dump predictions to LaplaceBothSides.txt.
# f = open("LaplaceBothSides.txt", "w")
correctWord, totalWords = 0, 0

for test in test_sents:

  # punctuation is stripped from the question before predicting
  s = test['question'].translate(str.maketrans('', '', string.punctuation))
  pred = returnMaskedWordFuture(model, s, test['options'], "Laplace")

  # printToFile(f, s, test['options'], pred)

  totalWords += 1
  correctWord += int(pred == test['answer'])

# f.close()

print("The accuracy of model with Laplace Method: ", correctWord*100/totalWords)

In [None]:
# Two-sided context, add-k smoothing: accuracy on the validation questions.
# Uncomment the three file lines to dump predictions to AddKBothSides.txt.
# f = open("AddKBothSides.txt", "w")

correctWord, totalWords = 0, 0
for test in test_sents:

  # punctuation is stripped from the question before predicting
  s = test['question'].translate(str.maketrans('', '', string.punctuation))
  pred = returnMaskedWordFuture(model, s, test['options'], "AddK")

  # printToFile(f, s, test['options'], pred)

  totalWords += 1
  correctWord += int(pred == test['answer'])

# f.close()

print("The accuracy of model with Add k method: ", correctWord * 100 / totalWords)

### Instance of Co-occurrence matrix

In [None]:
def tokenize(sentence, tokenizer):
  """Return `tokenizer.tokenize` applied to the lowercased `sentence`."""
  lowered = sentence.lower()
  tokens = tokenizer.tokenize(lowered)
  return tokens
  
def calculateBigrams(tokens):
    """Return the (previous, current) pairs of `tokens`, starting from '<START>'."""
    padded = ['<START>'] + tokens
    # pair every token with its predecessor
    return list(zip(padded, padded[1:]))

def callbigramModel(sentences):
    """Train a fresh bigramModel on `sentences` and return it.

    The loop previously iterated over the global `sent`, so the argument was
    silently ignored; it now trains on exactly what the caller passes in.
    """
    model = bigramModel()
    for sentence in sentences:
      model.update(sentence)
    return model

In [None]:
# simple bigram model
class bigramModel2(object):
  """Bigram model that also assigns every word a dense integer index.

  Attributes:
    context: previous word -> list of followers, one entry per occurrence.
    bigram_counter: previous word -> {(previous, target): float count}.
    wd_to_idx: word -> index, numbered in order of first appearance.
    ct: next free index (equals the number of indexed words).
  """

  def __init__(self):
    self.context = {}
    self.bigram_counter = {}
    self.wd_to_idx = {}
    self.ct=0


  def update(self, sentence):
    """Tokenize `sentence` and add its bigrams, contexts and word indices."""
    tokenizer = RegexpTokenizer(r"\w+")
    bigrams = calculateBigrams(tokenize(sentence, tokenizer))
    for bigram in bigrams:
      prevWord, targetWord = bigram

      # count the bigram under its previous word
      per_prev = self.bigram_counter.setdefault(prevWord, {})
      per_prev[bigram] = per_prev.get(bigram, 0.0) + 1.0

      # record the follower, duplicates included
      self.context.setdefault(prevWord, []).append(targetWord)

      # index new words, previous word first so numbering follows reading order
      for word in (prevWord, targetWord):
        if word not in self.wd_to_idx:
          self.wd_to_idx[word] = self.ct
          self.ct += 1


  def prob(self, context, token):
      """Unsmoothed P(token | context); 0.0 for an unseen context or bigram.

      The count lives at bigram_counter[context][(context, token)]. The former
      lookup, bigram_counter[(context, token)], never matched a key, so every
      query returned 0.0.
      """
      try:
        count_of_token = self.bigram_counter[context][(context, token)]
        count_of_context = float(len(self.context[context]))
        result = count_of_token / count_of_context

      except KeyError:
        result = 0.0
      return result
  

In [None]:
# Co-occurrence matrix
# Uses a lot of space: one row of vocabulary size is written per word.
def co_mat(mod):
  """Write a row-normalised follower matrix of `mod` to `comat_path`.

  For every word in mod.wd_to_idx (in its iteration order) one line is
  written: a vector of vocabulary size with a 1 at the index of each distinct
  word that followed it, divided by the row sum when that sum is non-zero.
  Words with no recorded followers produce an all-zero row. Values are
  written with np.savetxt, each followed by ", ".

  Args:
    mod: model exposing `wd_to_idx` (word -> index) and `context`
      (word -> list of followers).
  """
  vocab_size = len(mod.wd_to_idx)

  # the context manager closes the file even if writing a row fails
  with open(comat_path, 'w') as file:
    for word in mod.wd_to_idx:
      comat = np.zeros(vocab_size)
      try:
        # NOTE(review): set() collapses repeated followers, so each distinct
        # follower contributes 1 regardless of frequency — confirm intended.
        for val in set(mod.context[word]):
          comat[mod.wd_to_idx[val]] += 1
      except KeyError:
        # word never occurred as a previous word: keep what was filled so far
        pass

      sum_vec = np.sum(comat)
      if sum_vec != 0:
        comat = comat/sum_vec

      np.savetxt(file, comat, newline=", ")
      file.write("\n")

# Build the index-aware model from the training sentences and dump its matrix.
# `mod` was never assigned anywhere in the notebook, so calling co_mat(mod)
# directly raised NameError on a fresh kernel.
mod = bigramModel2()
for sentence in sent:
  mod.update(sentence)
co_mat(mod)