<a href="https://colab.research.google.com/github/Moksha97/n_grams/blob/main/nlp_assignment1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# import statements
import re
import numpy as np

In [2]:
# Preprocessing
def preprocessing(path):
  splitted_text = []
  updated_para = []
  new_tokens = []
  cleaned_tokens = []
  prev_token = ""
  res_tokens = []
  with open(path, 'r') as file:
    # split the text based on period and append the <s> and </s> tokens
    # respectively.
    for line in file:
      line = line.strip()
      line = "<s> "+ line + " </s>"
      updated_para.append(line)

    # based on new tokens created the updated paragraph
    for para in updated_para:
      splitted_text = re.split(r'[ .]', para)
      splitted_text = [s for s in splitted_text if s]
      for token in splitted_text:
        new_tokens.append(token)

    # if the start<s> and end </s> tokens are side by side with no tokens
    # in between remove unnecessary tokens

    for token in new_tokens:
      if token == "</s>" and prev_token == "<s>":
        cleaned_tokens.pop()
      else:
        if token == '<s>' or token == '</s>':
          cleaned_tokens.append(token)
        else:
          if re.match("^[a-zA-Z]+$", token):
            cleaned_tokens.append(token)
            prev_token = token
    # add the tokens if it is of text or start or stop only
    for token in cleaned_tokens:
      if token == '<s>' or token == '</s>' or re.match("^[a-zA-Z]+$", token):
        res_tokens.append(token.lower())
  return res_tokens

In [3]:
# get N- grams
def get_N_Grams(tokens, n, unknown_tok = None):
  length = len(tokens)
  n_grams = []
  if n==1:
    for index in range(length-n+1):
      temp = tuple(tokens[j] for j in range(index, index + n))
      n_grams.append(temp)
  if n == 2 and unknown_tok is not None:
    # Handle bigrams with unknown token replacement
    for index in range(length-n+1):
      token1 = tokens[index]
      token2 = tokens[index + 1]

      # Replace unknown tokens with a specified value
      if token1 in unknown_tok:
          token1 = 'UNK'
      if token2 in unknown_tok:
          token2 = 'UNK'

      bigram = (token1, token2)
      n_grams.append(bigram)
  return n_grams

In [4]:
# get unigram frequncies
def getUnigramFreq(uni_gram_data, cap):
    unigram_token_freq = {}
    unigram_token_freq['UNK'] = 0
    unknown_set = []
    for tokens in uni_gram_data:
      for token in tokens:
        if token in unigram_token_freq:
          unigram_token_freq[token] += 1
        else:
          unigram_token_freq[token] = 1

    for token, freq in unigram_token_freq.items():
      if freq <= cap:
        unknown_set.append(token)
        unigram_token_freq['UNK'] += freq
    return unigram_token_freq, unknown_set

In [5]:
# get bi gram frequencies
def getBigramFreq(bi_gram_data):
  bigram_token_freq = {}
  for token in bi_gram_data:
    if token in bigram_token_freq:
      bigram_token_freq[token] += 1
    else:
      bigram_token_freq[token] = 1
  return bigram_token_freq

In [6]:
# Calculate 'N' except start token
def getLength(uni_gram_freq = {}):
  len = 0
  for token,count in uni_gram_freq.items():
    if token == '<s>':
      continue
    len += count
  return len

In [7]:
# get probabilities with or without smoothing
def getProbabilities(n, k, vocabSize, isSmoothing, uni_gram_freq = {}, bi_gram_freq = {}):
    token_prob = {}
    if not isSmoothing:
      if n==1:
        for token,count in uni_gram_freq.items():
          token_prob[token] = count/vocabSize
        return token_prob
      elif n==2:
        for token,count in bi_gram_freq.items():
          token_prob[token] = count/uni_gram_freq[token[0]]
        return token_prob
    else:
      if n == 1:
        for token,count in uni_gram_freq.items():
          token_prob[token] = ((count + k)/ (getLength(uni_gram_freq) + (k*vocabSize)))
        return token_prob
      if n == 2:
        for token,count in bi_gram_freq.items():
          token_prob[token] = ((count + k)/ (uni_gram_freq[token[0]] + (k*vocabSize)))
        return token_prob

In [8]:
# get unigram probability of test data
def getTestUnigramProb(test_data, train_prob):
  test_uni_prob = {}
  for tokens in test_data:
    for token in tokens:
      if token in train_prob:
        test_uni_prob[token] = train_prob[token]
      else:
        test_uni_prob[token] = train_prob['UNK']
  return test_uni_prob

In [9]:
# get bigram probability of test data
def getTestBiGramProb(test_data, train_prob, unknown_tok):
  test_bi_prob = {}
  for tokens in test_data:
    token1, token2 = tokens
    token_1 = token1
    token_2 = token2
    if token1 in unknown_tok:
      token_1 = 'UNK'
    if token2 in unknown_tok:
      token_2 = 'UNK'
    # Check if the bigram exists in train_bigram_prob, otherwise use 'UNK' probability
    if (token_1, token_2) in train_prob:
        test_bi_prob[(token1, token2)] = train_prob[(token_1, token_2)]
    else:
        test_bi_prob[(token1, token2)] = train_prob.get(('UNK', 'UNK'), 0.0)
  return test_bi_prob

In [10]:
# calcalutes perplexity
def calcPerplexity(prob, n):
  logsum = 0
  for val in prob.values():
    logsum += np.log(val)
  return np.exp(-logsum/n)

In [11]:
train_file_path = "./train.txt"
training_data = preprocessing(train_file_path)
val_file_path = "./val.txt"
val_data = preprocessing(val_file_path)

In [12]:
train_uni_gram = get_N_Grams(training_data, 1)
unigram_token_freq, unknown_tokens = getUnigramFreq(train_uni_gram,1)

In [13]:
train_bi_gram = get_N_Grams(training_data, 2, unknown_tokens)
bigram_token_freq = getBigramFreq(train_bi_gram)

In [14]:
train_unigram_prob = getProbabilities(1,0,len(train_uni_gram), False, unigram_token_freq, {})
train_bigram_prob = getProbabilities(2,0,len(train_uni_gram), False, unigram_token_freq, bigram_token_freq)

In [15]:
test_uni_grams = get_N_Grams(val_data, 1)
test_bi_grams = get_N_Grams(val_data, 2, unknown_tokens)

In [16]:
test_uni_prob = getTestUnigramProb(test_uni_grams, train_unigram_prob)

In [17]:
test_bi_prob = getTestBiGramProb(test_bi_grams,train_bigram_prob, unknown_tokens)

In [18]:
test_unigram_perplexity = calcPerplexity(test_uni_prob, len(test_uni_prob))
print("Validation Set Perplexity:")
print(f'The perplexity of unigram with unknown word handling is {test_unigram_perplexity}')
test_bigram_perplexity = calcPerplexity(test_bi_prob, len(test_bi_prob))
print(f'The perplexity of bigram with unknown word handling is {test_bigram_perplexity}')

Validation Set Perplexity:
The perplexity of unigram with unknown word handling is 3412.237774835694
The perplexity of bigram with unknown word handling is 30.273961210567073


In [19]:
# laplace smoothing - 1
train_bigram_laplace_prob = getProbabilities(2,1,len(train_uni_gram), True, unigram_token_freq, bigram_token_freq)
train_unigram_laplace_prob = getProbabilities(1,1,len(train_uni_gram), True, unigram_token_freq, bigram_token_freq)

In [20]:
test_uni_prob_Laplace = getTestUnigramProb(test_uni_grams, train_unigram_laplace_prob)

In [21]:
test_bi_prob_Laplace = getTestBiGramProb(test_bi_grams, train_bigram_laplace_prob, unknown_tokens)

In [22]:
test_unigram_perplexity_laplace = calcPerplexity(test_uni_prob_Laplace, len(test_uni_prob_Laplace))
test_bigram_perplexity_laplace = calcPerplexity(test_bi_prob_Laplace, len(test_bi_prob_Laplace))
print("Validation Set Perplexity:")
print(f'The perplexity of unigram with laplace smoothing is {test_unigram_perplexity_laplace}')
print(f'The perplexity of bigram with laplace smoothing is {test_bigram_perplexity_laplace}')

Validation Set Perplexity:
The perplexity of unigram with laplace smoothing is 5875.292572274528
The perplexity of bigram with laplace smoothing is 2922.721304257471


In [23]:
# K-smoothing
k_Smooth = [0.01, 0.05, 0.1, 0.5, 0.75]
k_Smoothing_bigram = []
for k in range(len(k_Smooth)):
  k_Smoothing_bigram.append(getProbabilities(2,k_Smooth[k],len(train_uni_gram), True, unigram_token_freq, bigram_token_freq))

test_bigram_prob_ksmooth = []
for ksm in range(len(k_Smoothing_bigram)):
  test_bigram_prob_ksmooth.append(getTestBiGramProb(test_bi_grams, k_Smoothing_bigram[ksm], unknown_tokens))
print("Validation Set Perplexity:")
for ksm in range(len(test_bigram_prob_ksmooth)):
  test_k_smooth_perplexity = None
  test_k_smooth_perplexity = calcPerplexity(test_bigram_prob_ksmooth[ksm],len(test_bigram_prob_ksmooth[ksm]))
  print(f'The perplexity of bigram with k-smoothing of k = {k_Smooth[ksm]} is {test_k_smooth_perplexity}')

Validation Set Perplexity:
The perplexity of bigram with k-smoothing of k = 0.01 is 90.05990983589889
The perplexity of bigram with k-smoothing of k = 0.05 is 234.85621316673965
The perplexity of bigram with k-smoothing of k = 0.1 is 401.47874343840914
The perplexity of bigram with k-smoothing of k = 0.5 is 1609.247564015122
The perplexity of bigram with k-smoothing of k = 0.75 is 2288.062284588195


In [24]:
print("Training Set Perplexity:")
train_unigram_perplexity = calcPerplexity(train_unigram_prob, len(train_unigram_prob))
print(f'The perplexity of unigram with unknown word handling is {train_unigram_perplexity}')
train_bigram_perplexity = calcPerplexity(train_bigram_prob, len(train_bigram_prob))
print(f'The perplexity of bigram with unknown word handling is {train_bigram_perplexity}')

Training Set Perplexity:
The perplexity of unigram with unknown word handling is 30040.634286141903
The perplexity of bigram with unknown word handling is 50.42475001455599
