In [1]:
!pip install -U nltk
!pip install pytrec-eval-terrier

Collecting nltk
  Downloading nltk-3.7-py3-none-any.whl (1.5 MB)
Collecting regex>=2021.8.3
  Downloading regex-2022.3.2-cp38-cp38-win_amd64.whl (274 kB)
Installing collected packages: regex, nltk
  Attempting uninstall: regex
    Found existing installation: regex 2021.4.4
    Uninstalling regex-2021.4.4:
      Successfully uninstalled regex-2021.4.4
  Attempting uninstall: nltk
    Found existing installation: nltk 3.6.1
    Uninstalling nltk-3.6.1:
      Successfully uninstalled nltk-3.6.1
Successfully installed nltk-3.7 regex-2022.3.2


In [2]:
import nltk
from nltk.corpus import brown
from nltk.util import pad_sequence
from nltk.util import bigrams
from nltk.util import ngrams
from nltk.util import everygrams
from nltk.lm.preprocessing import pad_both_ends
from nltk.lm.preprocessing import flatten
from nltk.lm.preprocessing import padded_everygram_pipeline
from collections import Counter
from nltk.lm import MLE, Laplace, KneserNeyInterpolated
import pickle
from os import linesep
import json
import collections
import re
from collections import OrderedDict
from operator import itemgetter
import pytrec_eval


In [21]:
# Some interesting Corpus statistics
from nltk import word_tokenize, sent_tokenize 
from nltk.tokenize.treebank import TreebankWordDetokenizer

corpus = brown.words(categories='news')
reconstructedSentence = TreebankWordDetokenizer().detokenize(corpus)

sents = sent_tokenize(reconstructedSentence)
print("The number of sentences is", len(sents)) 

words = word_tokenize(reconstructedSentence)
print("The number of tokens is", len(words)) 

avg_tokens = round(len(words)/len(sents))
print("The average number of tokens per sentence is", avg_tokens) 

unique_tokens = set(words)
print("The number of unique tokens are", len(unique_tokens)) 


The number of sentences is 4144
The number of tokens is 101875
The average number of tokens per sentence is 25
The number of unique tokens are 13908


In [22]:
from collections import Counter
from nltk.lm import MLE, Laplace, KneserNeyInterpolated

# train model using Laplace smoothing
def train_model(n, corpus):

    train_, vocab_ = padded_everygram_pipeline(n, corpus)

    lm = Laplace(n)
    lm.fit(train_, vocab_)

    save_model(lm, n)
    return lm

In [23]:
# save model for future reference

def save_model(lm, n):
  with open('%s_gram_model.pkl' % n, 'wb') as fout:
    pickle.dump(lm, fout)

In [14]:
# clean and load birbeck corpus

def load_birbeck_corpus():

  birbeck_list = dict()
  f = open("C:\\Users\\vanam\\Downloads\\ota_20.500.12024_0643\\0643\\APPLING1DAT.643", "r")
  content = f.readlines()

  for line in content:
    if(line.startswith('$')):
        continue
    
    pair = line.strip().split()
    correct_word = pair[1].strip()
    sentence = ' '.join(pair[2:]).split('*')[0].strip()

    # print(sentence, " : ", correct_word)
    birbeck_list[sentence] = correct_word
  return birbeck_list
 

In [46]:
# score the probability of a word in the given context 

def score(n, vocabulary, birbeck_list):

  with open('%s_gram_model.pkl' % n, 'rb') as fin:
    lm = pickle.load(fin)

  n_prob = collections.defaultdict(dict)
  for line in birbeck_list.keys():
    for word in vocabulary:
        
      temp = line.split()[-(n-1):]
      if(len(temp) < n-1 ):
        for i in range(n-len(temp)-1):
            temp.insert(0,'<s>')

      score = lm.score(word, temp)
      n_prob[line][word] = score
    n_prob[line] = dict(sorted(n_prob[line].items(), key=lambda item: item[1], reverse = True)[:10])


  with open('score_%s_gram.json' % n, 'w') as fp:
    json.dump(n_prob, fp, indent=2)
  return n_prob

In [7]:
import re

# create dictionary from unique tokens in brown corpus
def clean_dict(dictionary):
  len(dictionary)
  vocab = []
  regexp = re.compile('^[^a-zA-Z]+')
  for word in dictionary:
    # if(word.startswith("[a-zA-Z]")):
    if ( regexp.search(word) or len(word) == 1 ):
      continue
    vocab.append(word.lower())

  return vocab

In [8]:
# Creating dictionaries to store top k results k = {1, 5, 10} respectively and write to files in system
from collections import OrderedDict
from operator import itemgetter


def sorted_dict_elements_evaluate(correct_dict, prob_n_gram, n):
  
    sort_dict_10 = collections.defaultdict(dict)
    sort_dict_5 = collections.defaultdict(dict)
    sort_dict_1 = collections.defaultdict(dict)

    for i in prob_n_gram:
        sort_dict_10[i] =  (OrderedDict(sorted( prob_n_gram[i].items(), key=itemgetter(1))[:10]))
        sort_dict_5[i] =  (OrderedDict(sorted( prob_n_gram[i].items(), key=itemgetter(1))[:5]))
        sort_dict_1[i] =  (OrderedDict(sorted( prob_n_gram[i].items(), key=itemgetter(1))[:1]))
    with open('sort_dict_%s.json' % n, 'w') as fp:
        json.dump(sort_dict_10, fp, indent=2)
        json.dump(sort_dict_5, fp, indent=2)
        json.dump(sort_dict_1, fp, indent=2)
        
        
    evaluate(correct_dict, sort_dict_10, sort_dict_5, sort_dict_1, n )


In [9]:
# create dictionary with correct results from birkbeck corpus
def calculate_correct_dict(birbeck_list):
    correct_dict = collections.defaultdict(dict)
    for item in birbeck_list.keys():
        correct_dict[item] = {birbeck_list[item] : 1}
    return correct_dict

In [28]:
# Calculating average using pytrec_eval 

import pytrec_eval

def evaluate(correct_dict, sort_dict_10, sort_dict_5, sort_dict_1, n ):
  
  results = collections.defaultdict(dict)

  for item in correct_dict:
      results[item] = {}
      
      if(list(correct_dict[item].keys())[0] in sort_dict_1[item].keys()):
          results[item][list(correct_dict[item])[0]] = 1
      
      for k in list(sort_dict_5[item].keys()):
          if( k not in results[item].keys()):
              results[item][k] = 1/5
              
      for k in list(sort_dict_10[item].keys()):
          if( k not in results[item].keys()):
              results[item][k] = 1/10  

  evaluator = pytrec_eval.RelevanceEvaluator(correct_dict, {'success'})
  res = evaluator.evaluate(results)
  
  print()
  print("\n********Results for {}-gram-model *********".format(n))
  for measure in sorted(list(res[list(res.keys())[0]].keys())):
        print('average', measure, ': ', pytrec_eval.compute_aggregated_measure(measure, [query_measures[measure] for query_measures in res.values()]))




In [18]:
# aggregate method to train and score  
def train_calculate(n, corpus, vocabulary, birbeck_list):
 
  train_model(n, corpus)
  prob_n_gram = score(n, vocabulary, birbeck_list)
 
  return prob_n_gram


In [None]:
nltk.download('brown')
nltk.download('punkt')

In [48]:
import nltk


tokens = brown.words(categories='news')
vocabulary = clean_dict(set(tokens))

birbeck_list = load_birbeck_corpus()
# print(birbeck_list)

corpus = brown.sents(categories='news')
correct_dict = calculate_correct_dict(birbeck_list)

# print(prob_3_gram)

n = 1
prob_n_gram = train_calculate(n, corpus, vocabulary, birbeck_list)
sorted_dict_elements_evaluate(correct_dict, prob_n_gram, n)

n = 2
prob_n_gram = train_calculate(n, corpus, vocabulary, birbeck_list)
sorted_dict_elements_evaluate(correct_dict, prob_n_gram, n)

n = 3
prob_n_gram = train_calculate(n, corpus, vocabulary, birbeck_list)
sorted_dict_elements_evaluate(correct_dict, prob_n_gram, n)

n = 5
prob_n_gram = train_calculate(n, corpus, vocabulary, birbeck_list)
sorted_dict_elements_evaluate(correct_dict, prob_n_gram, n)

n = 10
prob_n_gram = train_calculate(n, corpus, vocabulary, birbeck_list)
sorted_dict_elements_evaluate(correct_dict, prob_n_gram, n)






********Results for 1-gram-model *********
average success_1 :  0.0
average success_10 :  0.0
average success_5 :  0.0


********Results for 2-gram-model *********
average success_1 :  0.0
average success_10 :  0.007407407407407408
average success_5 :  0.0


********Results for 3-gram-model *********
average success_1 :  0.0
average success_10 :  0.022222222222222223
average success_5 :  0.0


********Results for 5-gram-model *********
average success_1 :  0.0
average success_10 :  0.007407407407407408
average success_5 :  0.0


********Results for 10-gram-model *********
average success_1 :  0.0
average success_10 :  0.007407407407407408
average success_5 :  0.0


In [57]:
print("Example of probability distribution for 10-gram model:\n")
print(list(prob_n_gram.keys())[10], " : ", prob_n_gram[list(prob_n_gram.keys())[10]] )

Example of probability distribution for 10-gram model:

I  :  {'had': 0.0002772579191793166, 'think': 0.0002772579191793166, "can't": 0.0002079434393844874, 'am': 0.0002079434393844874, 'never': 0.0002079434393844874, 'can': 0.0001386289595896583, 'hope': 0.0001386289595896583, 'cannot': 0.0001386289595896583, 'could': 0.0001386289595896583, 'told': 0.0001386289595896583}
