In [43]:
# import required module
import os
import xml.etree.ElementTree as ET
import spacy
import nltk
import time
import random


#if needed
#nltk.download('stopwords')

from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

import pandas as pd
import numpy as np

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer

In [3]:
# parse the XML document
tree = ET.parse(r"British baby corpus\fic\AB9.xml")
root = tree.getroot()
docsFiles = []

# find the <wtext> element
wtext_element = root.find('.//wtext')

# iterate over the <s> elements and extract the text
for s_element in wtext_element.iter('s'):
    sentence_text = ''
    for w_element in s_element.iter('w'):
        sentence_text += w_element.text + ' '
    #print(sentence_text.strip())
    docsFiles.append(sentence_text.strip())


In [4]:
print(docsFiles[0])

Detective  Chief  Inspector  John  McLeish  gazed  doubtfully  at  the  plate  before  him


In [5]:
#Tokenisation
# Splitting each element(each element represent a file) into multiple elements
token =[]
for x in docsFiles:
    token.append(x.split())
    
#print(token)

In [6]:
#Performing casefold

casefold = []
temp = []

#iterating throught the 2d array and converting each token in a lower case
for x in range(len(token)):
    for t in token[x]:
        temp.append(t.casefold())
        
    casefold.append(temp)
    temp = []

print(casefold[0])

['detective', 'chief', 'inspector', 'john', 'mcleish', 'gazed', 'doubtfully', 'at', 'the', 'plate', 'before', 'him']


In [7]:
#Performing stop words removal

stop_words = set(stopwords.words('english'))


stopWordRemoval = []
temp2 = []
 
# iterating through the lower case tokens and removing unimportant words (that are generated and stored in stop_words) 
#like 'a', 'the', etc...
for x in range(len(casefold)):
    for w in casefold[x]:
        if w not in stop_words:
            temp2.append(w)
    
    stopWordRemoval.append(temp2)
    temp2 = []
        
print(stopWordRemoval[0])

['detective', 'chief', 'inspector', 'john', 'mcleish', 'gazed', 'doubtfully', 'plate']


In [8]:
#Performing stemming
ps = PorterStemmer()
stemming = []
temp3 = []
    
# iterating through the stopWordRemoval and editing the word into a more general manner for example 'running' to 'run'    
for x in range(len(stopWordRemoval)):
    for w in stopWordRemoval[x]:
        temp3.append(ps.stem(w))
    
    stemming.append(temp3)
    temp3 = []
    
print(stemming[0])

['detect', 'chief', 'inspector', 'john', 'mcleish', 'gaze', 'doubt', 'plate']


In [29]:
# hard coding the ngram#
def build_ngram_counts(text, n):
    ngram_counts = {}
    words = text.split()
    for i in range(len(words)-n+1):
        ngram = ' '.join(words[i:i+n])
        if ngram in ngram_counts:
            ngram_counts[ngram] += 1
        else:
            ngram_counts[ngram] = 1
    return ngram_counts

#def build_ngram_counts(text, n):
   # ngram_counts = {}
   # words = text.split()
  #  for i in range(len(words)-n+1):
 #       ngram = tuple(words[i:i+n])
#       # if n in ngram_counts:
      #      if ngram in ngram_counts[n]:
     #           ngram_counts[n][ngram] += 1
    #        else:
   #             ngram_counts[n][ngram] = 1
  #      else:
 #           ngram_counts[n] = {ngram: 1}
#    return ngram_counts



In [124]:
start_time = time.time()

my_str = ' '.join([' '.join(lst) for lst in stemming])
#print(my_str)

#nGram = build_ngram_counts(my_str, 3)

ngram_counts = {1: build_ngram_counts(my_str, 1), 2: build_ngram_counts(my_str, 2), 3: build_ngram_counts(my_str, 3)}
print(ngram_counts)

end_time = time.time()

print("Time taken(Hard coding): ", end_time - start_time, "seconds")

{1: {'detect': 10, 'chief': 18, 'inspector': 15, 'john': 47, 'mcleish': 302, 'gaze': 11, 'doubt': 5, 'plate': 6, 'thought': 79, 'hungri': 1, 'realiz': 29, 'actual': 15, 'need': 48, 'anyth': 21, 'rather': 32, 'overflow': 1, 'cholesterol': 1, 'canteen': 3, 'new': 33, 'scotland': 14, 'yard': 26, 'provid': 5, 'admir': 8, 'prompt': 2, 'sleep': 9, 'would': 171, 'perhap': 18, 'make': 49, 'sens': 9, 'thirty-six': 4, 'hour': 28, 'straight': 14, 'duti': 6, 'much': 60, 'spent': 10, 'sullen': 2, 'jamaican': 1, 'kill': 17, 'landladi': 1, 'three': 30, 'children': 18, 'crowd': 2, 'kitchen': 10, 'hous': 24, 'behind': 15, 'westway': 2, 'took': 35, 'experiment': 1, 'mouth': 13, 'fri': 2, 'egg': 3, 'wait': 41, 'see': 78, 'go': 150, 'suit': 18, 'progress': 1, 'bake': 2, 'bean': 3, 'cautious': 8, 'finish': 12, 'one': 134, 'sausag': 1, 'decid': 63, 'bread': 1, 'tempt': 1, 'fate': 1, 'push': 16, 'asid': 5, 'reach': 16, 'cup': 15, 'tea': 18, 'rest': 13, 'elbow': 5, 'tabl': 19, 'lift': 12, 'hand': 39, 'weari':

In [13]:
from nltk.util import ngrams
from collections import Counter

# coding n-gram with libraries
start_time = time.time()
my_str = ' '.join([' '.join(lst) for lst in stemming])

# Split text into tokens
tokens = my_str.split()

# Build n-grams using NLTK
unigrams = list(ngrams(tokens, 1))
bigrams = list(ngrams(tokens, 2))
trigrams = list(ngrams(tokens, 3))

# Count the frequency of each n-gram
unigram_freq = dict(Counter(unigrams))
bigram_freq = dict(Counter(bigrams))
trigram_freq = dict(Counter(trigrams))

# Print the frequency counts
#print("Unigram frequency:")
#print(unigram_freq)
#print("Bigram frequency:")
#print(bigram_freq)
#print("Trigram frequency:")
#print(trigram_freq)
end_time = time.time()
print("Time taken(With libraries): ", end_time - start_time, "seconds")

Time taken(With libraries):  0.010853052139282227 seconds


In [95]:
class VanillaLanguageModel:
    def __init__(self, ngram_counts, alpha=1):
        self.alpha = alpha
        self.probabilities = {}
        # unigram model
        self.probabilities[1] = {}
        for word, count in ngram_counts[1].items():
            self.probabilities[1][word] = (count + alpha) / (sum(ngram_counts[1].values()) + alpha * len(ngram_counts[1]))
        # bigram model
        self.probabilities[2] = {}
        for pair, count in ngram_counts[2].items():
            if len(pair) != 2:
                continue
            prefix, word = pair
            if prefix not in self.probabilities[2]:
                self.probabilities[2][prefix] = {}
            self.probabilities[2][prefix][word] = (count + alpha) / (sum([count for (prefix, word), count in ngram_counts[2].items() if prefix == pair[0]]) + alpha * len(ngram_counts[1]))
        # trigram model
        self.probabilities[3] = {}
        for trigram, count in ngram_counts[3].items():
            if len(trigram) != 3:
                continue
            prefix1, prefix2, word = trigram
            if (prefix1, prefix2) not in self.probabilities[3]:
                self.probabilities[3][(prefix1, prefix2)] = {}
            self.probabilities[3][(prefix1, prefix2)][word] = (count + alpha) / (sum([count for (prefix1, prefix2, word), count in ngram_counts[3].items() if (prefix1, prefix2) == trigram[:2]]) + alpha * len(ngram_counts[1]))
    
    def predict(self, context):
        context = context.split()
        if len(context) == 0:
            return self.probabilities[1]
        elif len(context) == 1:
            if context[0] in self.probabilities[2]:
                return self.probabilities[2][context[0]]
            else:
                return self.probabilities[1]
        else:
            if (context[-2], context[-1]) in self.probabilities[3]:
                return self.probabilities[3][(context[-2], context[-1])]
            elif context[-1] in self.probabilities[2]:
                return self.probabilities[2][context[-1]]
            else:
                return self.probabilities[1]
            
 	def generate_word(self, context):
        """
        Generate a word given a context.

        :param context: a string representing the context
        :return: a string representing the predicted word
        """
        # split the context into individual words
        context_words = context.split()
        # determine the length of the context
        context_length = len(context_words)

        # use the appropriate model based on the length of the context
        if context_length == 0:
            model = self.probabilities[1]
        elif context_length == 1:
            model = self.probabilities[2].get(context_words[-1], self.probabilities[1])
        else:
            model = self.probabilities[3].get((context_words[-2], context_words[-1]), self.probabilities[2].get(context_words[-1], self.probabilities[1]))

        # generate the predicted word with the highest probability
        predicted_word = max(model, key=model.get)

        return predicted_word



TabError: inconsistent use of tabs and spaces in indentation (1174059510.py, line 45)

In [99]:
class VanillaLanguageModel:
    def __init__(self, ngram_counts, alpha=1):
        self.probabilities = {}
        self.alpha = alpha
        
        # unigram model
        self.probabilities[1] = {}
        for word, count in ngram_counts[1].items():
            self.probabilities[1][word] = (count + alpha) / (sum(ngram_counts[1].values()) + alpha * len(ngram_counts[1]))

        # bigram model
        self.probabilities[2] = {}
        for prefix, count in ngram_counts[2].items():
            if prefix[0] not in self.probabilities[2]:
                self.probabilities[2][prefix[0]] = {}
            self.probabilities[2][prefix[0]][prefix[1]] = (count + alpha) / (sum([c for p, c in ngram_counts[2].items() if p[0] == prefix[0]]) + alpha * len(ngram_counts[1]))

        # trigram model
        self.probabilities[3] = {}
        for prefix, count in ngram_counts[3].items():
            if (prefix[0], prefix[1]) not in self.probabilities[3]:
                self.probabilities[3][(prefix[0], prefix[1])] = {}
            self.probabilities[3][(prefix[0], prefix[1])][prefix[2]] = (count + alpha) / (sum([c for p, c in ngram_counts[3].items() if p[0] == prefix[0] and p[1] == prefix[1]]) + alpha * len(ngram_counts[1]))

    def predict_next_word(self, context):
        words = context.split()
        if len(words) == 0:
            return None
        elif len(words) == 1:
            return max(self.probabilities[1], key=self.probabilities[1].get)
        elif len(words) == 2:
            prefix = (words[0], words[1])
            if prefix not in self.probabilities[2]:
                return max(self.probabilities[1], key=self.probabilities[1].get)
            return max(self.probabilities[2][prefix], key=self.probabilities[2][prefix].get)
        else:
            prefix = (words[-2], words[-1])
            if prefix not in self.probabilities[3]:
                prefix = (words[-1],)
                if prefix not in self.probabilities[2]:
                    return max(self.probabilities[1], key=self.probabilities[1].get)
                return max(self.probabilities[2][prefix], key=self.probabilities[2][prefix].get)
            return max(self.probabilities[3][prefix], key=self.probabilities[3][prefix].get)


In [123]:
# create an instance of the language model
lm = VanillaLanguageModel(ngram_counts)

# generate predictions for a given context
context = "the quick"
predictions = lm.predict_next_word(context)

print(predictions)


ValueError: too many values to unpack (expected 2)

In [125]:
class VanillaLanguageModel:
    def __init__(self, ngram_counts, alpha=1):
        self.probabilities = {}
        self.alpha = alpha
        
        # unigram model
        self.probabilities[1] = {}
        for word, count in ngram_counts[1].items():
            self.probabilities[1][word] = (count + alpha) / (sum(ngram_counts[1].values()) + alpha * len(ngram_counts[1]))

        # bigram model
        self.probabilities[2] = {}
        for prefix, count in ngram_counts[2].items():
            if prefix[0] not in self.probabilities[2]:
                self.probabilities[2][prefix[0]] = {}
            self.probabilities[2][prefix[0]][prefix[1]] = (count + alpha) / (sum([c for p, c in ngram_counts[2].items() if p[0] == prefix[0]]) + alpha * len(ngram_counts[1]))

        # trigram model
        self.probabilities[3] = {}
        for prefix, count in ngram_counts[3].items():
            if (prefix[0], prefix[1]) not in self.probabilities[3]:
                self.probabilities[3][(prefix[0], prefix[1])] = {}
            self.probabilities[3][(prefix[0], prefix[1])][prefix[2]] = (count + alpha) / (sum([c for p, c in ngram_counts[3].items() if p[0] == prefix[0] and p[1] == prefix[1]]) + alpha * len(ngram_counts[1]))

    def predict_next_word(self, context):
        words = context.split()
        if len(words) == 0:
            return None
        elif len(words) == 1:
            return max(self.probabilities[1], key=self.probabilities[1].get)
        elif len(words) == 2:
            prefix = (words[0], words[1])
            if prefix not in self.probabilities[2]:
                return max(self.probabilities[1], key=self.probabilities[1].get)
            return max(self.probabilities[2][prefix], key=self.probabilities[2][prefix].get)
        else:
            prefix = (words[-2], words[-1])
            if prefix not in self.probabilities[3]:
                prefix = (words[-1],)
                if prefix not in self.probabilities[2]:
                    return max(self.probabilities[1], key=self.probabilities[1].get)
                next_word = max(self.probabilities[2][prefix], key=self.probabilities[2][prefix].get)
                return ' '.join([words[-1], next_word])
            next_word = max(self.probabilities[3][prefix], key=self.probabilities[3][prefix].get)
            return ' '.join([words[-1], next_word])


In [127]:
# create an instance of the class
model = VanillaLanguageModel(ngram_counts)

# use the model to predict the next word
context = 'my name'
next_word = model.predict_next_word(context)
print(next_word)  # output: 'brown'

's
