In [81]:
# import required module
import os
import xml.etree.ElementTree as ET
import spacy
import nltk
import time

#if needed
#nltk.download('stopwords')

from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

import pandas as pd
import numpy as np

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer

In [31]:


# Parse one XML document of the corpus and collect its sentences.
# NOTE: the path is relative to the notebook's working directory and uses
# Windows-style separators.
tree = ET.parse(r"British baby corpus\fic\AB9.xml")
root = tree.getroot()

# Locate the <wtext> element that wraps the text of the document.
wtext_element = root.find('.//wtext')
if wtext_element is None:
    # find() returns None when nothing matches; stop here with a clear
    # message instead of an AttributeError on .iter() below.
    raise ValueError("No <wtext> element found in the parsed document")

# One string per <s> element: the text of its <w> descendants joined by a
# space. A <w> element without text has .text == None and is treated as ''
# so it cannot break the concatenation. A word's own text may already carry
# whitespace, so doubled spaces can appear inside a sentence; the
# tokenisation cell splits on any whitespace, so this is harmless.
docsFiles = [
    ' '.join((w_element.text or '') for w_element in s_element.iter('w')).strip()
    for s_element in wtext_element.iter('s')
]


In [32]:
# Show the first extracted sentence as a quick sanity check.
print(docsFiles[0])

Detective  Chief  Inspector  John  McLeish  gazed  doubtfully  at  the  plate  before  him


In [33]:
# Tokenisation
# Break every sentence string into its whitespace-separated words,
# giving one list of tokens per sentence.
token = [sentence.split() for sentence in docsFiles]

In [34]:
# Case folding

# Build a parallel 2-D list in which every token has been passed through
# str.casefold(), so later comparisons ignore letter case.
casefold = [[word.casefold() for word in sentence_tokens] for sentence_tokens in token]

print(casefold[0])

['detective', 'chief', 'inspector', 'john', 'mcleish', 'gazed', 'doubtfully', 'at', 'the', 'plate', 'before', 'him']


In [36]:
# Stop-word removal

# Load NLTK's English stop-word list. When the corpus is not installed,
# stopwords.words() raises LookupError; download it once and retry so the
# notebook also runs on a fresh environment.
try:
    stop_words = set(stopwords.words('english'))
except LookupError:
    nltk.download('stopwords', quiet=True)
    stop_words = set(stopwords.words('english'))

# Keep, for every sentence, only the case-folded tokens that are not in
# stop_words (words such as 'a' or 'the').
stopWordRemoval = [
    [word for word in sentence_tokens if word not in stop_words]
    for sentence_tokens in casefold
]

print(stopWordRemoval[0])

['detective', 'chief', 'inspector', 'john', 'mcleish', 'gazed', 'doubtfully', 'plate']


In [39]:
# Stemming
ps = PorterStemmer()

# Replace each remaining token with its Porter stem (e.g. 'gazed' -> 'gaze'),
# preserving the sentence-by-sentence list structure.
stemming = [[ps.stem(word) for word in sentence_tokens] for sentence_tokens in stopWordRemoval]

print(stemming[0])

['detect', 'chief', 'inspector', 'john', 'mcleish', 'gaze', 'doubt', 'plate']


In [78]:
def build_ngram_counts(text, n):
    """Count how often each word n-gram occurs in ``text``.

    The text is split on whitespace and every run of ``n`` consecutive
    words is joined with a single space to form the n-gram key.

    Parameters
    ----------
    text : str
        Whitespace-separated words.
    n : int
        Number of words per n-gram; must be at least 1.

    Returns
    -------
    dict
        Maps each n-gram string to its number of occurrences, with keys in
        order of first appearance. Empty when ``text`` contains fewer than
        ``n`` words.

    Raises
    ------
    ValueError
        If ``n`` is smaller than 1.
    """
    if n < 1:
        # With n <= 0 the sliding window below would yield empty or
        # truncated slices and count meaningless keys.
        raise ValueError("n must be at least 1, got {!r}".format(n))

    ngram_counts = {}
    words = text.split()
    for start in range(len(words) - n + 1):
        ngram = ' '.join(words[start:start + n])
        ngram_counts[ngram] = ngram_counts.get(ngram, 0) + 1
    return ngram_counts


In [95]:
# Time the unigram count over the whole document.
# perf_counter() is a monotonic, high-resolution clock intended for measuring
# intervals; the clock is stopped before printing so the reported time covers
# only the string building and counting, not the console output.
start_time = time.perf_counter()

# Flatten the stemmed sentences into one space-separated string, which is
# the input format build_ngram_counts expects.
# NOTE: because sentences are concatenated, n-grams with n > 1 would also
# span sentence boundaries.
my_str = ' '.join(' '.join(lst) for lst in stemming)

nGram = build_ngram_counts(my_str, 1)
end_time = time.perf_counter()

print(nGram)
print("Time taken: ", end_time - start_time, "seconds")

{'detect': 10, 'chief': 18, 'inspector': 15, 'john': 47, 'mcleish': 302, 'gaze': 11, 'doubt': 5, 'plate': 6, 'thought': 79, 'hungri': 1, 'realiz': 29, 'actual': 15, 'need': 48, 'anyth': 21, 'rather': 32, 'overflow': 1, 'cholesterol': 1, 'canteen': 3, 'new': 33, 'scotland': 14, 'yard': 26, 'provid': 5, 'admir': 8, 'prompt': 2, 'sleep': 9, 'would': 171, 'perhap': 18, 'make': 49, 'sens': 9, 'thirty-six': 4, 'hour': 28, 'straight': 14, 'duti': 6, 'much': 60, 'spent': 10, 'sullen': 2, 'jamaican': 1, 'kill': 17, 'landladi': 1, 'three': 30, 'children': 18, 'crowd': 2, 'kitchen': 10, 'hous': 24, 'behind': 15, 'westway': 2, 'took': 35, 'experiment': 1, 'mouth': 13, 'fri': 2, 'egg': 3, 'wait': 41, 'see': 78, 'go': 150, 'suit': 18, 'progress': 1, 'bake': 2, 'bean': 3, 'cautious': 8, 'finish': 12, 'one': 134, 'sausag': 1, 'decid': 63, 'bread': 1, 'tempt': 1, 'fate': 1, 'push': 16, 'asid': 5, 'reach': 16, 'cup': 15, 'tea': 18, 'rest': 13, 'elbow': 5, 'tabl': 19, 'lift': 12, 'hand': 39, 'weari': 3, 