1. Parse the documents to extract the data in the XML's <raw> tag

In [33]:
import os
from collections import defaultdict
from KafNafParserPy import KafNafParser

# Directory containing the NAF files
directory = './WES-Dataset/docs/'

# Maps each NAF filename to the text content of its <raw> element.
# NOTE(review): the nested-defaultdict factory is kept from the original so that
# lookups of unknown filenames keep behaving as before, although every value
# stored below is whatever get_raw() returns rather than a dict of lists.
documents = defaultdict(lambda: defaultdict(list))

# Visit the files in sorted order: os.listdir() returns entries in arbitrary
# order, and a stable order keeps `documents` reproducible between runs.
for filename in sorted(os.listdir(directory)):
    # Skip anything that is not a NAF file
    if not filename.endswith('.naf'):
        continue

    # Full path to the NAF file
    filepath = os.path.join(directory, filename)

    # Open in binary mode so the XML parser decodes the bytes using the
    # encoding declared in the document itself, instead of relying on the
    # platform's default text encoding.
    with open(filepath, 'rb') as file:
        naf_obj = KafNafParser(file)

        # Find the <raw> tag and extract its text content
        documents[filename] = naf_obj.get_raw()



2. Tokenise the documents’ content

In [34]:
import nltk
# Make sure the Punkt tokenizer models used by word_tokenize are available.
nltk.download('punkt')

# Replace each document's raw text with its list of word tokens.
for doc, raw_data in documents.items():
    # Only tokenise raw strings. This skips documents with no <raw> content
    # (None) and, importantly, documents that were already tokenised by a
    # previous run of this cell, so re-running the cell no longer raises.
    if isinstance(raw_data, str):
        documents[doc] = nltk.word_tokenize(raw_data)

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/malcolmborg/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


3. Perform case-folding, stop-word removal and stemming 

In [35]:
# Fetch NLTK's stop-word corpus, then bind the English stop words to the
# module-level name `stopwords`, which the token-filtering loop in this cell reads.
nltk.download('stopwords')
stopwords = nltk.corpus.stopwords.words('english')

def stem(token):
    """Strip one common English inflectional suffix from ``token``.

    The rules are tried in this order and only the first match is applied:

    * words ending in ``ss`` are returned unchanged (e.g. ``class``);
    * a trailing ``s`` is removed (``cats`` -> ``cat``, ``boxes`` -> ``boxe``);
    * a trailing ``ed`` is removed (``jumped`` -> ``jump``);
    * a trailing ``ing`` is removed (``running`` -> ``runn``).

    A suffix is never stripped when doing so would leave an empty string, so
    tokens such as ``s``, ``ed`` or ``ing`` are returned as they are.

    Args:
        token: The (already lower-cased) word to stem.

    Returns:
        The stemmed token, or ``token`` itself when no rule applies.
    """
    # Keep double-s endings intact: the final "s" there is not a plural marker.
    if token.endswith('ss'):
        return token

    # A separate "es" rule is unnecessary: every such word also ends in "s".
    for suffix in ('s', 'ed', 'ing'):
        if token.endswith(suffix):
            stripped = token[:-len(suffix)]
            # Fall back to the original token rather than returning "".
            return stripped or token

    return token
# Build the lookup set once: membership tests against the stop-word list would
# otherwise be a linear scan for every single token.
stopword_set = set(stopwords)

# Case-fold, drop stop words and stem every document's tokens.
# NOTE: this overwrites `documents` in place, so running the cell again will
# stem tokens that have already been stemmed.
for doc, tokens in documents.items():
    # Documents with no content (None or an empty list) are left untouched.
    if not tokens:
        continue

    # Lower-case each token exactly once, then filter and stem.
    lowered_tokens = (token.lower() for token in tokens)
    documents[doc] = [
        stem(lowered) for lowered in lowered_tokens if lowered not in stopword_set
    ]

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/malcolmborg/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


4. Build the term-by-document matrix containing the TF.IDF weight
for each term within each document

In [36]:

import math

# term_frequencies[doc][term]  -> number of times `term` occurs in `doc`
# document_frequencies[term]   -> number of documents that contain `term`
term_frequencies = defaultdict(lambda: defaultdict(int))
document_frequencies = defaultdict(int)

for doc, tokens in documents.items():
    # Skip documents without any tokens (None or an empty list); iterating
    # over None would raise a TypeError.
    if not tokens:
        continue

    for token in tokens:
        term_frequencies[doc][token] += 1

    # Count each term once per document. Incrementing inside the token loop
    # above would count every occurrence, which yields the collection
    # frequency and can make the IDF below negative.
    for token in term_frequencies[doc]:
        document_frequencies[token] += 1

# Total number of documents in the collection (the N in log(N / df)).
total_documents = len(documents)

# tfidf_matrix[doc][term] -> tf * log(N / df)
tfidf_matrix = defaultdict(lambda: defaultdict(float))
for document, term_counts in term_frequencies.items():
    for token, tf in term_counts.items():
        # df >= 1 for every term seen above, so the division is always safe.
        df = document_frequencies[token]
        tfidf_matrix[document][token] = tf * math.log(total_documents / df)
