# Utilities

In [1]:
from xml.dom import minidom
import os
import nltk
from gensim.parsing.porter import PorterStemmer
from gensim.parsing.preprocessing import remove_stopwords
from gensim import corpora
from gensim import models
from gensim import similarities
from smart_open import smart_open

In [2]:
def read_file(path):
    """ Read a naf file and return its doc_id, title and raw text.

    Args:
        path (str): path to naf file.

    Returns:
        dict: dictionary with the keys 'title', 'doc_id' and 'text'.
            'text' is an empty string when the 'raw' element has no content.

    Raises:
        IndexError: if no 'public', 'fileDesc' or 'raw' element is found.
        KeyError: if the 'publicId' or 'title' attribute is missing.

    """

    # Parses naf file from path
    my_doc = minidom.parse(path)

    # Extract elements from parsed naf file
    doc_id = my_doc.getElementsByTagName('public')[0].attributes['publicId'].value
    title = my_doc.getElementsByTagName('fileDesc')[0].attributes['title'].value
    raw = my_doc.getElementsByTagName('raw')[0]

    # The raw layer can be empty (no child node at all) or split over several
    # text / CDATA nodes, so every piece is joined instead of reading only
    # the first child.
    text = ''.join(
        node.data
        for node in raw.childNodes
        if node.nodeType in (node.TEXT_NODE, node.CDATA_SECTION_NODE)
    )

    # Returns dictionary
    return {
        'title': title,
        'doc_id': doc_id,
        'text': text
    }

In [3]:
""" Creates dictionary with retrieved file's information (doc_id, title, text)

Args:
    base_path (str): path to naf files.

Returns:
    doc_list: list with dictionary per naf file.

"""
# Directory that holds the raw naf files.
base_path = './data/docs-raw-texts'

# os.listdir returns entries in arbitrary order, so the names are sorted to
# give every document the same position in doc_list on each run. Only regular
# files are kept so that a stray sub-directory is never handed to the parser.
doc_files = sorted(
    name for name in os.listdir(base_path)
    if os.path.isfile(os.path.join(base_path, name))
)

# One dictionary (doc_id, title, text) per naf file, in doc_files order.
doc_list = [read_file(os.path.join(base_path, name)) for name in doc_files]

In [4]:
def process(p, tokenizer, text):
    """ Runs the standard pre-processing pipeline over a text.

    The text is lowercased, stripped of stopwords, stemmed and finally
    split into tokens, in that order.

    Args:
        p (gensim.parsing.porter.PorterStemmer): stemmer object.
        tokenizer (nltk.tokenize.regexp.RegexpTokenizer): tokenizer object.
        text (str): text to preprocess.

    Returns:
        list: tokens of the preprocessed text.

    """
    # Lowercase first, then drop the stopwords
    without_stopwords = remove_stopwords(text.lower())

    # TODO: Lemmatizer (no lemmatization step exists yet)

    # Stem what is left and split the result into tokens
    return tokenizer.tokenize(p.stem_sentence(without_stopwords))

In [5]:
""" Creates dictionary object and file from preprocessed text.

Args:
    doc_list (list): list with dictionary per naf file.

Returns:
    gensim.corpora.dictionary.Dictionary: complete dictionary.

"""

# Creates PorterStemmer instance.
p = PorterStemmer()

# Creates RegexpTokenizer (removes punctuation signs)
tokenizer = nltk.RegexpTokenizer(r'\w+')

docDict = []
for doc in doc_list:
    docDict.append(process(p, tokenizer, doc['title'] + doc['text']))
dictionary = corpora.Dictionary(docDict)
dictionary.save('vocab.dict')

In [6]:
""" Creates corpus efficiently based on doc_list

Args:
    doc_list (list): list with dictionary per naf file.
    
Returns: 
    (gensim.corpora.mmcorpus.MmCorpus): corpus file

"""

corpus = []
for doc in doc_list:
    corpus.append(dictionary.doc2bow(process(p, tokenizer, doc['title'] + doc['text'])))

# Serializes and saves dictionary file
corpora.MmCorpus.serialize("corpus.mm",corpus)