In [111]:
# Imports
import glob
import os
import pickle
import string
import unicodedata

# Third-party imports
import nltk
import nltk.collocations
import pandas

from segmenters import *

In [3]:
# Setup default path for documents
# Root directory that is scanned recursively for input documents.
document_path = "/data/workspace/lexpredict-contraxsuite-samples/"
# NOTE(review): document_type is not referenced by any cell visible in this
# notebook export -- confirm whether it is still needed.
document_type = "*"

In [6]:
def build_file_list(path, extension=None):
    """Recursively collect the paths of all files under ``path``.

    :param path: directory to scan; sub-directories are descended into.
    :param extension: optional file-name suffix (e.g. ".txt"), compared
        case-insensitively. When None or empty, every file is returned.
    :return: list of file paths, each built by joining ``path`` with the
        entry names found below it.
    """
    suffix = extension.lower() if extension else None
    file_list = []
    for file_name in os.listdir(path):
        full_path = os.path.join(path, file_name)
        if os.path.isdir(full_path):
            # Pass the filter down so files in nested directories are filtered too
            file_list.extend(build_file_list(full_path, extension))
        elif os.path.isfile(full_path):
            # Keep the file unless a filter is set and the name does not match it
            if suffix is None or file_name.lower().endswith(suffix):
                file_list.append(full_path)
    return file_list

In [106]:
# File list
# Recursively collect every file under document_path (no extension filter).
file_list = build_file_list(document_path)

In [107]:
# Load document data
# Build one flat, lower-cased stream of alphanumeric tokens across all
# readable documents, in file_list order.
document_data = []
skipped_files = []  # files that could not be decoded as UTF-8
for file_name in file_list:
    try:
        with open(file_name, "r", encoding="utf-8") as file_handle:
            file_buffer = file_handle.read()
    except UnicodeDecodeError:
        # Best-effort load: leave the file out, but record it instead of
        # dropping it silently.
        skipped_files.append(file_name)
        continue

    # Tokenize sentence by sentence; keep only purely alphanumeric tokens.
    for sentence in get_sentences(file_buffer):
        document_data.extend(
            t.lower() for t in nltk.word_tokenize(sentence) if t.isalnum()
        )

print("Loaded {0} tokens; skipped {1} undecodable files".format(
    len(document_data), len(skipped_files)))

In [140]:
# Get raw token counts
# Series indexed by token holding the number of occurrences in document_data;
# value_counts() orders it from most to least frequent.
token_counts = pandas.Series(document_data).value_counts()

In [146]:
# Show the first 100 tokens of token_counts as a plain list
top_tokens = token_counts.index[:100].tolist()
print(top_tokens)

['the', 'of', 'or', 'to', 'and', 'any', 'in', 'a', 'such', 'shall', 'by', 'be', 'as', 'for', 'with', 'borrower', 'agreement', 'other', 'this', 'all', 'that', 'is', 'on', 'section', 'not', 'lender', 'its', 'which', 'under', 'date', 'agent', 'each', 'at', 'company', 'may', 'loan', 'time', 'bank', 'from', 'an', 'have', 'if', 'interest', 'amount', 's', 'credit', 'contractor', 'will', 'b', 'are', 'no', 'rate', 'owner', 'payment', 'provided', 'has', 'respect', 'period', 'obligations', 'applicable', 'party', 'notice', 'documents', 'i', 'including', 'it', 'than', 'business', 'upon', 'event', 'means', 'made', 'required', 'person', 'pursuant', 'ii', 'executive', 'after', 'administrative', 'c', 'without', 'thereof', 'day', 'page', 'otherwise', 'hereunder', 'loans', 'default', 'been', 'terms', 'accordance', 'property', 'work', 'days', 'effect', 'construction', 'set', 'law', 'material', 'forth']


In [152]:
# Build the stopword set: NLTK's English list plus extra high-frequency words
extra_stopwords = [
    'the', 'of', 'or', 'to', 'and', 'any', 'in', 'a', 'such', 'shall', 'by', 'be', 'as', 'for',
    'with', 'other', 'this', 'all', 'that', 'is', 'on', 'not', 'its', 'which', 'under', 'each',
    'at', 'may', 'from', 'an', 'have', 'if', 's', 'will', 'b', 'are', 'no', 'has',
    'i', 'including', 'it', 'than', 'upon', 'after', 'without', 'thereof', 'otherwise', 'hereunder',
    'been', 'forth',
]
stopwords = set(nltk.corpus.stopwords.words("english")).union(extra_stopwords)
print("Total stopwords: {0}".format(len(stopwords)))

# Persist the stopword set so other code can load it
with open("stopwords.pickle", "wb") as out_file:
    pickle.dump(stopwords, out_file)

Total stopwords: 163


In [153]:
# Setup bigram finder
# bigram_measures supplies the scoring functions (PMI is used when ranking);
# the finder is built from the flat token stream in document_data.
bigram_measures = nltk.collocations.BigramAssocMeasures()
finder = nltk.collocations.BigramCollocationFinder.from_words(document_data)

In [154]:
top_n_list = [100, 1000, 10000]

# Drop bigrams whose count is below 0.1% of the total token count. The filter
# mutates `finder` in place and does not depend on n, so apply it once.
finder.apply_freq_filter(0.001 * len(document_data))

# Rank by PMI once for the largest n; nbest() returns the head of a single
# score-ordered list, so each smaller top-n list is a prefix of this one.
ranked_bigrams = list(finder.nbest(bigram_measures.pmi, max(top_n_list)))

for n in top_n_list:
    bigram_collocations = ranked_bigrams[:n]

    # Save the top-n bigram collocations
    with open("collocation_bigrams_{0}.pickle".format(n), "wb") as out_file:
        pickle.dump(bigram_collocations, out_file)

In [155]:
# Setup trigram finder
# NOTE: this rebinds `finder`, replacing the bigram finder created earlier.
trigram_measures = nltk.collocations.TrigramAssocMeasures()
finder = nltk.collocations.TrigramCollocationFinder.from_words(document_data)

In [156]:
top_n_list = [100, 1000, 10000]

# Drop trigrams whose count is below 0.01% of the total token count. The
# filter mutates `finder` in place and does not depend on n, so apply it once.
finder.apply_freq_filter(0.0001 * len(document_data))

# Rank by PMI once for the largest n; nbest() returns the head of a single
# score-ordered list, so each smaller top-n list is a prefix of this one.
ranked_trigrams = list(finder.nbest(trigram_measures.pmi, max(top_n_list)))

for n in top_n_list:
    trigram_collocations = ranked_trigrams[:n]

    # Save the top-n trigram collocations
    with open("collocation_trigrams_{0}.pickle".format(n), "wb") as out_file:
        pickle.dump(trigram_collocations, out_file)