In [1]:
# Standard imports
import glob
import os
import pickle
import sys
import unidecode

# Tika imports
import tika.parser

# NLTK imports
from nltk.tokenize.punkt import PunktSentenceTokenizer, PunktParameters, PunktTrainer

In [2]:
# Setup default path for documents
document_path = "/data/workspace/lexpredict-contraxsuite-samples/"
document_type = "txt"

In [3]:
def build_file_list(path, extension=None):
    file_list = []
    for file_name in os.listdir(path):
        if os.path.isdir(os.path.join(path, file_name)):
            file_list.extend(build_file_list(os.path.join(path, file_name)))
        elif os.path.isfile(os.path.join(path, file_name)):
            if extension and file_name.lower().endswith(extension.lower()):
                file_list.append(os.path.join(path, file_name))
            else:
                file_list.append(os.path.join(path, file_name))
    return file_list

In [4]:
# Iterate through file list            
file_list = build_file_list(document_path, "txt")

# Get random sample of N
M = 10
N = 100
from numpy.random import choice
file_list = choice(file_list, N)

# Setup tokenizer
punkt_trainer = PunktTrainer()

print("training on {0} files...".format(len(file_list)))
i = 0
for file_name in file_list:
    # Load document
    tika_response = tika.parser.from_file(os.path.join(document_path, file_name))
    try:
        tika_content = unidecode.unidecode(tika_response["content"])
    except KeyError as e:
        continue
    punkt_trainer.train(tika_content, verbose=False, finalize=False)
    
    # Add to text buffer
    if i % 10 == 0:
        print("training on: {0}".format(file_name))
        print("total abbreviations found: {0}".format(len(punkt_trainer._params.abbrev_types)))
    i += 1
    
punkt_trainer.train("This is the exemption section of the I.R.C.", verbose=False, finalize=False)
punkt_trainer.finalize_training(verbose=False)

training on 100 files...
training on: /data/workspace/lexpredict-contraxsuite-samples/agreements/construction/1040596_2001-11-14_CONSTRUCTION AGREEMENT DATED AUGUST 29, 2001.txt
total abbreviations found: 2
training on: /data/workspace/lexpredict-contraxsuite-samples/plans/retirement/819793_2008-01-02_SUPPLEMENTAL EXECUTIVE RETIREMENT PLAN.txt
total abbreviations found: 31
training on: /data/workspace/lexpredict-contraxsuite-samples/agreements/employment/700565_2010-05-03_EMPLOYMENT AGREEMENTS.txt
total abbreviations found: 45
training on: /data/workspace/lexpredict-contraxsuite-samples/agreements/credit/896778_2011-08-26_CREDIT AGREEMENT.txt
total abbreviations found: 52
training on: /data/workspace/lexpredict-contraxsuite-samples/agreements/severance/810130_1999-02-26_SEVERANCE AGREEMENT.txt
total abbreviations found: 59
training on: /data/workspace/lexpredict-contraxsuite-samples/agreements/software_license/1414043_2016-10-14_SOFTWARE LICENSE AGREEMENT DATED OCTOBER 7, 2016 BY AND B

In [5]:
# Create a tokenizer and test
punkt_params = punkt_trainer.get_params()

# Add required set
required_list = [
    # Entity types
    'Inc.', 'L.L.C.', 'Ltd.', 'L.P.', 'S.A.',
 'INC.', 'S.A. de C.V.', 'S. de R.L. de C.V.', 'S.L.', 'Inc.',
 'LTD.', 'B.V.', 'LLC.', 'Lda.', 'Ltda.', 'S.R.L.', 's.r.o.',
 'S.A.S.', 'S.A. DE C.V.', 'C.A.', 'Corp.', 'S.L.U.', 'S.A. De C.V.',
 'S. DE R.L. DE C.V.', 'L.L.P.','K.K.', 'C.V.', 'N.A.', 'Ltd.', 'S.r.l.',
 'S.A.R.L.', 'S. de R. L. de C.V.', 'S. De R.L. De C.V.', 'S.R.L. de C.V.', 'G.P.',
 'S.A.de C.V.', 'L.P.', 'N.V.', 'S de R.L. de C.V.', 'S.C.A.',
 'Sdn. Bhd.', 'S.R.O.', 'L.L.L.P.', 'S.de R.L. de C.V.', 'Pte. Ltd.',
 'S.A.U.', 'S.C.', 'S.a.r.l.', 'S. De R.L. de C.V.',
    # Court/regulatory abbreviations
    'U.S.', 'U.S.C.', 'I.R.S.', 'Treas.', 
    'Tex.', 'Bus.', 'Com.',
    
    # Honorifics
    'Mr', 'Ms', 'Mrs', 'Dr', 'Prof', 'Sr', 'Jr',
                ]

for abbrev in required_list:
    punkt_params.abbrev_types.add(abbrev.strip(".").lower())

tokenizer = PunktSentenceTokenizer(punkt_params)
print("total abbreviations: {0}".format(len(punkt_params.abbrev_types)))

total abbreviations: 125


In [10]:
tokenizer.tokenize("""Batman, Esq., J.D., M.B.A., is the author of 26 U.S.C. 501, i.e., the exemption section of the IRC.
Therefore, e.g., joke about INC. or LLC. entities run by Dr. Brown.
Did you know that the I.R.S. loves to fine non-U.S. Acme Ltda. at c.a. 10:00A.M.""")

['Batman, Esq., J.D., M.B.A., is the author of 26 U.S.C. 501, i.e., the exemption section of the IRC.',
 'Therefore, e.g., joke about INC. or LLC. entities run by Dr. Brown.',
 'Did you know that the I.R.S. loves to fine non-U.S. Acme Ltda. at c.a. 10:00A.M.']

In [7]:
tokenizer.tokenize("""Employee acknowledges that the Confidential Information provided to Employee pursuant to this Agreement, and Company’s need to protect its goodwill,
gives rise to Company’s interest in these restrictive covenants, and that any limitations as to time, geographic scope, and scope of activity to be restrained defined herein are reasonable and do not impose a greater restraint than is necessary
to protect the goodwill or other business interest of Company. The Employee further agrees that if, at some later date, a court of competent jurisdiction determines that these covenants do not meet the criteria set forth in Tex. Bus. & Com.
Code § 15.50(2), these agreements shall be reformed by the court, pursuant to Tex. Bus. & Com. Code § 15.51(c), by the least extent necessary to make them enforceable. Employee acknowledges and recognizes that the enforcement of
any of the provisions in this Agreement by Company will not interfere with the Employee’s ability to pursue a proper livelihood.""")

['Employee acknowledges that the Confidential Information provided to Employee pursuant to this Agreement, and Company’s need to protect its goodwill,\ngives rise to Company’s interest in these restrictive covenants, and that any limitations as to time, geographic scope, and scope of activity to be restrained defined herein are reasonable and do not impose a greater restraint than is necessary\nto protect the goodwill or other business interest of Company.',
 'The Employee further agrees that if, at some later date, a court of competent jurisdiction determines that these covenants do not meet the criteria set forth in Tex. Bus. & Com.\nCode § 15.50(2), these agreements shall be reformed by the court, pursuant to Tex. Bus. & Com. Code § 15.51(c), by the least extent necessary to make them enforceable.',
 'Employee acknowledges and recognizes that the enforcement of\nany of the provisions in this Agreement by Company will not interfere with the Employee’s ability to pursue a proper livel

In [8]:
import nltk
nltk.sent_tokenize("""Batman, Esq., J.D., M.B.A., is the author of 26 U.S.C. 501, i.e., the exemption section of the I.R.C.
Therefore, e.g., joke about INC. or LLC. entities.
Did you know that the I.R.S. loves to fine non-U.S. Acme Ltda. at c.a. 10:00A.M.""")

['Batman, Esq., J.D., M.B.A., is the author of 26 U.S.C.',
 '501, i.e., the exemption section of the I.R.C.',
 'Therefore, e.g., joke about INC. or LLC.',
 'entities.',
 'Did you know that the I.R.S.',
 'loves to fine non-U.S. Acme Ltda.',
 'at c.a.',
 '10:00A.M.']

In [9]:
# Save the tokenizer
with open("../../../lexnlp/nlp/en/sentence_segmenter.pickle", "wb") as out_file:
    pickle.dump(tokenizer, out_file)