In [6]:
# Imports
import glob
import os
import string
import sys
import tarfile

# Path setup
sys.path.append("/data/workspace/lexpredict-contraxsuite-core/")

# Packages
from lexnlp.nlp.en.segments.sentences import get_sentence_list
from lexnlp.nlp.en.tokens import get_stems

import numpy
import pandas


# Gensim
import gensim.models.word2vec
import gensim.models.doc2vec

# Sklearn
import sklearn.ensemble
import sklearn.linear_model
import sklearn.metrics

In [14]:
def process_sentence(sentence):
    """Return the purely alphabetic stems of a single sentence.

    The sentence is passed to ``get_stems`` with ``stopword=True`` and
    ``lowercase=True`` (presumably stopword removal and lowercasing --
    confirm against the lexnlp documentation). Any stem for which
    ``isalpha()`` is false is dropped.

    :param sentence: text of one sentence
    :return: list of stems, in the order produced by ``get_stems``
    """
    alphabetic_stems = []
    for stem in get_stems(sentence, stopword=True, lowercase=True):
        if stem.isalpha():
            alphabetic_stems.append(stem)
    return alphabetic_stems

def process_document(document):
    """Flatten a document into one list of processed stems.

    The document is split with ``get_sentence_list`` and every sentence is
    run through ``process_sentence``; the per-sentence results are
    concatenated in sentence order.

    :param document: full text of one document
    :return: flat list of stems for the whole document
    """
    return [
        stem
        for sentence in get_sentence_list(document)
        for stem in process_sentence(sentence)
    ]

In [102]:
# Load the trained document-embedding model used to infer document vectors.
# The model is consumed through infer_vector(), which is part of the Doc2Vec
# API, so it is loaded through the Doc2Vec class rather than Word2Vec.
# Alternative model file: "../models/d2v_cbow_lease_size200_window10"
d2v_model = gensim.models.doc2vec.Doc2Vec.load("../models/d2v_all_size200_window20")

In [103]:
# Read the coding table; its "output_file_name" and "class" columns are used
# to look up the label of each archive member.
coding_df = pandas.read_csv(filepath_or_buffer="../all_coding.csv", low_memory=False)

In [None]:
# Training sample: infer one document vector per coded agreement found in the
# first 10000 members of the corpus archive.
doc_vectors = []
doc_targets = []
# NOTE(review): min_stem_count is assigned but never applied in this cell;
# kept as-is in case other cells read it.
min_stem_count = 10

# Corpus archive holding the agreement text files
file_name = "/data/workspace/lexpredict-contraxsuite-core/test_data/agreements-text.tar.gz"

# Build the file-name -> class lookup once instead of scanning coding_df for
# every archive member. keep="first" selects the same row the previous
# per-member `.values[0]` lookup selected when a file name is coded twice.
class_by_file_name = (
    coding_df.drop_duplicates(subset="output_file_name", keep="first")
    .set_index("output_file_name")["class"]
    .to_dict()
)

with tarfile.open(file_name, "r:gz") as corpus_tar_file:
    member_list = corpus_tar_file.getmembers()[0:10000]
    num_members = len(member_list)

    # Iterate through all selected members
    for i, tar_member in enumerate(member_list):
        # Progress output every 100 members
        if i % 100 == 0:
            print((file_name, i, float(i)/num_members * 100., tar_member.name, len(doc_vectors)))

        # Get class from coding file; members without a coding entry are skipped
        member_base_name = os.path.basename(tar_member.name)
        if member_base_name not in class_by_file_name:
            continue
        class_name = class_by_file_name[member_base_name]

        # Read file. Passing the TarInfo itself avoids a second lookup by name;
        # extractfile may return None (e.g. for members that are not regular files).
        member_file = corpus_tar_file.extractfile(tar_member)
        if member_file is None:
            print((file_name, tar_member.name, "invalid file"))
            continue

        # Decode, tokenize and vectorize. Decoding happens inside the try so a
        # single badly encoded file is reported and skipped instead of
        # aborting the whole run.
        try:
            member_buffer = member_file.read().decode("utf-8")
            if len(member_buffer.strip()) == 0:
                continue

            doc_words = process_document(member_buffer)
            if not doc_words:
                # No usable tokens: an inferred vector would carry no
                # information about the document, so skip it.
                continue

            doc_vectors.append(d2v_model.infer_vector(doc_words))
            doc_targets.append(class_name)
        except Exception as e:
            # Best-effort: report which member failed and keep going
            print((file_name, tar_member.name, "skipped", repr(e)))

('/data/workspace/lexpredict-contraxsuite-core/test_data/agreements-text.tar.gz', 0, 0.0, 'agreements-text', 0)
('/data/workspace/lexpredict-contraxsuite-core/test_data/agreements-text.tar.gz', 100, 1.0, 'agreements-text/878720_10-K_2000-04-28_6', 99)
('/data/workspace/lexpredict-contraxsuite-core/test_data/agreements-text.tar.gz', 200, 2.0, 'agreements-text/1354942_8-K_2008-02-11_2', 199)
('/data/workspace/lexpredict-contraxsuite-core/test_data/agreements-text.tar.gz', 300, 3.0, 'agreements-text/1135264_8-K_2007-01-05_15', 299)
('/data/workspace/lexpredict-contraxsuite-core/test_data/agreements-text.tar.gz', 400, 4.0, 'agreements-text/775473_10-K_2005-03-16_2', 399)
('/data/workspace/lexpredict-contraxsuite-core/test_data/agreements-text.tar.gz', 500, 5.0, 'agreements-text/69488_10-K_2006-03-16_4', 499)
('/data/workspace/lexpredict-contraxsuite-core/test_data/agreements-text.tar.gz', 600, 6.0, 'agreements-text/1019825_8-K_2002-05-09_12', 599)
('/data/workspace/lexpredict-contraxsuite-

('/data/workspace/lexpredict-contraxsuite-core/test_data/agreements-text.tar.gz', 5700, 56.99999999999999, 'agreements-text/942787_8-K_1998-06-22_9', 5699)
('/data/workspace/lexpredict-contraxsuite-core/test_data/agreements-text.tar.gz', 5800, 57.99999999999999, 'agreements-text/350832_8-K_2013-01-15_2', 5799)
('/data/workspace/lexpredict-contraxsuite-core/test_data/agreements-text.tar.gz', 5900, 59.0, 'agreements-text/1290205_8-K_2005-05-11_5', 5899)
('/data/workspace/lexpredict-contraxsuite-core/test_data/agreements-text.tar.gz', 6000, 60.0, 'agreements-text/1113129_10-Q_2000-11-14_10', 5999)


In [None]:
# Fit a cross-validated logistic regression on the training document vectors
log_model = sklearn.linear_model.LogisticRegressionCV().fit(doc_vectors, doc_targets)

# In-sample check: predict on the same vectors the model was fitted on
log_predicted = log_model.predict(doc_vectors)
print(sklearn.metrics.classification_report(y_true=doc_targets, y_pred=log_predicted))

In [None]:
# Test random forest model
# random_state is pinned so the fitted forest, and therefore the report
# printed by this cell, is the same on every run.
# sklearn.ensemble.ExtraTreesClassifier(n_estimators=100) was tried as an
# alternative estimator here.
rf_model = sklearn.ensemble.RandomForestClassifier(n_estimators=100, random_state=0)
rf_model.fit(doc_vectors, doc_targets)

# Test in-sample: predict on the same vectors the model was fitted on
rf_predicted = rf_model.predict(doc_vectors)
print(sklearn.metrics.classification_report(doc_targets, rf_predicted))

In [None]:
# Held-out sample: infer one document vector per coded agreement found in
# archive members 15000-15999.
test_doc_vectors = []
test_doc_targets = []
# NOTE(review): min_stem_count is assigned but never applied in this cell;
# kept as-is in case other cells read it.
min_stem_count = 10

# Corpus archive holding the agreement text files
file_name = "/data/workspace/lexpredict-contraxsuite-core/test_data/agreements-text.tar.gz"

# Build the file-name -> class lookup once instead of scanning coding_df for
# every archive member. keep="first" selects the same row the previous
# per-member `.values[0]` lookup selected when a file name is coded twice.
test_class_by_file_name = (
    coding_df.drop_duplicates(subset="output_file_name", keep="first")
    .set_index("output_file_name")["class"]
    .to_dict()
)

with tarfile.open(file_name, "r:gz") as corpus_tar_file:
    member_list = corpus_tar_file.getmembers()[15000:16000]
    num_members = len(member_list)

    # Iterate through all selected members
    for i, tar_member in enumerate(member_list):
        # Progress output every 100 members
        if i % 100 == 0:
            print((file_name, i, float(i)/num_members * 100., tar_member.name, len(test_doc_vectors)))

        # Get class from coding file; members without a coding entry are skipped
        member_base_name = os.path.basename(tar_member.name)
        if member_base_name not in test_class_by_file_name:
            continue
        class_name = test_class_by_file_name[member_base_name]

        # Read file. Passing the TarInfo itself avoids a second lookup by name;
        # extractfile may return None (e.g. for members that are not regular files).
        member_file = corpus_tar_file.extractfile(tar_member)
        if member_file is None:
            print((file_name, tar_member.name, "invalid file"))
            continue

        # Decode, tokenize and vectorize. Decoding happens inside the try so a
        # single badly encoded file is reported and skipped instead of
        # aborting the whole run.
        try:
            member_buffer = member_file.read().decode("utf-8")
            if len(member_buffer.strip()) == 0:
                continue

            test_doc_words = process_document(member_buffer)
            if not test_doc_words:
                # No usable tokens: an inferred vector would carry no
                # information about the document, so skip it.
                continue

            test_doc_vectors.append(d2v_model.infer_vector(test_doc_words))
            test_doc_targets.append(class_name)
        except Exception as e:
            # Best-effort: report which member failed and keep going
            print((file_name, tar_member.name, "skipped", repr(e)))

In [114]:
# Predict held-out labels with both fitted models (as plain Python lists)
log_test_doc_predicted = log_model.predict(test_doc_vectors).tolist()
rf_test_doc_predicted = rf_model.predict(test_doc_vectors).tolist()

# Print one classification report per model, logistic first
for report_title, report_predictions in (
    ("Logistic:", log_test_doc_predicted),
    ("Random Forest:", rf_test_doc_predicted),
):
    print(report_title)
    print(sklearn.metrics.classification_report(test_doc_targets, report_predictions))
    if report_title == "Logistic:":
        # Blank line between the two reports
        print("")

Logistic:
                     precision    recall  f1-score   support

         consulting       0.68      0.52      0.59        25
             credit       0.93      0.97      0.95       218
    indemnification       0.90      0.95      0.92        19
   labor_employment       0.71      0.86      0.78       191
            license       0.90      0.82      0.86        34
real_estate_leasing       0.84      0.68      0.75        38
         securities       0.82      0.78      0.80       242
           services       0.75      0.35      0.47        26
         settlement       0.71      0.23      0.34        22
        transaction       0.73      0.76      0.74       181
            venture       0.00      0.00      0.00         3

        avg / total       0.80      0.80      0.79       999

Random Forest:
                     precision    recall  f1-score   support

         consulting       0.83      0.20      0.32        25
             credit       0.85      0.96      0.90      

  'precision', 'predicted', average, warn_for)
