In [1]:
# Imports
import glob
import os
import string
import sys
import tarfile

# Path setup
sys.path.append("/data/workspace/lexpredict-contraxsuite-core/")

# Packages
from lexnlp.nlp.en.segments.sentences import get_sentences
from lexnlp.nlp.en.tokens import get_stem_generator, get_stems

# Gensim
import gensim.models.word2vec
import gensim.models.doc2vec
import numpy

# Sklearn
import sklearn.ensemble
import sklearn.linear_model
import sklearn.metrics

In [2]:
def process_sentence(sentence):
    """Return the purely alphabetic stems of one sentence.

    The sentence is handed to lexnlp's ``get_stems`` with ``stopword=True``
    and ``lowercase=True``. Every stem for which ``str.isalpha()`` is false
    is dropped from the result.

    :param sentence: sentence text to stem
    :return: list of the remaining stems, in the order ``get_stems`` yields them
    """
    alpha_stems = []
    for stem in get_stems(sentence, stopword=True, lowercase=True):
        if stem.isalpha():
            alpha_stems.append(stem)
    return alpha_stems

def process_document(document):
    """Flatten a whole document into a single list of filtered stems.

    The text is split with lexnlp's ``get_sentences``; each sentence goes
    through ``process_sentence`` and the per-sentence results are
    concatenated in sentence order.

    :param document: full document text
    :return: flat list of stems for the entire document
    """
    return [
        stem
        for sentence in get_sentences(document)
        for stem in process_sentence(sentence)
    ]

In [3]:
# Load the pre-trained document-embedding model.
# It is loaded through Doc2Vec rather than Word2Vec because the loading cells
# call ``infer_vector`` on it, which is a Doc2Vec operation.
# Alternative model file: "../models/d2v_cbow_lease_size200_window10"
d2v_model = gensim.models.doc2vec.Doc2Vec.load("../models/d2v_all_size50_window5")

In [5]:
# One directory per agreement class. The loading cells take the final path
# component (e.g. "construction") as the class label.
path_list = [
    "/data/workspace/lexpredict-contraxsuite-samples/agreements/construction/",
    "/data/workspace/lexpredict-contraxsuite-samples/agreements/credit/",
    "/data/workspace/lexpredict-contraxsuite-samples/agreements/employment/",
    "/data/workspace/lexpredict-contraxsuite-samples/agreements/software_license/",
]

In [12]:
# Training sample: one inferred document vector and one class label per file
doc_vectors = []
doc_targets = []

# Iterate through class paths
for path in path_list:
    # The class label is the final directory component of the path
    class_name = os.path.basename(os.path.normpath(path))
    print(class_name)

    # Only the first 10 directory entries of each class are considered.
    # NOTE(review): os.listdir() returns entries in arbitrary order, so which
    # files fall inside this slice depends on the file system. The slice is
    # taken before the extension filter, so fewer than 10 documents may be
    # used if the directory holds non-text entries.
    for file_name in os.listdir(path)[0:10]:
        # Require a real ".txt" extension (a bare "txt" suffix is not enough)
        if not file_name.lower().endswith(".txt"):
            continue

        # Read and decode the file; the handle is closed before the slow
        # stemming and vector-inference steps run
        with open(os.path.join(path, file_name), "rb") as file_handle:
            file_buffer = file_handle.read().decode("utf-8")

        doc_words = process_document(file_buffer)
        doc_vectors.append(d2v_model.infer_vector(doc_words))
        doc_targets.append(class_name)

construction
credit
employment
software_license


In [13]:
# Fit a cross-validated multinomial logistic regression on the training vectors
log_model = sklearn.linear_model.LogisticRegressionCV(multi_class="multinomial").fit(
    doc_vectors, doc_targets
)

# In-sample check: predict the same documents the model was fitted on
log_predicted = log_model.predict(doc_vectors)
print(sklearn.metrics.classification_report(y_true=doc_targets, y_pred=log_predicted))

                  precision    recall  f1-score   support

    construction       1.00      1.00      1.00        10
          credit       1.00      1.00      1.00        10
      employment       1.00      1.00      1.00        10
software_license       1.00      1.00      1.00        10

     avg / total       1.00      1.00      1.00        40



In [14]:
# Test random forest model
# A fixed random_state makes the fitted forest identical on every re-run
rf_model = sklearn.ensemble.RandomForestClassifier(random_state=0)
rf_model.fit(doc_vectors, doc_targets)

# Test in-sample: predict the same documents the model was fitted on
rf_predicted = rf_model.predict(doc_vectors)
print(sklearn.metrics.classification_report(doc_targets, rf_predicted))

                  precision    recall  f1-score   support

    construction       1.00      1.00      1.00        10
          credit       1.00      1.00      1.00        10
      employment       1.00      1.00      1.00        10
software_license       1.00      1.00      1.00        10

     avg / total       1.00      1.00      1.00        40



In [15]:
# Evaluation sample: one inferred document vector and one class label per file
test_doc_vectors = []
test_doc_targets = []
test_doc_predicted = []  # kept for compatibility; never filled in this cell

# Iterate through class paths
for path in path_list:
    # The class label is the final directory component of the path
    class_name = os.path.basename(os.path.normpath(path))
    print(class_name)

    # Directory entries 50..149 of each class are considered for evaluation.
    # NOTE(review): os.listdir() returns entries in arbitrary order, so this
    # slice only stays separate from the [0:10] training slice if the listing
    # order is the same on both calls -- confirm, or share one sorted listing.
    for file_name in os.listdir(path)[50:150]:
        # Require a real ".txt" extension (a bare "txt" suffix is not enough)
        if not file_name.lower().endswith(".txt"):
            continue

        # Read and decode the file; the handle is closed before the slow
        # stemming and vector-inference steps run
        with open(os.path.join(path, file_name), "rb") as file_handle:
            file_buffer = file_handle.read().decode("utf-8")

        doc_words = process_document(file_buffer)
        doc_vector = d2v_model.infer_vector(doc_words)
        test_doc_vectors.append(doc_vector)
        test_doc_targets.append(class_name)

# Label the evaluation vectors with both fitted classifiers
log_test_doc_predicted = log_model.predict(test_doc_vectors).tolist()
rf_test_doc_predicted = rf_model.predict(test_doc_vectors).tolist()

# Print a titled classification report for each model, logistic first
for report_title, predicted_labels in (
    ("Logistic:", log_test_doc_predicted),
    ("Random Forest:", rf_test_doc_predicted),
):
    print(report_title)
    print(sklearn.metrics.classification_report(test_doc_targets, predicted_labels))

construction
credit
employment
software_license
Logistic:
                  precision    recall  f1-score   support

    construction       0.93      0.77      0.84       100
          credit       0.93      0.89      0.91       100
      employment       0.83      1.00      0.91       100
software_license       0.98      1.00      0.99        50

     avg / total       0.91      0.90      0.90       350

Random Forest:
                  precision    recall  f1-score   support

    construction       0.79      0.73      0.76       100
          credit       0.87      0.77      0.81       100
      employment       0.80      0.98      0.88       100
software_license       0.96      0.88      0.92        50

     avg / total       0.84      0.83      0.83       350

