In [None]:
!pip install python-docx
!pip install nltk
!pip install gensim

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
import numpy as np 
import pandas as pd

# Part 1: Text Extraction and Segmentation


In [None]:
import docx
from docx import Document
from docx.shared import Inches
import sys
import re

In [None]:
import nltk
nltk.download('punkt')
nltk.download('stopwords')
from nltk.tokenize import word_tokenize, sent_tokenize
import gensim

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


## Case 1: The document uses heading styles

In [None]:
# for documents that mark their section titles with "Heading" paragraph styles
def read_file_style(filename):
  """Split a .docx file into sections using heading-styled paragraphs.

  Every paragraph whose style name starts with "Heading" opens a new section
  and supplies its title. Non-blank paragraphs in the "Normal" style are
  collected as the body of the current section; paragraphs in any other style
  are ignored. Text that appears before the first heading is dropped when the
  document has at least one heading; if the document has no headings at all,
  all "Normal" text is returned as a single untitled section.

  Args:
    filename: path of the .docx file to read.

  Returns:
    A tuple (headings, cleaned_texts): the heading strings in document order,
    and one string per section with its paragraphs joined by a single space.

  Raises:
    SystemExit: after printing a message, if opening the file raises IOError.
  """
  try:
    document = docx.Document(filename)
  except IOError:
    # NOTE(review): python-docx may signal a missing/invalid file with its own
    # exception type rather than IOError -- confirm and widen if needed.
    print("Error opening or reading input file: ", filename)
    sys.exit()

  headings = []
  sections = []  # one list of paragraph strings per section
  body = []      # paragraphs collected for the section currently being read
  for paragraph in document.paragraphs:
    style_name = paragraph.style.name
    if style_name.startswith("Heading"):
      # Close the previous section; nothing is closed before the first heading.
      if headings:
        sections.append(body)
      headings.append(paragraph.text)
      body = []
    elif style_name == "Normal":
      # Skip blank and whitespace-only paragraphs.
      text = paragraph.text.strip()
      if text:
        body.append(text)

  # Flush the last section, even when it is empty, so that every heading has a
  # matching entry in the returned texts.
  if body or len(headings) > len(sections):
    sections.append(body)

  # Join with a space so the last word of one paragraph and the first word of
  # the next are not fused into a single token.
  cleaned_texts = [" ".join(section) for section in sections]

  return headings, cleaned_texts

## Case 2: Section titles start with an Arabic number or a Roman numeral

In [None]:
# for documents whose sections start with a section number
def read_file_num(filename):
  """Split a .docx file into sections using leading section numbers.

  A paragraph starts a new section when it begins with one or more markers of
  the form "<digits>. " (e.g. "1. ") or "<ROMAN NUMERAL>. " (e.g. "IV. ").
  All following paragraphs are appended to that section until the next
  numbered paragraph. Paragraphs before the first numbered one form a section
  of their own.

  Args:
    filename: path of the .docx file to read.

  Returns:
    A tuple (titles, joined_texts). joined_texts holds one string per section
    (its paragraphs joined by single spaces, section number included). titles
    holds, for each section, the first sentence that follows the section
    number, or the first sentence of the section when it has no number.

  Raises:
    SystemExit: after printing a message, if opening the file raises IOError.
  """
  # Raw string so "\d" and "\." are regex escapes rather than (invalid) string
  # escapes. "\d+" and the look-ahead require at least one digit / one Roman
  # letter, so a paragraph that merely starts with ". " is not a section start.
  section_prefix = re.compile(
      r"^(?:\d+\. |(?=[MDCLXVI])M{0,3}(?:CM|CD|D?C{0,3})(?:XC|XL|L?X{0,3})(?:IX|IV|V?I{0,3})\. )+")

  try:
    doc = docx.Document(filename)
  except IOError:
    print("Error opening or reading input file: ", filename)
    sys.exit()

  # remove empty lines
  paragraphs = [para.text for para in doc.paragraphs if para.text.strip()]

  # group the paragraphs by section number
  joined_texts = []
  current = []
  for paragraph in paragraphs:
    if current and section_prefix.match(paragraph):
      joined_texts.append(" ".join(current).strip())
      current = []
    current.append(paragraph)
  if current:
    joined_texts.append(" ".join(current).strip())

  # extract section titles
  titles = []
  for section in joined_texts:
    prefix = section_prefix.match(section)
    # Drop the section number before sentence-splitting, so the title does not
    # depend on whether the tokenizer treats "1." or "V." as its own sentence.
    body = section[prefix.end():] if prefix else section
    sentences = sent_tokenize(body)
    titles.append(sentences[0] if sentences else section)

  return titles, joined_texts

In [None]:
# Case 1: parse both files with the heading-style reader.
# read_file_style returns a (headings, section_texts) tuple for each file.
# read playbook
doc1 = read_file_style('/content/Employment-Contract-Template-Download-20201125.docx')
# read uploaded document
doc2 = read_file_style('/content/event-vendor-contract.docx')

In [None]:
# Case 2: parse both files with the section-number reader.
# read_file_num returns a (titles, section_texts) tuple for each file.
# read playbook
doc3 = read_file_num('/content/Employment-Contract-Agreement.docx')
# read uploaded document
doc4 = read_file_num('/content/At-Will-Employment-Contract.docx')

# Part 2: Text Matching

In [None]:
# Use nltk's English stopwords, plus extra contract-specific words to ignore
stopwords = nltk.corpus.stopwords.words('english')
new_stopwords = ['shall','agreement','contract','party','date','one','may','hundred','time','will','make','term','section','thousand']
# Extend with the list itself: passing the string 'new_stopwords' would add its
# individual characters ('n', 'e', 'w', ...) instead of the words above.
stopwords.extend(new_stopwords)

### Calculate the weighted average of the two scores

In [None]:
# assign a weight to the title score and to the paragraph score; the two
# weighted scores are combined afterwards by weighted_avg

def _weighted_similarity(weight, playbook_texts, document_texts, output_prefix=None):
  """Score every document text against every playbook text with TF-IDF.

  Args:
    weight: factor each similarity value is multiplied by.
    playbook_texts: list of strings used to build the dictionary, the TF-IDF
      model and the similarity index.
    document_texts: list of strings to query against that index.
    output_prefix: prefix for the index shard files written by gensim; with
      None gensim picks a temporary file name.

  Returns:
    One list per document text, holding `weight * similarity` for each
    playbook text, in playbook order.
  """
  def tokenize(text):
    # Lowercase BEFORE the stop-word test: the extra stop words defined in this
    # notebook are lowercase, so "Agreement" or "The" would otherwise slip
    # through the filter.
    lowered = (w.lower() for w in word_tokenize(text))
    return [w for w in lowered if w not in stopwords]

  # tokenize the playbook (NLTK) and create its dictionary
  gen_docs = [tokenize(text) for text in playbook_texts]
  dictionary = gensim.corpora.Dictionary(gen_docs)
  # create a bag of words and its frequency for each playbook text
  corpus = [dictionary.doc2bow(gen_doc) for gen_doc in gen_docs]
  # calculate TFIDF
  tf_idf = gensim.models.TfidfModel(corpus)
  # create the similarity measure object
  sims = gensim.similarities.Similarity(output_prefix, tf_idf[corpus], num_features=len(dictionary))

  scores = []
  for text in document_texts:
    query_doc_tf_idf = tf_idf[dictionary.doc2bow(tokenize(text))]
    # similarity of this text to every playbook text, scaled by the weight
    scores.append(list(weight * sims[query_doc_tf_idf]))
  return scores

# get the weighted title score
def similarity_title(weight, playbook, document, output_prefix=None):
  """Weighted TF-IDF similarity between section titles.

  Args:
    weight: factor applied to every similarity value.
    playbook: (titles, texts) tuple of the playbook; only titles are used.
    document: (titles, texts) tuple of the uploaded document; only titles are
      used.
    output_prefix: optional prefix for gensim's index shard files.

  Returns:
    A list with one row per document title; each row holds the weighted
    similarity to every playbook title.
  """
  return _weighted_similarity(weight, playbook[0], document[0], output_prefix)

# get the weighted paragraph score
def similarity_para(weight, playbook, document, output_prefix=None):
  """Weighted TF-IDF similarity between section texts.

  Args:
    weight: factor applied to every similarity value.
    playbook: (titles, texts) tuple of the playbook; only texts are used.
    document: (titles, texts) tuple of the uploaded document; only texts are
      used.
    output_prefix: optional prefix for gensim's index shard files.

  Returns:
    A list with one row per document section; each row holds the weighted
    similarity to every playbook section.
  """
  return _weighted_similarity(weight, playbook[1], document[1], output_prefix)

# calculate the weighted average for each pair of sections
def weighted_avg(title_score, para_score, total_weight=40):
  """Combine weighted title and paragraph scores into one score per pair.

  Args:
    title_score: list of rows of already-weighted title similarities.
    para_score: list of rows of already-weighted paragraph similarities; it
      must have at least the shape of title_score.
    total_weight: sum of the two weights that were applied to the scores.
      Defaults to 40, matching the weights 10 (titles) and 30 (paragraphs)
      used in this notebook.

  Returns:
    A list of rows shaped like title_score, where each entry is
    (title_score[i][j] + para_score[i][j]) / total_weight.

  Raises:
    ValueError: if total_weight is zero.
    IndexError: if para_score is smaller than title_score.
  """
  if total_weight == 0:
    raise ValueError("total_weight must be non-zero")
  return [
      [(title_value + para_score[i][j]) / total_weight
       for j, title_value in enumerate(title_row)]
      for i, title_row in enumerate(title_score)
  ]

In [None]:
# get the score
# weighted_avg divides the summed scores by 40, so the two weights below must
# add up to 40 for the result to be a true weighted average.
TITLE_WEIGHT = 10
PARA_WEIGHT = 30
assert TITLE_WEIGHT + PARA_WEIGHT == 40, "weights must sum to the divisor used by weighted_avg"

title_score = similarity_title(TITLE_WEIGHT, doc3, doc4)
para_score = similarity_para(PARA_WEIGHT, doc3, doc4)
final_score = weighted_avg(title_score, para_score)

  result = numpy.hstack(shard_results)


## Match the sections

In [None]:
# for every section of the uploaded document, pick the playbook section with
# the highest combined score
matched = [doc3[1][np.argmax(section_scores)] for section_scores in final_score]

# show each uploaded section next to its best playbook match
for position, section_text in enumerate(doc4[1]):
  print("Document: ", section_text)
  print("Playbook: ", matched[position])
  print()

Document:  AT-WILL EMPLOYMENT AGREEMENT This employment agreement (“Agreement”) is made and effective as of ___________________, 20___ by and between a(n)  Individual  Business Entity known as ___________________ having its principal place of business at ___________________, City of ___________________, State of ___________________ (“Employer”) and ___________________ with a mailing address of ___________________, City of ___________________, State of ___________________ (“Employee”). WHEREAS the Employer intends to hire the Employee for the position of ___________________ and the Employee desires to provide their services on the conditions set forth. IN CONSIDERATION of promises and other good and valuable consideration the parties agree to the following:
Playbook:  EMPLOYMENT CONTRACT AGREEMENT This Employment Contract (“Agreement”) is made as of the ____ day of ______________________, 20____ between ______________________ with a mailing address of ______________________, City of ___