## Paper 2 Data Workflow for Data Extraction - CUADv1 - Pre-Annotation

In [1]:
# Import the various libraries
import re, json, os, itertools
import pandas as pd
from tqdm import tqdm

### 1. File handling - CUADv1

In [2]:
# Path to each individual txt files converted from PDF
TC_PATH = "../CUAD-v1/full_contract_txt/"

# Path to folder containing all the CUAD data and files
MASTER_PATH = "../CUAD-v1/"

# Name of CSV file containing all the extracted clauses from the Atticus team
MASTER_CLAUSES = 'master_clauses.csv'

# Name of JSON file to export the agreement text and labels for data extraction
JSON_EXPORT = 'jsonl_cuadv1.json'

# Name of JSON file to export the agreement taxt and labels for further inspection
JSON_EXPORT_INSPECT = 'jsonl_cuadv1_inspect.json'

### 2. Text Data Preprocessing - CUADv1

In [3]:
# Walk through all .txt filenames and create a dataframe with the names of the files, sorted alpha/num
text_files = []
for (dirpath, dirnames, filenames) in os.walk(TC_PATH):
    text_files.extend(filenames)

tf_df = pd.DataFrame(data = text_files, columns = ['Text Files'])
tf_df.sort_values('Text Files', axis=0, inplace=True, ignore_index=True) 
tf_df.head()

Unnamed: 0,Text Files
0,2ThemartComInc_19990826_10-12G_EX-10.10_670028...
1,ABILITYINC_06_15_2020-EX-4.25-SERVICES AGREEME...
2,ACCELERATEDTECHNOLOGIESHOLDINGCORP_04_24_2003-...
3,ACCURAYINC_09_01_2010-EX-10.31-DISTRIBUTOR AGR...
4,ADAMSGOLFINC_03_21_2005-EX-10.17-ENDORSEMENT A...


In [4]:
# Read master clauses CSV into a dataframe, sort by filename to match text file dataframe created above
mc_df = pd.read_csv(MASTER_PATH+MASTER_CLAUSES)

# Cut out the relevant info
mc_df_cut = mc_df[['Filename',
                   'Document Name',
                   'Document Name-Answer',
                   'Parties',
                   'Parties-Answer',
                   'Agreement Date',
                   'Agreement Date-Answer']].copy()

# Sort the dataframe by filename
mc_df_cut.sort_values('Filename', axis=0, inplace=True, ignore_index=True) 

# Bring in the list of the .txt filenames
mc_df_cut.insert(loc=1, column='Text Files', value=tf_df)

# Create a list of the names of the files, with index num
file_list = [(index, row['Text Files']) for index, row in mc_df_cut.iterrows()]

In [5]:
#Initial dataframe info
mc_df_cut.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 510 entries, 0 to 509
Data columns (total 8 columns):
 #   Column                 Non-Null Count  Dtype 
---  ------                 --------------  ----- 
 0   Filename               510 non-null    object
 1   Text Files             510 non-null    object
 2   Document Name          510 non-null    object
 3   Document Name-Answer   510 non-null    object
 4   Parties                510 non-null    object
 5   Parties-Answer         509 non-null    object
 6   Agreement Date         510 non-null    object
 7   Agreement Date-Answer  465 non-null    object
dtypes: object(8)
memory usage: 32.0+ KB


In [6]:
# Create a function to clean up and pre-process the text.
# This process should be used for any document text inc. train, validation and test sets.
def pre_process_doc_common(text):
    # Simple replacement for "\n"
    text = text.replace("\n", " ")     
    
    # Simple replacement for "\xa0"
    text = text.replace("\xa0", " ")  
    
    # Simple replacement for "\x0c"
    text = text.replace("\x0c", " ")
    
    # Get rid of multiple dots
    regex = "\ \.\ "
    subst = "."
    text = re.sub(regex, subst, text, 0)
    
    # Get rid of underscores
    regex = "_"
    subst = " "
    text = re.sub(regex, subst, text, 0)
    
    # Get rid of multiple dashes
    regex = "--+"
    subst = " "
    text = re.sub(regex, subst, text, 0)
    
    # Get rid of multiple stars
    regex = "\*+"
    subst = "*"
    text = re.sub(regex, subst, text, 0)
    
    # Get rid of multiple whitespace
    regex = "\ +"
    subst = " "
    text = re.sub(regex, subst, text, 0)
    
    #Strip leading and trailing whitespace
    text = text.strip()
    
    return text

# Function to take in the file list, read each file, clean the text and return all agreements in a list
def text_data(file_list, print_text=False, clean_text=True, max_len=3000):
    text_list = []
    for index, filename in tqdm(file_list):
        agreement = open(TC_PATH+filename, "r")
        text = agreement.read()
        if print_text:
            print("Text before cleaning: \n", text)
        
        # Run text through cleansing function
        if clean_text:
            text = pre_process_doc_common(text)
        text = text[:max_len]
        len_text = len(text)
        
        if print_text:
            print("Text after cleaning: \n", text)
        
        text_list.append([index,
                  filename,
                  text,
                  len_text])
        
    return text_list

In [7]:
# Clean text and create dataframe with the text of ech document
data = text_data(file_list, print_text=False, clean_text=True, max_len=1000)
columns = ['ID', 'Documents', 'Text', 'Length_Of_Text']
text_df = pd.DataFrame(data=data, columns=columns)

# Add the two columns to a copy of the main dataframe
mc_df_wk = mc_df_cut.copy()
mc_df_wk = mc_df_wk.join(text_df[['Text', 'Length_Of_Text']])

#Ensure agreement date, doc_name and parties are list objects
mc_df_wk["Agreement Date"] = mc_df_wk["Agreement Date"].apply(eval)
mc_df_wk["Document Name"] = mc_df_wk["Document Name"].apply(eval)
mc_df_wk["Parties"] = mc_df_wk["Parties"].apply(eval)

# Some document name references have more than one entry - remove them for further inspection later
mc_df_wk['Doc_N_Length'] = mc_df_wk['Document Name'].str.len()
mc_df_mul = mc_df_wk[mc_df_wk.Doc_N_Length > 1]
mc_df_wk.drop(mc_df_mul.index, inplace=True)

# Have a look at the data
mc_df_wk.head(3)

100%|██████████| 510/510 [00:01<00:00, 405.51it/s]


Unnamed: 0,Filename,Text Files,Document Name,Document Name-Answer,Parties,Parties-Answer,Agreement Date,Agreement Date-Answer,Text,Length_Of_Text,Doc_N_Length
0,2ThemartComInc_19990826_10-12G_EX-10.10_670028...,2ThemartComInc_19990826_10-12G_EX-10.10_670028...,[CO-BRANDING AND ADVERTISING AGREEMENT],CO-BRANDING AND ADVERTISING AGREEMENT,"[2THEMART.COM, INC., 2TheMart, i-Escrow, I-ESC...","I-ESCROW, INC. (""i-Escrow"" ); 2THEMART.COM, I...","[June 21, 1999]",6/21/99,CO-BRANDING AND ADVERTISING AGREEMENT THIS CO-...,1000,1
1,ABILITYINC_06_15_2020-EX-4.25-SERVICES AGREEME...,ABILITYINC_06_15_2020-EX-4.25-SERVICES AGREEME...,[Services Agreement],Services Agreement,"[""Provider"", TELCOSTAR PTE, LTD., Each of the ...","[ * * * ] (""Provider""); TELCOSTAR PTE, LTD.; A...","[October 1, 2019]",10/1/19,EXHIBIT 4.25 INFORMATION IN THIS EXHIBIT IDENT...,1000,1
2,ACCELERATEDTECHNOLOGIESHOLDINGCORP_04_24_2003-...,ACCELERATEDTECHNOLOGIESHOLDINGCORP_04_24_2003-...,[JOINT VENTURE AGREEMENT],JOINT VENTURE AGREEMENT,"[Pivotal Self Service Tech, Inc., (the ""Partie...","Collectible Concepts Group, Inc. (""CCGI""); Piv...",[],,EXHIBIT 10.13 JOINT VENTURE AGREEMENT Collecti...,1000,1


In [8]:
# Check for null values
mc_df_wk.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 499 entries, 0 to 509
Data columns (total 11 columns):
 #   Column                 Non-Null Count  Dtype 
---  ------                 --------------  ----- 
 0   Filename               499 non-null    object
 1   Text Files             499 non-null    object
 2   Document Name          499 non-null    object
 3   Document Name-Answer   499 non-null    object
 4   Parties                499 non-null    object
 5   Parties-Answer         498 non-null    object
 6   Agreement Date         499 non-null    object
 7   Agreement Date-Answer  458 non-null    object
 8   Text                   499 non-null    object
 9   Length_Of_Text         499 non-null    int64 
 10  Doc_N_Length           499 non-null    int64 
dtypes: int64(2), object(9)
memory usage: 46.8+ KB


In [9]:
# Agreement date is an important label. Here we will drop any agreement without a date.
# These will typically be template or specimen agreements which havent been executed
# Prior to dropping, we create a dataframe to manually check and annotate agreement date in a different exercise
mc_df_nul = mc_df_wk[mc_df_wk["Agreement Date-Answer"].isnull()]
mc_df_wk = mc_df_wk.dropna(subset=['Agreement Date-Answer'])
mc_df_wk.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 458 entries, 0 to 509
Data columns (total 11 columns):
 #   Column                 Non-Null Count  Dtype 
---  ------                 --------------  ----- 
 0   Filename               458 non-null    object
 1   Text Files             458 non-null    object
 2   Document Name          458 non-null    object
 3   Document Name-Answer   458 non-null    object
 4   Parties                458 non-null    object
 5   Parties-Answer         458 non-null    object
 6   Agreement Date         458 non-null    object
 7   Agreement Date-Answer  458 non-null    object
 8   Text                   458 non-null    object
 9   Length_Of_Text         458 non-null    int64 
 10  Doc_N_Length           458 non-null    int64 
dtypes: int64(2), object(9)
memory usage: 42.9+ KB


In [10]:
# The CUADv1 labels includes the Party definition eg Apple Inc. "Apple", here we keep just the legal entity:
def remove_party_overlaps(labels):
    labels.sort()
    k = []
    for i in range(len(labels)-1):
        l1 = labels[i]
        l2 = labels[i+1]
        if l1[0] == l2[0]:
            len1 = l1[1] - l1[0]
            len2 = l2[1] - l2[0]
            if len1 > len2:
                k.append(l1)
                continue
            else:
                k.append(l2)
                continue
        else:
            k.append(labels[i])
    new_labels = list(k for k,_ in itertools.groupby(k))
    
    return new_labels

In [11]:
# Go through each label and find the label in the text, ensure label is pre-processed same as text.
# If labels don't match, append to a seperate file to check.

clean_text = True
djson = list()
djson_inspect = list()
for index, row in tqdm(mc_df_wk.iterrows()):
    labels = list()
    ids = index
    text = row['Text']
    
    #DOC_NAME
    doc_names = row['Document Name']
    for name in doc_names:
        if clean_text:
            name = pre_process_doc_common(name)
        matches = re.finditer(re.escape(name.lower()), text.lower())
        for m in matches:
            s = m.start()
            e = m.end()
            labels.append([s, e, 'DOC_NAME'])
    
    #AGMT_DATE
    agmt_date = row['Agreement Date']
    for date in agmt_date:
        if clean_text:
            date = pre_process_doc_common(date)
        matches = re.finditer(re.escape(date.lower()), text.lower())
        for m in matches:
            s = m.start()
            e = m.end()
            labels.append([s, e, 'AGMT_DATE'])

    #PARTIES
    parties = row['Parties']
    for party in parties:
        if clean_text:
            party = pre_process_doc_common(party)
        matches = re.finditer(re.escape(party.lower()), text.lower())
        for m in matches:
            s = m.start()
            e = m.end()
            labels.append([s, e, 'PARTY'])
    
    labels = remove_party_overlaps(labels)
    #print(labels)
    
    # Check for incongruous finds, add to inspect file
    flat_list = [item for sublist in labels for item in sublist]
    if 'DOC_NAME' in flat_list and 'AGMT_DATE' in flat_list and 'PARTY' in flat_list:
        djson.append({'id': ids, 'text': text, "labels": labels})
    else:
        djson_inspect.append({'id': ids, 'text': text, "labels": labels})

# Add to the check JSON file the other documents excluded due to duplicate names and no agreement dates
for index, row in tqdm(mc_df_mul.iterrows()):
    labels = list()
    ids = index
    text = row['Text']
    djson_inspect.append({'id': ids, 'text': text, "labels": labels})

for index, row in tqdm(mc_df_nul.iterrows()):
    labels = list()
    ids = index
    text = row['Text']
    djson_inspect.append({'id': ids, 'text': text, "labels": labels})


458it [00:00, 1857.30it/s]
11it [00:00, 9427.33it/s]
41it [00:00, 13409.74it/s]


In [12]:
# The process above requires the three label types to be present in each agreement extract. This may not
# be the case due to the shortening of the agreememt for example. Let's check how many we are left with
# and how many we need to manually check...
print(f"We are left with {len(djson)} training samples out of 510 to annotate.")
print("Additional agreements to check: ",len(djson_inspect))

We are left with 349 training samples out of 510 to annotate.
Additional agreements to check:  161


In [13]:
# Check for erroneous labels
count = 0
for n in range(len(djson)):
    labs = djson[n]['labels']
    flat_list = [item for sublist in labs for item in sublist]
    if -1 in flat_list:
        count += 1
print(count)

0


In [14]:
# Export the full datasets for import to Doccano
filepath = JSON_EXPORT
open(filepath, 'w').write("\n".join([json.dumps(e) for e in djson]))

filepath = JSON_EXPORT_INSPECT
open(filepath, 'w').write("\n".join([json.dumps(e) for e in djson_inspect]))

187442

#### Using Doccano to tag the text file dataset:
 - Install doccano at the command line: pip install doccano
 - At the command line change the directory to this directory
 - run doccano at the command line by typing 'doccano'
 - Application will be running at http://0.0.0.0:8000/
 - Username is 'admin', passowrd is 'password'
 - Use ctrl-c to end application