## Paper 2 Data Workflow for Data Extraction - CUADv1 - Prepare Dataset

In [None]:
import re, json, os, itertools
import numpy as np
import pandas as pd
from tqdm import tqdm

import matplotlib.pyplot as plt
import seaborn as sns

import spacy
from spacy.lang.en import English
from spacy.training import offsets_to_biluo_tags # requires spaCy 3.0

In [None]:
# Download the medium English spaCy pipeline if required (en_core_web_md is not a transformer model)
# !python -m spacy download en_core_web_md

In [None]:
!python -m spacy validate # Ensure minimum v3.0.0

### 1. File handling - CUADv1

In [None]:
# File locations used by this notebook.
# MASTER_PATH is not referenced by any cell in this notebook; presumably the CUAD v1 root.
MASTER_PATH = "../CUAD-v1/"
# Annotation exports, read below as JSON Lines.
JSONL_FILE = "project_6_dataset.jsonl"
JSONL_FILE_INS = "project_7_dataset.jsonl"
# Output file for the list of IOB feature class labels.
FEATURE_CLASS_LABELS = "feature_class_labels.json"
# Output file for the exported ids, NER tag ids and tokens.
DATA_FILE = "cuad-v1-annotated.json"

### 2. Text Data Preprocessing - CUADv1 - Continued

#### Using Doccano to tag the text file dataset:
 - Install doccano at the command line: pip install doccano
 - At the command line change the directory to this directory
 - run doccano at the command line by typing 'doccano'
 - Application will be running at http://0.0.0.0:8000/
 - Username is 'admin', password is 'password'
 - Use ctrl-c to end application

#### Prepare updated dataset for fine-tuning Transformers with HuggingFace

In [None]:
# JSONL is a multi-line JSON file (one document per line) and requires lines=True.
# Set INCLUDE_MANUAL_ANNOTATIONS to True to append the additional manually
# checked annotations (JSONL_FILE_INS) below the main annotation export.
# The default (False) uses the main export only.
INCLUDE_MANUAL_ANNOTATIONS = False

df1 = pd.read_json(JSONL_FILE, lines=True)
if INCLUDE_MANUAL_ANNOTATIONS:
    df2 = pd.read_json(JSONL_FILE_INS, lines=True)
    # ignore_index gives the stacked frame a unique 0..n-1 index
    df = pd.concat([df1, df2], axis=0, ignore_index=True)
else:
    df = df1

# Drop the columns that are not used anywhere else in this notebook
df = df.drop(['meta', 'annotation_approver', 'comments'], axis=1)
df.head()

In [None]:
# Check the information (columns, dtypes, non-null counts) and number of samples
df.info()

In [None]:
# Unsuitable samples were left without annotations.
# Keep only the rows whose 'labels' list holds at least one annotation;
# .copy() makes df_cut an independent frame so columns can be added to it later.
df_cut = df.loc[df['labels'].map(len) > 0].copy()
df_cut.info()

In [None]:
# We tokenize each agreement prior to bringing into the transformer model
# Create tokens using spaCy
# English() is the language class imported from spacy.lang.en; no downloaded model is loaded here
nlp = English()
# Each 'tokens' cell holds the object returned by nlp(text), not a plain list of strings;
# later cells iterate it token by token and call char_span / ents on it
df_cut['tokens'] = df_cut['text'].apply(lambda x: nlp(x))
df_cut.head()

In [None]:
# Inspect one sample: list every (start, end, label) annotation,
# then print the tokenized text that the character offsets refer to.
row = df_cut.iloc[4]
doc = row['tokens']
for span_start, span_end, span_label in row['labels']:
    print(span_start, span_end, span_label)
print("\n")
print(doc)

In [None]:
# Count and visualise the amount of labels

# Tally each entity label across all annotated samples.
# The loop iterates the 'labels' column directly (rather than df_cut.iterrows()
# with a `row` variable) so it does not overwrite the example `row` selected
# in an earlier cell.
label_totals = {"DOC_NAME": 0, "AGMT_DATE": 0, "PARTY": 0}
for sample_labels in df_cut['labels']:
    for annotation in sample_labels:
        # Each annotation is [start, end, label]; index 2 is the label name
        if annotation[2] in label_totals:
            label_totals[annotation[2]] += 1

DOC_NAME_COUNT = label_totals["DOC_NAME"]
DATE_COUNT = label_totals["AGMT_DATE"]
PARTIES_COUNT = label_totals["PARTY"]

# Create DataFrame for the bar plot (one column per label, a single row of counts)
data = pd.DataFrame.from_dict({"Document Name": [DOC_NAME_COUNT],
                               "Date of Agreement": [DATE_COUNT],
                               "Parties": [PARTIES_COUNT]})

# Use Seaborn for the bar plot
splot = sns.barplot(palette="pastel", data=data)
splot.set(title='Number of labels in dataset', ylabel='Count')

# Annotate the bars with the count of labels, placed just below the top of each bar
for p in splot.patches:
    splot.annotate(format(p.get_height(), '.0f'),
                   (p.get_x() + p.get_width() / 2., p.get_height()),
                   ha='center', va='center',
                   size=10,
                   xytext=(0, -12),
                   textcoords='offset points')

# Show plot (plt.show must be called; a bare `plt.show` is a no-op expression)
plt.show()

print("The total number of labels in the dataset is:", DOC_NAME_COUNT+DATE_COUNT+PARTIES_COUNT)

In [None]:
# Check how the entity labels match up with the tokens for one example sample.
# The sample is selected explicitly here so that the labels and the text always
# come from the same row, regardless of what earlier cells left in `row`.
row = df_cut.iloc[4]
doc = row['tokens']

# Doccano character indices can be misaligned with spaCy token boundaries,
# so try the exact offsets first and then one-character adjustments.
# Each pair is (shift applied to start, shift applied to end); the order matches
# the alignment used when tagging the full dataset.
offset_adjustments = ((0, 0), (0, 1), (1, 0), (0, -1), (-1, 0))

ents = []
for start, end, label in row['labels']:
    for start_shift, end_shift in offset_adjustments:
        # char_span gives None when the offsets do not line up with token boundaries
        ent = doc.char_span(start + start_shift, end + end_shift, label)
        if ent is not None:
            ents.append(ent)
            break
doc.ents = ents
doc.ents

In [None]:
# Each word must be separated for the transformer using the IOB format
# Create tags using token.ent_iob_ and add to the DataFrame
# Allow for any character misalignment between spaCy tokenization and Doccano
# character indices: try the exact offsets first, then one-character adjustments.
# Each pair is (shift applied to start, shift applied to end), tried in order.
offset_adjustments = ((0, 0), (0, 1), (1, 0), (0, -1), (-1, 0))

tags_list_iob = []
unaligned_labels = 0  # annotations that matched no token boundary under any adjustment
for doc, labels in zip(df_cut['tokens'], df_cut['labels']):
    ents = []
    for start, end, label in labels:
        for start_shift, end_shift in offset_adjustments:
            # char_span gives None when the offsets do not line up with token boundaries
            ent = doc.char_span(start + start_shift, end + end_shift, label)
            if ent is not None:
                ents.append(ent)
                break
        else:
            # No adjustment produced a valid span: this annotation gets no entity
            unaligned_labels += 1
    doc.ents = ents
    # Entity tokens become "<IOB>-<TYPE>" (e.g. "B-PARTY"); everything else is "O"
    iob_tags = [f"{t.ent_iob_}-{t.ent_type_}" if t.ent_iob_ != "O" else "O" for t in doc]
    tags_list_iob.append(iob_tags)
df_cut['tags'] = tags_list_iob

# Report dropped annotations instead of discarding them silently
print("Labels that could not be aligned to tokens:", unaligned_labels)

In [None]:
# Check to ensure we have all the data (every column, including the new 'tags', should be all non-null)
df_cut.info()

In [None]:
# Flatten the per-sample tag lists into a single list holding every IOB tag in the dataset
all_tags = [tag for sample_tags in tags_list_iob for tag in sample_tags]

def unique(list1):
    """Return the distinct items of ``list1`` as a new list in ascending order."""
    # A set removes the duplicates; sorted() turns it back into an ordered list
    return sorted(set(list1))

# Sorted, de-duplicated IOB labels; a tag's position in this list is used as its integer id below
feature_class_labels = unique(all_tags)
print(feature_class_labels)

In [None]:
# Encode every token's IOB tag as its position in feature_class_labels
df_cut['ner_tags'] = df_cut['tags'].apply(
    lambda sample_tags: list(map(feature_class_labels.index, sample_tags))
)


In [None]:
# Turn each tokenized document into a plain list of token strings, ready for export
df_cut['split_tokens'] = df_cut['tokens'].map(lambda doc: [token.text for token in doc])

# Check dataframe head
df_cut.head()

In [None]:
# Write only the columns needed downstream (sample id, tag ids, token strings)
# to DATA_FILE as table-oriented JSON without the DataFrame index.
export_columns = ['id', 'ner_tags', 'split_tokens']
export_df = df_cut.loc[:, export_columns]
export_df.to_json(DATA_FILE, orient="table", index=False)

In [None]:
# Save the feature class labels as indented JSON for use in Transformer fine tuning
with open(FEATURE_CLASS_LABELS, 'w') as f:
    f.write(json.dumps(feature_class_labels, indent=2))

#### The dataset is now ready for any transformer model!