# Metrics Example on 91 NJ K1 Tax Forms

### In this notebook we compare the performance of CKY and a naive language model with no structured outputs (LayoutLM).

### We compute the metrics using our own code, currently titled DI Metrics.

In [1]:
# General deps
import argparse, os
from glob import glob
from collections import OrderedDict
from tqdm import tqdm
import time
import ray
import numpy as np
import pandas as pd
import re

In [2]:
# Metric-implementations

from seqeval.metrics.sequence_labeling import get_entities #for resolving outputs from LayoutLM with no structured outputs
from metrics.metric_utils import (procCFG_XML,
                                  CFG_getdict_lists,
                                  json2dict, 
                                  xml2dict, 
                                  make_fields_list,
                                  compute_text_metrics,
                                  
                                 )

In [3]:
# Start a local Ray instance sized to the CPUs available to this process.
# os.sched_getaffinity is not provided on every platform, so fall back to
# os.cpu_count() (or a single CPU) where it is missing.
if hasattr(os, "sched_getaffinity"):
    num_cpus = len(os.sched_getaffinity(0))
else:
    num_cpus = os.cpu_count() or 1
# ignore_reinit_error=True is kept so that re-running this cell does not fail
# when a Ray instance is already up.
ray.init(num_cpus=num_cpus, ignore_reinit_error=True)


2021-05-06 19:43:29,040	INFO services.py:1172 -- View the Ray dashboard at [1m[32mhttp://127.0.0.1:8265[39m[22m


{'node_ip_address': '10.143.86.53',
 'raylet_ip_address': '10.143.86.53',
 'redis_address': '10.143.86.53:6379',
 'object_store_address': '/tmp/ray/session_2021-05-06_19-43-28_544399_33744/sockets/plasma_store',
 'raylet_socket_name': '/tmp/ray/session_2021-05-06_19-43-28_544399_33744/sockets/raylet',
 'webui_url': '127.0.0.1:8265',
 'session_dir': '/tmp/ray/session_2021-05-06_19-43-28_544399_33744',
 'metrics_export_port': 65009,
 'node_id': 'ba899cf3fa1b2b9c12ea701afe888a0c63e052ac88c42d66db5cc4d2'}

## CKY Julia 

In [4]:
# Notebook configuration expressed as argparse options.
# NOTE(review): both defaults are absolute paths on one user's machine;
# point them at your own copy of the dataset when running elsewhere.
parser = argparse.ArgumentParser()

# Directory holding the ground-truth JSON files.
parser.add_argument(
    "--gt_json_dir",
    type=str,
    default="/data/home/djonathan/ai-science-di-metrics/datasets/NJK1_91holdout_julia_2dCKY/data/test_nj/output/json/",
    help="Input directory containing list of json files",
)

# Directory holding the predicted XML files (help text previously said "json").
parser.add_argument(
    "--preds_xml_dir",
    type=str,
    default="/data/home/djonathan/ai-science-di-metrics/datasets/NJK1_91holdout_julia_2dCKY/data/test_nj/output/xml/",
    help="Input directory containing list of xml files",
)


# Parse an empty argument list so the defaults are always used inside the
# notebook, regardless of the kernel's own command line.
args = parser.parse_args([])
args

Namespace(gt_json_dir='/data/home/djonathan/ai-science-di-metrics/datasets/NJK1_91holdout_julia_2dCKY/data/test_nj/output/json/', preds_xml_dir='/data/home/djonathan/ai-science-di-metrics/datasets/NJK1_91holdout_julia_2dCKY/data/test_nj/output/xml/')

In [5]:
# Collect the ground-truth JSON files and the predicted XML files.
# os.path.join builds the pattern correctly whether or not the configured
# directory ends with a path separator.
# Both lists are sorted; presumably so the i-th ground truth lines up with the
# i-th prediction — confirm against the file naming scheme.
gt_json_dir_files = sorted(glob(os.path.join(args.gt_json_dir, "*.json")))
preds_xml_dir_files = sorted(glob(os.path.join(args.preds_xml_dir, "*.xml")))
len(gt_json_dir_files), len(preds_xml_dir_files)

(91, 91)

In [9]:
# Ground truth: one Ray task per JSON file, gathered into a list of dicts
# (per the original note: Lark Earley parser + Ray).
gt_tasks = [json2dict.remote(path, verbose=False) for path in gt_json_dir_files]
gt_dicts_list = ray.get(gt_tasks)

# Predictions: one Ray task per XML file, gathered the same way.
pred_tasks = [xml2dict.remote(path, verbose=False) for path in preds_xml_dir_files]
preds_dicts = ray.get(pred_tasks)

# The field list is derived from the ground-truth dicts only.
field_names = make_fields_list(gt_dicts_list)

# Every ground-truth file must have a matching prediction.
assert len(gt_dicts_list) == len(preds_dicts)
field_names

['NjTyEndDate',
 'NjPartnerEin',
 'NjPartnerName',
 'NjPartnerAddress',
 'NjPartnerCity',
 'NjPartnerState',
 'NjPartnerZip',
 'NjPartnerEntityType',
 'NjPartnerInterestPship',
 'NjPartnershipEin',
 'NjPartnershipName',
 'NjPartnershipAddress',
 'NjPartnershipCity',
 'NjPartnershipState',
 'NjPartnershipZip',
 'NjProfitSharingI',
 'NjProfitSharingIi',
 'NjLossSharingI',
 'NjLossSharingIi',
 'NjCapitalOwnershipI',
 'NjCapitalOwnershipIi',
 'NjPartIiPshipIncomeA',
 'NjPartIiPshipIncomeB',
 'NjPartIiDistributiveShareA',
 'NjPartIiDistributiveShareB',
 'NjPartIiPartnerContributionA',
 'NjPartIiPartnerContributionB',
 'NjPartIiiNonresidentNjTax',
 'NjTyStartDate',
 'NjPartIiNetGuaranteedPmtsA',
 'NjPartIiNetGuaranteedPmtsB']

In [7]:
# Preview only the first two ground-truth dicts instead of dumping the whole list.
gt_dicts_list[:2]

[{'NjTyEndDate': ',',
  'NjPartnerEin': '13-3806691',
  'NjPartnerName': 'BlackRock Financial Management, Inc.',
  'NjPartnerAddress': '601 Union Street, 56th Floor',
  'NjPartnerCity': 'Seattle',
  'NjPartnerState': 'WA',
  'NjPartnerZip': '98101',
  'NjPartnerEntityType': 'FCG',
  'NjPartnerInterestPship': '11 2017',
  'NjPartnershipEin': '47-2367935',
  'NjPartnershipName': '1893 Fund LLC',
  'NjPartnershipAddress': '40 East 52nd Street',
  'NjPartnershipCity': 'New York',
  'NjPartnershipState': 'NY',
  'NjPartnershipZip': '10022',
  'NjProfitSharingI': '0.018108%',
  'NjProfitSharingIi': '0.021000 %',
  'NjLossSharingI': '0.018108%',
  'NjLossSharingIi': '0.021000 %',
  'NjCapitalOwnershipI': '0.020034%',
  'NjCapitalOwnershipIi': '0.570258%',
  'NjPartIiPshipIncomeA': '179,303',
  'NjPartIiPshipIncomeB': '(2)',
  'NjPartIiDistributiveShareA': '179,303',
  'NjPartIiDistributiveShareB': '(2)'},
 {'NjTyEndDate': ',',
  'NjPartnerEin': '13-3806691',
  'NjPartnerName': 'BLACKROCK FINA

In [8]:
# NOTE(review): the recorded run of this cell stopped with
# KeyError: 'NjPartIiPartnerContributionA'. The first ground-truth dict shown
# in the preceding output has no such key, so presumably not every file carries
# every name in field_names — confirm how the metric helpers treat missing fields.
# NOTE(review): the next code cell runs the same two calls and keeps the result
# as cfg_master_dict.
a = CFG_getdict_lists(gt_dicts_list,preds_dicts,field_names)
compute_text_metrics(a,field_names)

100%|██████████| 91/91 [00:00<00:00, 24030.83it/s]
  0%|          | 0/91 [00:00<?, ?it/s]


KeyError: 'NjPartIiPartnerContributionA'

#### Get dict lists and compute metrics

In [None]:
# Step 1: combine ground truth and predictions for the listed fields.
cfg_dict_lists = CFG_getdict_lists(gt_dicts_list, preds_dicts, field_names)
# Step 2: compute the text metrics from that combined structure.
cfg_master_dict = compute_text_metrics(cfg_dict_lists, field_names)
cfg_master_dict

#### Output CFG to DF

In [None]:
# Empty evaluation table: intended to hold one row per file ID and field class.
column_names = [
    'FileID',
    'Filename',
    'FieldName',
    'String_Actual',
    'String_Predict',
    'Box_Actual',
    'Box_Predict',
]
cky_eval_df = pd.DataFrame(columns=column_names)
cky_eval_df.head()

In [None]:
# NOTE(review): gt_list and pred_list are not defined in any earlier cell shown
# here; the columns used below imply records with 'text', 'bbox', 'page',
# 'label' and 'file' entries — confirm where these lists come from.

# Ground-truth side: keep the text under its own column name, drop geometry.
gt_df = (
    pd.DataFrame(gt_list)
    .rename(columns={'text': 'gt_text'})
    .drop(columns=['bbox', 'page'])
)

# Prediction side: same treatment.
pred_df = (
    pd.DataFrame(pred_list)
    .rename(columns={'text': 'pred_text'})
    .drop(columns=['bbox', 'page'])
)

# Pair each ground truth with its prediction on (label, file); the two key
# columns stay in text_df.
text_df = gt_df.merge(pred_df, on=['label', 'file'])

In [None]:
# Share of rows whose predicted text equals the ground truth exactly.
# The mean of the boolean comparison is the same ratio as sum/len, computed
# without a Python-level loop; an empty text_df yields nan instead of raising
# ZeroDivisionError.
print("Direct Match Accuracy:", (text_df['gt_text'] == text_df['pred_text']).mean())

In [None]:
# 1.0 where prediction and ground truth are identical strings, 0.0 otherwise.
exact_match = text_df['gt_text'].eq(text_df['pred_text'])
text_df['direct'] = exact_match.astype(float)
text_df.head()

In [10]:
# Matches runs of non-word characters and underscores, i.e. everything except
# letters and digits. Written as a raw string: in a normal literal '\W' is an
# invalid escape sequence and triggers a warning on current Python versions.
pattern = re.compile(r'[\W_]+')
# Normalised prediction: strip those characters, then lower-case.
text_df["pred_text_lower"] = text_df["pred_text"].apply(lambda x: pattern.sub('',x).lower())
text_df.head()

NameError: name 're' is not defined

In [None]:
# Apply the same normalisation (pattern stripped, lower-cased) to the ground truth.
normalised_gt = text_df["gt_text"].apply(lambda text: pattern.sub('', text).lower())
text_df["gt_text_lower"] = normalised_gt

# 1.0 where the normalised strings agree, 0.0 otherwise.
normalised_match = text_df['gt_text_lower'].eq(text_df['pred_text_lower'])
text_df['direct_lower'] = normalised_match.astype(float)

In [None]:
# Row-wise distance between the raw ground-truth and predicted text.
# NOTE(review): levenshteinDistance is not defined or imported in the cells
# shown here — confirm where it is supposed to come from.
text_df['levenshtein'] = text_df.apply(
    lambda row: levenshteinDistance(row['gt_text'], row['pred_text']),
    axis=1,
)

In [None]:
# Row-wise distance between the normalised ground-truth and predicted text.
# NOTE(review): levenshteinDistance is not defined or imported in the cells
# shown here — confirm where it is supposed to come from.
text_df['levenshtein_lower'] = text_df.apply(
    lambda row: levenshteinDistance(row['gt_text_lower'], row['pred_text_lower']),
    axis=1,
)

In [None]:
# Summary statistics of text_df; for the 0/1 match columns ('direct',
# 'direct_lower') the reported mean is the corresponding accuracy.
print("Field accuracies:")
text_df.describe()

# LayoutLM

LayoutLM produces its outputs with an unconstrained softmax/logistic regression, assigning a class label to each bounding box it runs inference on. Because the model is not constrained, it can produce multiple outputs of the same class for one sequence/page.

We develop a naive implementation to test the LayoutLM token classification model (NER) on the slot-filling task: we simply take the first instance we find of each predicted class as we iterate through the output list of predictions.

In [None]:
# Trivial example: keep only the first span seen for each tag.
from seqeval.metrics.sequence_labeling import get_entities

class_list = ['O', 'S-Address']
seq1 = ['S-Address', 'S-PartnerEIN', 'S-PartnerName', 'S-Address']
seq_spans = get_entities(seq1)

seq_dict = {}
# NOTE(review): tags and spans are paired positionally; this assumes
# get_entities returns exactly one span per tag, which looks true only for
# all-single-token sequences like this one — confirm before reusing.
for tag, span in zip(seq1, seq_spans):
    if tag in seq_dict:
        # A span for this tag was already recorded; later repeats are ignored.
        continue
    seq_dict[tag] = span[1:3]

print(seq1, len(seq1))
print(seq_spans, len(seq_spans))
print(seq_dict, len(seq_dict))

In [None]:
# LayoutLM predictions currently arrive as a csv/excel export.
# NOTE(review): absolute path on one user's machine — adjust when running elsewhere.
layoutlm_pred_file = "/data/home/djonathan/ai-science-di-metrics/datasets/NJK1_91holdout_Pytorch_LayoutlmLinearClassifier/NJ_K1_Client_Data_UoA_fixed_with_BIOES_orig_test_csv_with_predictions_LD.xlsm"

# Read the 'raw' sheet, using its first column as the index.
layoutlm_pred_df = pd.read_excel(
    layoutlm_pred_file,
    sheet_name='raw',
    index_col=0,
)
layoutlm_pred_df.head()

In [None]:
# Distinct field names in order of first appearance. Series.unique() gives a
# reproducible order, whereas list(set(...)) can order string values
# differently from one kernel session to the next.
# NOTE(review): this rebinds field_names, which the CKY section also uses.
field_names = layoutlm_pred_df['Profile_Map_Name'].unique().tolist()
field_names

#### Make dict of retrieved predicted vs actual values for LayoutLM.

In [None]:
# For every sentence_id, collect [entity_type, word] pairs for the ground
# truth and for the predictions: output_dict[sentence_id] = (gt_list, pred_list).
output_dict = {}
sentence_groups = layoutlm_pred_df.groupby('sentence_id').groups
sentence_id_list = sentence_groups.keys()

for sent_id, row_labels in sentence_groups.items():
    df_ = layoutlm_pred_df.loc[row_labels]
    words = df_['words']

    # Missing tags are replaced by 'O' before get_entities turns each tag
    # sequence into entity spans.
    gt_index_list = get_entities(list(df_['labels'].replace(np.nan, 'O', regex=True)))
    pred_index_list = get_entities(list(df_['Predicted_Map_Name'].replace(np.nan, 'O', regex=True)))

    # Build the two lists independently. Zipping the span lists together stops
    # at the shorter one and silently drops entities from the longer one.
    # Only the word at index span[1] is kept for each span (assumed to be the
    # span's start position — TODO confirm against seqeval's get_entities).
    gt_list = [[span[0], words.iloc[span[1]]] for span in gt_index_list]
    pred_list = [[span[0], words.iloc[span[1]]] for span in pred_index_list]

    output_dict[sent_id] = gt_list, pred_list

# Ground-truth pairs for one example sentence.
output_dict['5e9e96bd05a1d2043eeccd31_12'][0]

In [None]:

# Second element stored for this sentence id (the prediction-side list).
output_dict['5e9e96bd05a1d2043eeccd31_12'][1]

### Output LayoutLM to DF

In [None]:

column_names = [
    'FileID',
    'Filename',
    'FieldName',
    'String_Actual',
    'String_Predict',
    'Box_Actual',
    'Box_Predict',
]
layoutlm_eval_df = pd.DataFrame()

# One [sentence_id, field_name] pair per combination, sentence-major order.
index_list = [
    [sent_id, field]
    for sent_id in sentence_id_list
    for field in field_names
]
assert len(index_list) == len(field_names) *len(sentence_id_list)

In [None]:
# One row per (file, field) pair taken from index_list.
layoutlm_eval_df['FileID'] = [pair[0] for pair in index_list]
layoutlm_eval_df['FieldName'] = [pair[1] for pair in index_list]

# Text columns start out as empty strings.
# NOTE(review): these names differ from 'String_Predict' / 'String_Actual' in
# column_names — confirm which naming is intended.
for empty_col in ('Predicted', 'Actual'):
    layoutlm_eval_df[empty_col] = ''

In [None]:
# Preview the first rows of the evaluation table.
layoutlm_eval_df.head()

In [None]:
# NOTE(review): pred_list_df is initialised but never filled, and the loop only
# leaves the last file's lists in file_gts / pred_gts — this cell looks unfinished.
pred_list_df = []

# Iterate over the FileID values directly; each value is already the key into
# output_dict, so no manual counter or per-row .iloc lookup is needed.
for file_id in layoutlm_eval_df['FileID']:
    file_gts = output_dict[file_id][0]
    pred_gts = output_dict[file_id][1]

pred_gts

In [None]:
# NOTE(review): file_dict is not defined in any cell shown here — confirm its
# source, otherwise this cell raises NameError on a fresh kernel.
file_dict