In [2]:
import numpy as np
from IPython.display import display
import os
import pandas as pd
import re
import json
import sys
sys.path.append("..")
import di_metrics as dim
from di_metrics._hed import hed
from di_metrics._geometric import iou
from di_metrics._textual import levenshtein_distance, lc_subsequence, str_exact_match
from di_metrics._utils import get_dicts, get_item_df, process_cord_file,process_pred_file


ModuleNotFoundError: No module named 'di_metrics._utils'

In [3]:
import di_metrics
dir(di_metrics)


['__all__',
 '__builtins__',
 '__cached__',
 '__doc__',
 '__file__',
 '__loader__',
 '__name__',
 '__package__',
 '__path__',
 '__spec__',
 '_geometric',
 '_hed',
 '_textual',
 'cumulative_lcs',
 'docbank_overlap',
 'hed',
 'iou',
 'lc_subsequence',
 'levenshtein_distance',
 'line_item_edit_distance',
 'str_exact_match']

# Input

In [2]:
# Run switches read by the configuration cells and by the driver cell below:
#   FR   - True selects the `get_merged_df_FR` pipeline, False the layoutlm one.
#   cord - True selects the CORD ground-truth loader, False the cdip one.
FR, cord = True, True

In [3]:
# Directory holding the ground-truth JSON files.
# NOTE(review): absolute, user-specific path - consider a configurable data root.
GT_DIR = '/Users/ng492mh/Documents/DI_Metrics_proj/CORD_data/cord_data_jsons/test/'
# layoutlm
if not FR:
    if cord:
        # Labels treated as line-item fields / as other HED fields for CORD.
        # NOTE(review): this branch does not define `ListPrefix`, which
        # `assign_group` reads - confirm it is set elsewhere before running.
        CORD_lineitems = [ "menu.cnt", "menu.nm", "menu.price", "menu.unitprice"]
        CORD_other_in_hed = ['subtotal.subtotal_price', 'sub_total.tax_price','total.total_price']
        PRED_DIR = '/Users/ng492mh/Documents/DI_Metrics_proj/cord_layoutlm_pred_psl_epoch7/'
    else:
        # Substring that marks a label as belonging to a line item.
        ListPrefix = 'item'
        PRED_DIR = '/Users/ng492mh/Documents/DI_Metrics_proj/cdip_layoutlm_pred_psl/'

In [4]:
# FR
if FR:
    # Substring that marks a (mapped) label as belonging to a line item.
    ListPrefix = 'Item'
    if cord:
        # Passed to `process_pred_file` as its `multiplier` argument.
        multiplier = 1
        # Maps ground-truth labels and raw prediction item labels to the
        # common label names used for evaluation.
        field_mapping = {
            'subtotal.subtotal_price' : 'Subtotal',
            'sub_total.tax_price': 'Tax',
            'total.total_price': 'Total',
            'menu.nm': 'ItemName',
            'menu.unitprice': 'ItemPrice',
            'menu.cnt': 'ItemQuantity',
            'menu.price':'ItemTotalPrice',
            'Name': 'ItemName',
            'Price': 'ItemPrice',
            'Quantity': 'ItemQuantity',
            'TotalPrice':'ItemTotalPrice'}
        PRED_DIR = '/Users/ng492mh/Downloads/fr_CORD_test_preds/'
    else:
        # Passed to `process_pred_file` as its `multiplier` argument.
        multiplier = 300
        # Maps ground-truth labels and raw prediction item labels to the
        # common label names used for evaluation.
        # NOTE(review): 'main|||Item_unit_count||line_items' has a capital 'I'
        # unlike its siblings - confirm it matches the annotation files.
        field_mapping = {
            'main|||vendor_name||primary_fields': 'VendorName',
            'main|||buyer_name||primary_fields': 'CustomerName',
            'main|||invoice_number||primary_fields': 'InvoiceId',
            'main|||total_amount||primary_fields':'InvoiceTotal',
            'main|||issued_date||primary_fields':'InvoiceDate',
            'main|||item_description||line_items': 'ItemDescription',
            'main|||item_total||line_items':'ItemTotal',
            'main|||Item_unit_count||line_items': 'ItemCount',
        #     'vendor_address_combined':'VendorAddress',
        #     'buyer_address_combined': 'CustomerAddress',
            'Description': 'ItemDescription',
            'Amount': 'ItemTotal',
            'Quantity': 'ItemCount',
        }
        PRED_DIR = '/Users/ng492mh/Downloads/FR_cdip_preds/'

# Global variables

In [5]:
# Maximum Y distance used by `assign_group` to keep rows in the same line item.
line_threshold = 25
# Minimum IoU for `get_matched_gt_pred` to pair a GT box with a predicted box.
iou_threshold = 0.51
# Matches runs of non-alphanumeric characters (and underscores).
# Raw string: '\W' is not a valid str escape and triggers an invalid-escape warning otherwise.
pattern = re.compile(r'[\W_]+')
# Address labels that `get_line_item_dic` removes before building its dictionary.
address_labels = {
    'main|||vendor_address_street||primary_fields',
    'main|||vendor_address_city||primary_fields',
    'main|||vendor_address_postal_code||primary_fields',
    'main|||vendor_address_country||primary_fields',
    'main|||buyer_address_city||primary_fields',
    'main|||buyer_address_street||primary_fields',
    'main|||buyer_address_postal_code||primary_fields',
    'main|||buyer_address_country||primary_fields',
}

# Pre-process GT

In [6]:
def process_cdip_gt(gt_file, pdf_filename=None):
    """Load a cdip ground-truth JSON file into a DataFrame.

    Reads the records stored under key ``'1'``, derives ``bbox`` as
    ``[b[0], b[1], b[0] + b[2], b[1] + b[3]]`` and ``text`` as the last element
    of each ``bounding_box`` entry, renames ``classification_label`` to
    ``label`` and drops the raw/auxiliary columns.

    Parameters
    ----------
    gt_file : str
        Path to the annotation JSON file.
    pdf_filename : str, optional
        When truthy, stored in a ``file`` column on every row.

    Returns
    -------
    pandas.DataFrame
    """
    with open(gt_file) as handle:
        page_records = json.load(handle)['1']

    ## Adding most of the main fields to dataframe
    raw = pd.DataFrame(page_records)
    boxes = raw['bounding_box']
    annotated = raw.assign(
        bbox=boxes.apply(lambda b: [b[0], b[1], b[0] + b[2], b[1] + b[3]]),
        text=boxes.apply(lambda b: b[-1]),
    )
    annotated = (
        annotated
        .rename(columns={'classification_label': 'label'})
        .drop(columns=['activity_label', 'bounding_box', 'class_probability'])
    )
    if pdf_filename:
        annotated['file'] = pdf_filename
    return annotated

# Pre-process Pred

In [7]:
def combine_text(gt_df):
    """Merge rows that share both ``group_label`` and ``label`` into one row.

    Within each ``group_label`` group, rows with the same ``label`` are ordered
    by the left edge of their box (``bbox[0]``); their texts are joined with a
    single space and their boxes are replaced by the enclosing box. Labels that
    occur only once in a group are passed through unchanged.

    Parameters
    ----------
    gt_df : pandas.DataFrame
        Needs ``bbox``, ``text``, ``label`` and ``group_label`` columns.
        The frame is not modified.

    Returns
    -------
    pandas.DataFrame
        One row per (group_label, label) pair. Merged rows only carry the
        ``bbox``, ``text``, ``label`` and ``group_label`` columns; any other
        column is NaN for them. An empty frame is returned for empty input.
    """
    if len(gt_df) == 0:
        return pd.DataFrame()

    # Sort key lives on a private copy so the caller's frame is left untouched.
    frame = gt_df.assign(X=gt_df['bbox'].apply(lambda box: box[0]))

    pieces = []
    for _, group in frame.groupby('group_label'):
        for label in group['label'].unique():
            rows = group[group['label'] == label]
            if len(rows) > 1:
                group_label = rows.iloc[0]['group_label']
                rows = rows.sort_values(by='X')
                boxes = list(rows['bbox'])
                merged_bbox = [
                    min(box[0] for box in boxes),
                    min(box[1] for box in boxes),
                    max(box[2] for box in boxes),
                    max(box[3] for box in boxes),
                ]
                merged_text = ' '.join(rows['text']).strip()
                pieces.append(pd.DataFrame(
                    [[merged_bbox, merged_text, label, group_label]],
                    columns=['bbox', 'text', 'label', 'group_label']))
            else:
                pieces.append(rows.drop(columns=['X']))

    if not pieces:
        return pd.DataFrame()
    # Single concat instead of the removed (and quadratic) DataFrame.append.
    return pd.concat(pieces, ignore_index=True, sort=True)

In [8]:
def assign_group(df, line_threshold, list_prefix=None):
    """Assign a ``group_label`` to every row, grouping line-item rows by Y position.

    Rows whose ``label`` contains ``list_prefix`` are treated as line-item rows.
    They are sorted by the top edge of their box (``bbox[1]``) and walked in
    order: a row joins the current group while its Y is within
    ``line_threshold`` of the Y of the row that opened the group; otherwise it
    opens the next group. Groups are numbered from 1. All other rows keep
    ``group_label`` 0.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs ``label`` and ``bbox`` columns. A ``group_label`` column set to 0
        is written onto it (this frame itself is returned when there are no
        line-item rows).
    line_threshold : number
        Maximum Y distance from the first row of a group.
    list_prefix : str, optional
        Substring identifying line-item labels. Defaults to the notebook-level
        ``ListPrefix``.

    Returns
    -------
    pandas.DataFrame
        Line-item rows (sorted by Y, with a helper ``Y`` column) followed by
        the remaining rows.
    """
    if list_prefix is None:
        list_prefix = ListPrefix

    df["group_label"] = 0
    is_item = df["label"].apply(lambda x: list_prefix in x).astype(bool)
    if not is_item.any():
        return df

    # Copy the slice so the new columns are not written onto a view of `df`.
    items = df.loc[is_item].copy()
    others = df.loc[~is_item]

    items["Y"] = items["bbox"].apply(lambda box: box[1])
    items = items.sort_values(by="Y")

    labels = []
    group_label = 1
    anchor = items.iloc[0]["Y"]  # Y of the row that opened the current group
    for y in items["Y"]:
        if y - anchor > line_threshold:
            group_label += 1
            anchor = y
        labels.append(group_label)
    items["group_label"] = labels

    return pd.concat([items, others], ignore_index=True, sort=True)

In [9]:
def process_layoutlm_pred_file(gt_file, line_threshold, pdf_filename=None, cord=False,
                               use_psl_groups=False):
    """Load a layoutlm prediction JSON file into a DataFrame of labelled boxes.

    Reads the ``"predictions"`` list, derives ``bbox`` as
    ``[b[0], b[1], b[0] + b[2], b[1] + b[3]]`` and ``text`` as the last element
    of each ``bounding_box`` entry, removes ``"S-"`` from the labels and drops
    rows that have no label or the label ``"No class"``.

    Parameters
    ----------
    gt_file : str
        Path to the prediction JSON file (name kept for compatibility).
    line_threshold : number
        Forwarded to ``assign_group`` when groups are derived from Y positions.
    pdf_filename : str, optional
        When truthy, stored in a ``file`` column on every row.
    cord : bool
        When False, same-label rows of a group are merged with ``combine_text``.
    use_psl_groups : bool
        When True and the file carries ``PSL_lineitem_number``, that column is
        used as ``group_label`` (empty strings become 0). The default False
        keeps the previous behaviour of discarding it and calling
        ``assign_group`` instead.

    Returns
    -------
    pandas.DataFrame
        Empty when the file has no usable prediction.
    """
    with open(gt_file) as f:
        payload = json.load(f)

    ## Adding most of the main fields to dataframe
    pred_df = pd.DataFrame(payload["predictions"])
    if pred_df.shape[0] == 0:
        return pd.DataFrame()

    boxes = pred_df['bounding_box']
    pred_df['bbox'] = boxes.apply(lambda x: [x[0], x[1], x[0] + x[2], x[1] + x[3]])
    pred_df['text'] = boxes.apply(lambda x: x[-1])
    pred_df = pred_df.rename(columns={'classification_label': 'label'})

    # Drop missing labels first: str.replace below would fail on NaN.
    pred_df = pred_df[pred_df['label'].notna()].copy()
    pred_df['label'] = pred_df['label'].apply(lambda x: x.replace("S-", ""))
    pred_df = pred_df[pred_df['label'] != "No class"].copy()
    if pred_df.shape[0] == 0:
        return pd.DataFrame()

    pred_df = pred_df.drop(
        columns=['bounding_box', 'class_probability', 'PSL_classification_label', 'box_id', 'scores'],
        errors='ignore')

    if use_psl_groups and 'PSL_lineitem_number' in pred_df.columns:
        pred_df = pred_df.rename(columns={'PSL_lineitem_number': 'group_label'})
        pred_df.loc[pred_df["group_label"] == "", "group_label"] = 0
    else:
        # errors='ignore': files without the PSL column must not raise KeyError.
        pred_df = pred_df.drop(columns=['PSL_lineitem_number'], errors='ignore')
        pred_df = assign_group(pred_df, line_threshold)

    result_df = pred_df if cord else combine_text(pred_df)
    if pdf_filename:
        result_df['file'] = pdf_filename

    return result_df

# Get matched DataFrame

In [10]:
def box_inside(y_true, y_pred):
    """Return True when the overlap of the two boxes equals one of them.

    Both boxes are indexed as ``[0], [1], [2], [3]``; the overlap is built from
    the element-wise max of the first two entries and the element-wise min of
    the last two.
    """
    overlap = (
        max(y_true[0], y_pred[0]),
        max(y_true[1], y_pred[1]),
        min(y_true[2], y_pred[2]),
        min(y_true[3], y_pred[3]),
    )
    for box in (y_true, y_pred):
        if all(overlap[k] == box[k] for k in range(4)):
            return True
    return False
    

In [11]:
def _prepare_for_matching(df, side):
    """Copy ``df`` and add the X/Y merge keys and ``<side>_id``; rename bbox/group columns with ``side``."""
    prepared = df.drop(columns=['file'], errors='ignore').copy()
    prepared['X'] = prepared['bbox'].apply(lambda box: box[0])
    prepared['Y'] = prepared['bbox'].apply(lambda box: box[1])
    prepared[side + '_id'] = prepared.index
    return prepared.rename(columns={'group_label': side + '_group_label', 'bbox': side + '_bbox'})


def get_matched_gt_pred(gt_df, pred_df):
    """Pair ground-truth rows with prediction rows of one document.

    Matching happens in two passes:

    1. exact: rows agreeing on ``label``, ``text`` and the top-left corner of
       their boxes are paired through an inner merge;
    2. geometric: every remaining GT row is compared with every remaining
       prediction of the same label and paired when their IoU reaches the
       notebook-level ``iou_threshold`` or when ``box_inside`` holds.
       A row may take part in several pairs; this is unchanged from the
       original implementation.

    Unpaired rows of either side are kept with the other side's columns empty.
    The input frames are not modified.

    Parameters
    ----------
    gt_df, pred_df : pandas.DataFrame
        Need ``label``, ``text``, ``bbox`` and ``group_label`` columns; a
        ``file`` column is ignored.

    Returns
    -------
    pandas.DataFrame
        Columns ``label``, ``gt_text``, ``gt_bbox``, ``gt_group_label``,
        ``pred_text``, ``pred_bbox``, ``pred_group_label``.
    """
    gt = _prepare_for_matching(gt_df, 'gt')
    pred = _prepare_for_matching(pred_df, 'pred')

    # Get same (Label, X,Y,Text) data
    common = (
        gt.merge(pred, on=['label', 'text', 'X', 'Y'], how='inner')
        .drop(columns=['X', 'Y'])
        .rename(columns={'text': 'gt_text'})
    )
    common['pred_text'] = common['gt_text']

    gt_rest = gt[~gt['gt_id'].isin(common['gt_id'])].drop(columns=['X', 'Y'])
    pred_rest = pred[~pred['pred_id'].isin(common['pred_id'])].drop(columns=['X', 'Y'])

    # For GT rows without an exact match, look for a prediction of the same
    # label whose box is similar enough to be considered the same field.
    mix_columns = ['label', 'gt_group_label', 'gt_text', 'gt_bbox',
                   'pred_text', 'pred_bbox', 'pred_group_label']
    mix_rows = []
    used_gt_ids = []
    used_pred_ids = []
    for _, gt_row in gt_rest.iterrows():
        gt_label = gt_row['label']
        for _, pred_row in pred_rest.iterrows():
            if pred_row['label'] != gt_label:
                continue
            overlap = iou(gt_row['gt_bbox'], pred_row['pred_bbox'])
            if (overlap >= iou_threshold) or box_inside(gt_row['gt_bbox'], pred_row['pred_bbox']):
                used_pred_ids.append(pred_row['pred_id'])
                used_gt_ids.append(gt_row['gt_id'])
                mix_rows.append((gt_label, gt_row['gt_group_label'], gt_row['text'], gt_row['gt_bbox'],
                                 pred_row['text'], pred_row['pred_bbox'], pred_row['pred_group_label']))
    mix_df = pd.DataFrame(mix_rows, columns=mix_columns)

    # Rows left without a partner on either side.
    pred_nomatch = pred_rest[~pred_rest['pred_id'].isin(used_pred_ids)].rename(columns={'text': 'pred_text'})
    gt_nomatch = gt_rest[~gt_rest['gt_id'].isin(used_gt_ids)].rename(columns={'text': 'gt_text'})

    # The same assembly is used when every GT row matched exactly, so exact
    # matches keep their prediction and the output schema is always the same.
    combined_df = pd.concat([common, mix_df, pred_nomatch, gt_nomatch], sort=True, ignore_index=True)
    return combined_df.drop(columns=['pred_id', 'gt_id'], errors='ignore')

# HED Pre-processing

In [12]:
def sort_line_items(item_df):
    """Turn a line-item frame into a list of ``{column: text}`` dicts ordered by ``mid_y``.

    Each row becomes one dict holding, for every cell that is itself a dict,
    that cell's ``'text'`` entry keyed by the column name; other cells are
    skipped.

    Parameters
    ----------
    item_df : pandas.DataFrame or sequence
        Needs a ``mid_y`` column unless it is empty.

    Returns
    -------
    list of dict
        Empty list for empty input.
    """
    if len(item_df) == 0:
        return []
    ordered = item_df.sort_values(by=['mid_y'])
    # Series.items(): Series.iteritems() was removed in pandas 2.0.
    return [
        {name: cell['text'] for name, cell in row.items() if isinstance(cell, dict)}
        for _, row in ordered.iterrows()
    ]

In [13]:
def get_line_item_dic(df, cord=False, FR=False):
    """Build the dictionary of fields and line items for one document.

    Rows labelled with one of ``address_labels`` are removed; when ``cord`` is
    set the rest is passed through ``combine_text``. Rows are then split into
    line-item rows and other rows: by membership in ``CORD_lineitems`` /
    ``CORD_other_in_hed`` when ``cord and not FR``, otherwise by whether the
    label contains ``ListPrefix``.

    Returns
    -------
    dict
        ``{label: text}`` for the other rows plus an ``'Items'`` entry holding
        the output of ``sort_line_items`` for the line-item rows.
    """
    df = df[df['label'].apply(lambda lbl: lbl not in address_labels)]
    if cord:
        df = combine_text(df)

    labels = df['label']
    if cord and not FR:
        # only for layoutlm pred for cord data
        item_mask = labels.apply(lambda lbl: lbl in CORD_lineitems)
        other_mask = labels.apply(lambda lbl: lbl in CORD_other_in_hed)
    else:
        item_mask = labels.apply(lambda lbl: ListPrefix in lbl)
        other_mask = labels.apply(lambda lbl: ListPrefix not in lbl)
    item_df = df[item_mask]
    other_df = df[other_mask]

    dic = {record['label']: record['text'] for _, record in other_df.iterrows()}
    if len(item_df) > 0:
        item_df = get_item_df(item_df)
    dic['Items'] = sort_line_items(item_df)
    return dic
        

# Rule to match ground truth and prediction

In [14]:
def get_merged_df(PRED_DIR,GT_DIR, line_threshold=25, cord=False):
    """Match layoutlm predictions against ground truth for every file in ``PRED_DIR``.

    Only files whose name (minus its last five characters) ends in
    ``"pred_grouped"`` are processed; the ground-truth file name is derived
    from the part before ``"_abbyy_pred_grouped"``. Files without ground truth
    or without usable predictions are reported and skipped.

    Parameters
    ----------
    PRED_DIR, GT_DIR : str
        Directories with prediction and ground-truth JSON files.
    line_threshold : number
        Forwarded to ``process_layoutlm_pred_file``.
    cord : bool
        Selects the CORD ground-truth loader and file naming.

    Returns
    -------
    tuple
        ``(final_df, file_num, gt_hed_lists, pred_hed_lists)``: the matched
        rows of all files (missing values filled with 0, missing texts with
        ``''``), the number of files processed, and the per-file dictionaries
        from ``get_line_item_dic`` for ground truth and predictions.
        ``final_df`` is an empty frame when no file was processed.
    """
    suffix = "_abbyy_pred_grouped"
    matched_frames = []
    gt_hed_lists = []
    pred_hed_lists = []
    file_num = 0
    for pred_name in os.listdir(PRED_DIR):
        if pred_name == '.DS_Store':
            continue
        stem = pred_name[:-5]
        if stem[-12:] != "pred_grouped":
            continue
        file_prefix = stem[:-len(suffix)]
        pred_file = os.path.join(PRED_DIR, pred_name)
        if cord:
            gt_file = os.path.join(GT_DIR, file_prefix+'.json')
        else:
            gt_file = os.path.join(GT_DIR, file_prefix+'.pdf_annotations.json')
        if not os.path.exists(gt_file):
            print(f"{gt_file} doesn't exist")
            continue

        pred_df = process_layoutlm_pred_file(pred_file, line_threshold, os.path.basename(pred_file), cord)
        if pred_df.shape[0] == 0:
            print(f"{pred_file} has no preds")
            continue
        file_num += 1
        if cord:
            gt_df = process_cord_file(gt_file, pdf_filename=os.path.basename(gt_file))
        else:
            gt_df = process_cdip_gt(gt_file, pdf_filename=os.path.basename(gt_file))

        print(gt_file)
        # Was `cord.FR` (attribute access on a bool, AttributeError); the two
        # flags are separate arguments, as in the ground-truth call below.
        pred_li = get_line_item_dic(pred_df, cord, FR)
        gt_li = get_line_item_dic(gt_df, cord, FR)
        pred_hed_lists.append(pred_li)
        gt_hed_lists.append(gt_li)

        df = get_matched_gt_pred(gt_df, pred_df)
        df.loc[:, 'file'] = os.path.basename(gt_file)
        matched_frames.append(df)

    if not matched_frames:
        # Nothing processed: there are no text columns to clean up.
        return pd.DataFrame(), file_num, gt_hed_lists, pred_hed_lists

    final_df = pd.concat(matched_frames, sort=True, ignore_index=True).fillna(0)
    final_df['pred_text'] = final_df['pred_text'].replace(0, '')
    final_df['gt_text'] = final_df['gt_text'].replace(0, '')
    return final_df, file_num, gt_hed_lists, pred_hed_lists


In [15]:
def get_merged_df_FR(PRED_DIR,GT_DIR,cord):
    """Match FR predictions against ground truth for every file in ``PRED_DIR``.

    Prediction files are loaded with ``process_pred_file`` (using the
    notebook-level ``multiplier``); labels on both sides are translated through
    the notebook-level ``field_mapping`` and rows without a mapped label are
    dropped. Files without a ground-truth counterpart or without predictions
    are skipped silently.

    Parameters
    ----------
    PRED_DIR, GT_DIR : str
        Directories with prediction and ground-truth JSON files.
    cord : bool
        Selects the CORD ground-truth loader and file naming.

    Returns
    -------
    tuple
        ``(final_df, file_num, gt_hed_lists, pred_hed_lists)``, same layout as
        ``get_merged_df``. ``final_df`` is an empty frame when no file was
        processed.
    """
    matched_frames = []
    file_num = 0
    gt_hed_lists = []
    pred_hed_lists = []
    mapped_labels = set(field_mapping.values())
    for pred in os.listdir(PRED_DIR):
        if pred == '.DS_Store':
            continue
        if cord:
            gt_file = os.path.join(GT_DIR, pred)
        else:
            gt_file = os.path.join(GT_DIR, pred[:-5]+'.pdf_annotations.json')
        pred_file = os.path.join(PRED_DIR, pred)
        if not os.path.exists(gt_file):
            continue
        pred_df,item_df,_,_ = process_pred_file(pred_file, pdf_filename=pred,multiplier=multiplier)
        pred_df = pred_df[pred_df['label'].notna()]
        if len(item_df) > 0:
            item_df = item_df.assign(label=item_df['label'].map(field_mapping))
            item_df = item_df[item_df['label'].notna()]
            pred_df = pd.concat([pred_df, item_df], sort=True, ignore_index=True)
        if pred_df.shape[0] == 0:
            continue
        file_num += 1
        if cord:
            gt_df = process_cord_file(gt_file, os.path.basename(gt_file))
        else:
            gt_df = process_cdip_gt(gt_file, os.path.basename(gt_file))
        # Keep only ground-truth rows whose label has a mapping. The original
        # assigned the filtered frame to the 'label' column instead of filtering.
        gt_df = gt_df[gt_df['label'].isin(list(field_mapping.keys()))].copy()
        gt_df['label'] = gt_df['label'].map(field_mapping)
        gt_df = gt_df[gt_df['label'].notna()]
        pred_df = pred_df[pred_df['label'].apply(lambda x: x in mapped_labels)]
        pred_df = pred_df[pred_df['label'].notna()]

        pred_li = get_line_item_dic(pred_df,cord,FR)
        gt_li = get_line_item_dic(gt_df,cord,FR)
        pred_hed_lists.append(pred_li)
        gt_hed_lists.append(gt_li)

        df = get_matched_gt_pred(gt_df, pred_df)
        df.loc[:, 'file'] = os.path.basename(pred)
        matched_frames.append(df)

    if not matched_frames:
        # Nothing processed: there are no label/text columns to clean up.
        return pd.DataFrame(), file_num, gt_hed_lists, pred_hed_lists

    final_df = pd.concat(matched_frames, sort=True, ignore_index=True)
    final_df = final_df[final_df['label'].notna()]
    final_df = final_df.fillna(0)
    final_df['pred_text'] = final_df['pred_text'].replace(0, '')
    final_df['gt_text'] = final_df['gt_text'].replace(0, '')
    return final_df, file_num, gt_hed_lists, pred_hed_lists

# Match ground truth and prediction

In [16]:
# Run the matching pipeline selected by the `FR` flag. Both functions return
# the matched frame, the number of files processed, and the per-file
# ground-truth / prediction dictionaries used for the HED metrics.
if FR:
    final_df,file_num, gt_hed_lists, pred_hed_lists = get_merged_df_FR(PRED_DIR,GT_DIR,cord)
else:
    final_df,file_num, gt_hed_lists, pred_hed_lists = get_merged_df(PRED_DIR,GT_DIR,line_threshold,cord)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().rename(**kwargs)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


In [17]:
final_df[final_df['label']==0]

Unnamed: 0,file,gt_bbox,gt_group_label,gt_text,label,pred_bbox,pred_group_label,pred_text


In [18]:
file_num

93

In [19]:
final_df.shape

(1013, 8)

In [20]:
final_df[final_df['pred_text']==''].shape

(180, 8)

In [21]:
final_df[final_df['gt_text']==''].shape

(136, 8)

In [46]:
final_df.head()

Unnamed: 0,file,gt_bbox,gt_group_label,gt_text,label,pred_bbox,pred_group_label,pred_text
0,CORD_001_test_receipt_00056.json,"(250, 1050, 562, 1089)",3,AIR MINERAL,ItemName,"[251.4, 1053.6, 553.6, 1088.4]",1,AIR MINERAL
1,CORD_001_test_receipt_00056.json,"(787, 1045, 928, 1081)",3,8181,ItemTotalPrice,"[768, 1044, 926, 1083]",1,"· 8, 181"
2,CORD_001_test_receipt_00056.json,"(824, 1136, 928, 1174)",4,818,Tax,"[819, 1136, 916, 1173]",0,· 818
3,CORD_001_test_receipt_00056.json,"(612, 1182, 922, 1222)",5,8999,Total,"[608.9, 1182, 893.1, 1221.5]",0,"· 8,999"
4,CORD_001_test_receipt_00056.json,0,0,,Subtotal,"[766.1, 1084.5, 924.9, 1130.3]",0,"· 8, 181"


# IoU metrics

In [23]:
def get_iou_metrics(df):
    """Tag one matched row as TP, FP or FN and record its IoU.

    A falsy ``gt_bbox`` marks the row ``FP``; otherwise a falsy ``pred_bbox``
    marks it ``FN``; otherwise it is a ``TP`` and ``iou`` holds the IoU of the
    two boxes. ``iou`` is 0 for FP and FN rows. The row is updated and returned.
    """
    overlap = 0
    if df['gt_bbox'] and df['pred_bbox']:
        overlap = iou(df['gt_bbox'], df['pred_bbox'])
        df['TP'] = 1
    elif df['gt_bbox']:
        df['FN'] = 1
    else:
        df['FP'] = 1
    df['iou'] = overlap
    return df

In [24]:
def get_precision(df):
    """Return ``TP / (TP + FP)`` for a row, or 0 when ``TP + FP`` is 0."""
    predicted = df['TP'] + df['FP']
    return df['TP'] / predicted if predicted != 0 else 0
def get_recall(df):
    """Return ``TP / (TP + FN)`` for a row, or 0 when ``TP + FN`` is 0."""
    actual = df['TP'] + df['FN']
    return df['TP'] / actual if actual != 0 else 0
def get_f1(df,prec_col, recall_col):
    """Return the harmonic mean of ``df[prec_col]`` and ``df[recall_col]``, or 0 when their sum is 0."""
    precision, recall = df[prec_col], df[recall_col]
    total = precision + recall
    if total == 0:
        return 0
    return 2*precision*recall / total
def get_iou_label(df):
    """Aggregate the TP/FP/FN columns of ``df`` into precision, recall, F1 and row count.

    Returns a Series indexed ``iou_precision``, ``iou_recall``, ``iou_f1``,
    ``support``; each ratio is 0 when its denominator is 0.
    """
    def ratio(numerator, denominator):
        return 0 if denominator == 0 else numerator / denominator

    true_pos = sum(df["TP"])
    false_neg = sum(df["FN"])
    false_pos = sum(df["FP"])

    precision = ratio(true_pos, true_pos + false_pos)
    recall = ratio(true_pos, true_pos + false_neg)
    f1 = ratio(2*precision*recall, precision + recall)

    return pd.Series(
        [precision, recall, f1, int(len(df))],
        index=['iou_precision', 'iou_recall', 'iou_f1', 'support'],
    )


In [25]:
def get_iou_weighted_mean(iou_metrics_df,prefix, volumn_col):
    """Average the ``<prefix>_precision/_recall/_f1`` columns weighted by ``volumn_col``.

    The weight total is offset by 1e-10 so an all-zero weight column does not
    divide by zero. Returns a one-row DataFrame with ``prec_mean``,
    ``recall_mean`` and ``f1_mean`` rounded to two decimals.
    """
    weights = iou_metrics_df[volumn_col]
    total = weights.sum() + 1e-10

    def weighted(metric):
        return round(sum(iou_metrics_df[prefix + '_' + metric] * weights) / total, 2)

    summary = {
        'prec_mean': weighted('precision'),
        'recall_mean': weighted('recall'),
        'f1_mean': weighted('f1'),
    }
    return pd.DataFrame([summary])

In [26]:
def get_iou_metrics_df(df):
    """Compute per-label IoU precision/recall/F1 and their support-weighted mean.

    Each row of ``df`` is tagged by ``get_iou_metrics``; the tags are then
    aggregated per ``label`` with ``get_iou_label``.

    Returns
    -------
    tuple
        ``(per_label_metrics, weighted_mean)`` DataFrames.
    """
    boxes = df.loc[:, ['label','gt_bbox', 'pred_bbox']]
    tagged = boxes.apply(get_iou_metrics, axis=1).fillna(0)
    counts = tagged[['FN','FP','TP','label']]
    per_label = counts.groupby('label').apply(get_iou_label)
    return per_label, get_iou_weighted_mean(per_label, 'iou', 'support')

In [27]:
iou_metrics_df, iou_mean_df = get_iou_metrics_df(final_df)

In [28]:
iou_metrics_df

Unnamed: 0_level_0,iou_precision,iou_recall,iou_f1,support
label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
ItemName,0.873418,0.880851,0.877119,265.0
ItemPrice,0.97619,0.621212,0.759259,67.0
ItemQuantity,0.880503,0.657277,0.752688,232.0
ItemTotalPrice,0.902655,0.886957,0.894737,252.0
Subtotal,0.0,0.0,0.0,58.0
Tax,0.958333,0.522727,0.676471,45.0
Total,0.942529,0.921348,0.931818,94.0


In [29]:
iou_mean_df

Unnamed: 0,prec_mean,recall_mean,f1_mean
0,0.85,0.75,0.79


# Text-Metrics

In [30]:
def get_text_metrics(df):
    """Compute per-label text-similarity metrics between ``gt_text`` and ``pred_text``.

    For every row this computes exact match (``direct``), exact match after
    removing the characters matched by the notebook-level ``pattern``
    (``direct_alpha``), normalized ``levenshtein_distance`` on both variants,
    and precision/recall/F1 from the first three values returned by
    ``lc_subsequence``. Rows are then averaged per ``label``.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs ``pred_text``, ``gt_text`` and ``label`` columns; not modified.

    Returns
    -------
    pandas.DataFrame
        Indexed by label, with the mean of each metric and a ``count`` column.
    """
    # Explicit copy: the columns added below must not be written onto a slice of the input.
    df = df.loc[:, ['pred_text','gt_text','label']].copy()
    df['direct'] = (df['gt_text'] == df['pred_text']).astype(float)
    df["gt_text_alpha"] = df["gt_text"].apply(lambda x: pattern.sub('',x)) # Remove non-alphanumeric characters
    df["pred_text_alpha"] = df["pred_text"].apply(lambda x: pattern.sub('',x)) # Remove non-alphanumeric characters
    df['direct_alpha'] = (df['gt_text_alpha'] == df['pred_text_alpha']).astype(float)
    df['levenshtein'] = df.apply(lambda x: levenshtein_distance(x['gt_text'], x['pred_text'], normalize=True), axis=1)
    df['levenshtein_alpha'] = df.apply(lambda x: levenshtein_distance(x['gt_text_alpha'], x['pred_text_alpha'], normalize=True), axis=1)
    df['lcsubsequence'] = df.apply(lambda x: lc_subsequence(x['gt_text'], x['pred_text']), axis=1)
    df['TP'] = df['lcsubsequence'].apply(lambda x:x[0])
    df['FP'] = df['lcsubsequence'].apply(lambda x:x[1])
    df['FN'] = df['lcsubsequence'].apply(lambda x:x[2])
    df['lcs_precision'] = df.apply(get_precision, axis=1)
    df['lcs_recall'] = df.apply(get_recall, axis=1)
    df['lcs_f1'] = df.apply(lambda x:get_f1(x, 'lcs_precision', 'lcs_recall'), axis=1)
    counts = pd.Series(df.groupby(['label']).count()['gt_text'], name='count')
    # 'lcsubsequence' holds non-numeric objects; it must go before averaging,
    # because recent pandas raises instead of silently skipping such columns.
    df = df.drop(columns=['pred_text','gt_text','TP','FP','FN','gt_text_alpha','pred_text_alpha','lcsubsequence'])
    metrics_df = pd.merge(df.groupby(['label']).mean(), counts, on=['label'])
    for col in ('direct_alpha', 'levenshtein', 'levenshtein_alpha'):
        metrics_df[col] = metrics_df[col].apply(lambda x: round(x,6))
    return metrics_df



In [31]:
lcs_df=get_text_metrics(final_df)

In [32]:
lcs_df

Unnamed: 0_level_0,direct,direct_alpha,levenshtein,levenshtein_alpha,lcs_precision,lcs_recall,lcs_f1,count
label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
ItemName,0.664151,0.679245,0.756157,0.757759,0.771977,0.762168,0.764449,265
ItemPrice,0.567164,0.641791,0.605188,0.641791,0.605188,0.609453,0.607157,67
ItemQuantity,0.577586,0.586207,0.585206,0.592098,0.585206,0.594828,0.586806,232
ItemTotalPrice,0.746032,0.793651,0.793934,0.802646,0.794596,0.799036,0.796328,252
Subtotal,0.0,0.0,0.0,0.0,0.0,0.0,0.0,58
Tax,0.4,0.511111,0.474074,0.537284,0.481481,0.487407,0.481852,45
Total,0.787234,0.851064,0.856095,0.867173,0.856095,0.865248,0.860073,94


In [33]:
get_iou_weighted_mean(lcs_df,'lcs', 'count')

Unnamed: 0,prec_mean,recall_mean,f1_mean
0,0.67,0.68,0.67


# HED for Line items

In [35]:
len(gt_hed_lists)

93

In [36]:
len(pred_hed_lists)

93

In [37]:
gt_hed_lists[0]

{'Tax': '818',
 'Total': '8,999',
 'Items': [{'ItemName': 'AIR MINERAL', 'ItemTotalPrice': '8,181'}]}

In [38]:
pred_hed_lists[0]

{'Items': [{'ItemName': 'AIR MINERAL', 'ItemTotalPrice': '· 8, 181'}]}

In [39]:
def get_hed_document(gt_item_list, pred_item_list):
    """Return the mean document-level HED precision, recall and F1.

    For each (ground truth, prediction) pair the first element returned by
    ``hed`` is taken as a ``TP, FP, FN`` triple; precision, recall and F1 are
    computed per document and then averaged over all documents.
    """
    per_document = []
    for gt_item, pred_item in zip(gt_item_list, pred_item_list):
        per_document.append(hed(gt_item, pred_item)[0])

    scores = pd.DataFrame(np.stack(per_document), columns=['TP', 'FP', 'FN'])
    scores['precision'] = scores.apply(get_precision, axis=1)
    scores['recall'] = scores.apply(get_recall, axis=1)
    scores['f1-score'] = scores.apply(lambda row: get_f1(row, "precision", "recall"), axis=1)
    return scores.drop(columns=['TP', 'FP', 'FN']).mean()

In [40]:
def get_hed_label(gt_item_list, pred_item_list):
    """Return label-level HED precision/recall/F1 and their overall mean.

    For each (ground truth, prediction) pair the second element returned by
    ``hed`` is turned into a frame, transposed and stacked with the others;
    after transposing, columns 0, 1 and 2 are read as TP, FP and FN and the
    former row index ends up in the ``index`` column.
    NOTE(review): presumably ``hed(...)[1]`` maps each label to a
    (TP, FP, FN) triple - confirm against ``di_metrics``.

    Returns
    -------
    tuple
        ``(label_df, mean_df)``: the scores averaged per ``index`` value, and a
        Series with the scores averaged over all stacked rows.
    """
    score_columns = ['precision', 'recall', 'f1-score']
    results = [hed(gt_item, pred_item)[1] for gt_item, pred_item in zip(gt_item_list, pred_item_list)]
    hed_df = pd.concat([pd.DataFrame(result).T.reset_index() for result in results])
    hed_df = hed_df.rename(columns={0:'TP', 1:'FP', 2:'FN'})
    hed_df['precision'] = hed_df.apply(get_precision, axis=1)
    hed_df['recall'] = hed_df.apply(get_recall, axis=1)
    hed_df['f1-score'] = hed_df.apply(lambda x: get_f1(x, "precision", "recall"), axis=1)
    label_df = hed_df.drop(columns=['TP', 'FP', 'FN']).groupby('index').mean()
    # Average only the score columns: the string 'index' column cannot be
    # averaged and makes recent pandas raise instead of skipping it.
    mean_df = hed_df[score_columns].mean()
    return label_df, mean_df

### Document-level Mean HED

In [41]:
get_hed_document(gt_hed_lists, pred_hed_lists)

precision    0.909858
recall       0.715767
f1-score     0.784926
dtype: float64

### Label-level HED and Mean 

In [42]:
label_df, mean_df = get_hed_label(gt_hed_lists, pred_hed_lists)


In [43]:
mean_df

precision    0.597745
recall       0.538157
f1-score     0.553726
dtype: float64

In [44]:
label_df

Unnamed: 0_level_0,precision,recall,f1-score
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
ItemName,0.952076,0.864072,0.887519
ItemPrice,0.724747,0.663939,0.683808
ItemQuantity,0.741972,0.602681,0.634511
ItemTotalPrice,0.906737,0.862978,0.876569
Tax,0.0,0.0,0.0
Total,0.0,0.0,0.0


# Line threshold vs Rule performance vs PSL performance

In [45]:
# final = []
# for threshold in [1,3,5,7,10,15,20,25,30,40,45,50,60,70,80]:
#     final_df,file_num, gt_item_list, pred_item_list = get_merged_df(PRED_DIR,GT_DIR, threshold)
#     data = get_hed_document(gt_item_list, pred_item_list).to_dict()
#     data["Y-distance threshold"] = threshold
#     final.append(data)
# pd.DataFrame(final).to_csv("Y_distance_threshold_vs_metrics_nopsl.csv", index=False)