In [1]:
### This notebook applies header detection and table picking methods to Detectron and Textract output

In [1]:
def extra_row_remover(df):
    """Drop rows that have at most one filled-in value.

    A row is kept only when at least two of its cells are non-null. The
    input frame is left untouched: no helper column is written to it, so the
    caller's data frame keeps its original columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Table to filter.

    Returns
    -------
    pandas.DataFrame
        The surviving rows with a fresh 0..n-1 index.
    """
    # Number of empty cells in each row
    na_count = df.isnull().sum(axis=1)
    # Keep a row when fewer than (n_columns - 1) cells are empty,
    # i.e. when it holds two or more values.
    keep = na_count < (len(df.columns) - 1)
    return df[keep].reset_index(drop=True)

In [2]:
def _detect_header_and_save(df, source, path):
    """Run header detection on ``df`` and write both results to S3.

    Parameters
    ----------
    df : pandas.DataFrame
        Table that has already been passed through ``extra_row_remover``.
    source : str
        Extraction source name; used as the sub-folder under
        ``postproc_output_dir``.
    path : path-like
        Location of the input CSV. Only its last component is reused, as the
        output file name.
    """
    file_name = str(path).split('/')[-1]
    output_prefix = postproc_output_dir + source + "/" + UUID + "/"
    # A separate detector is built for each method, in the same order as before.
    # NOTE(review): whether a single instance can serve both calls is not
    # visible from this notebook - confirm before merging them.
    df_with_header = nlp_proc.header_detector(df, mf_rr_kb).replace_header()
    _, zscores = nlp_proc.header_detector(df, mf_rr_kb).get_header_candidates()
    zscores = pd.DataFrame(zscores, columns=['idx', 'v', 'z'])
    # Save output to S3
    df_with_header.to_csv(output_prefix + file_name)
    zscores.to_csv(output_prefix + "_zscores_" + file_name)


def extraction_postproc(path, min_table_cells=15):
    """Post-process one extracted table CSV and save the result to S3.

    The extraction source is chosen by looking for 'detectron' or 'textract'
    in the path string. Relies on the notebook-level names ``pd``,
    ``nlp_proc``, ``mf_rr_kb``, ``postproc_output_dir`` and ``UUID``, which
    must be defined before this function is called.

    Parameters
    ----------
    path : S3Path
        Location of the table CSV. Its string form is prefixed with "s3:/"
        for pandas, and ``path.open()`` is used for the Textract pre-scan.
    min_table_cells : int, default 15
        Detectron tables whose rows * columns count is at or below this value
        are reported and not processed.

    Returns
    -------
    None
        Results are written to S3; skipped tables are reported with ``print``.
    """
    path_str = str(path)

    if 'detectron' in path_str:
        # For detectron, drop first (index) column
        df = pd.read_csv("s3:/" + path_str).iloc[:, 1:]
        df = extra_row_remover(df)
        # Use shape of table heuristic to determine if the table is useful or not:
        # a small row*col count classifies the table as header or other information.
        # Analysis was conducted and documented in Damian Doc Extraction Whiteboard
        if df.shape[0] * df.shape[1] <= min_table_cells:
            # TODO: document that this table was labelled as extraneous somehow
            print(f'Table: {path} contains minimal data, not processed')
        else:
            _detect_header_and_save(df, "detectron", path)

    elif 'textract' in path_str:
        with path.open() as f:
            content = f.readlines()
        # Lines mentioning 'Table'; more than one means the CSV holds several subtables
        table_marker_rows = [i for i, line in enumerate(content) if 'Table' in line]
        if len(table_marker_rows) > 1:
            print(f"Table: {path} has multiple subtables, code in development")
        else:
            # The first line is skipped and no row is treated as a header
            df = pd.read_csv("s3:/" + path_str, skiprows=1, header=None)
            df = extra_row_remover(df)
            _detect_header_and_save(df, "textract", path)

    else:
        print('Extraction source not recognized')

In [3]:
# Textract can have multiple tables in a single CSV - this would be indicated by a row that begins with "Table" and has nothing else

In [4]:
import pandas as pd
import boto3
from s3path import S3Path

# Import custom modules
%load_ext autoreload
%autoreload 2
#!python -m spacy download en_core_web_lg
import sys
sys.path.append('../')
from normalize import organize, nlp_proc

In [5]:
# Run configuration (currently hard-coded).
# TODO: the parameters should be UUID and source (Detectron vs. Textract... eventually Google DocumentAI too)
# UUID selects the document's folders under the extraction output locations
# and the sub-folder the post-processed files are written to.
UUID = 'f3d3fe84-a2ca-11eb-9113-666251992ff6'
# S3 prefix under which extraction_postproc saves its CSV output
postproc_output_dir = "s3://tab-data-extraction-sandbox/postproc_output/"

In [6]:
# Read in the Multifamily header knowledge base.
# This frame is the second argument of every nlp_proc.header_detector call in this notebook.
mf_rr_kb = pd.read_csv('s3://tab-data-extraction-sandbox/manual_review/rr_multifamily_header.csv')

In [7]:
# List the table CSVs found for this UUID under each extractor's output folder
rr_path = S3Path('/dataingest-pdfextraction-output/')
detectron_paths = list(rr_path.glob(f'detectron_output/{UUID}/*.csv'))
textract_paths = list(rr_path.glob(f'textract_output/{UUID}.pdf-analysis/*tables.csv'))

In [8]:
detectron_paths[23]

S3Path('/dataingest-pdfextraction-output/detectron_output/f3d3fe84-a2ca-11eb-9113-666251992ff6/Page 4 - Table 2.csv')

In [9]:
# Load one example table per extraction source
# For detectron, drop first (index) column
detectron_example = pd.read_csv(f"s3:/{detectron_paths[23]}").iloc[:, 1:]
# For textract, skip the first line and treat no row as the header
textract_example = pd.read_csv(f"s3:/{textract_paths[0]}", skiprows=1, header=None)

In [10]:
# Detectron run is currently disabled:
# for p in detectron_paths:
#     extraction_postproc(p)

# Post-process every Textract table, printing each path as it is handled
for p in textract_paths:
    print(p)
    extraction_postproc(p)

/dataingest-pdfextraction-output/textract_output/f3d3fe84-a2ca-11eb-9113-666251992ff6.pdf-analysis/page-1-tables.csv
  sim_mat = [[x.similarity(y) for x in to_map_pipe] for y in kb_pipe]
  sim_mat = [[x.similarity(y) for x in to_map_pipe] for y in kb_pipe]
/dataingest-pdfextraction-output/textract_output/f3d3fe84-a2ca-11eb-9113-666251992ff6.pdf-analysis/page-10-tables.csv
  sim_mat = [[x.similarity(y) for x in to_map_pipe] for y in kb_pipe]
  sim_mat = [[x.similarity(y) for x in to_map_pipe] for y in kb_pipe]
/dataingest-pdfextraction-output/textract_output/f3d3fe84-a2ca-11eb-9113-666251992ff6.pdf-analysis/page-11-tables.csv
  sim_mat = [[x.similarity(y) for x in to_map_pipe] for y in kb_pipe]
  sim_mat = [[x.similarity(y) for x in to_map_pipe] for y in kb_pipe]
/dataingest-pdfextraction-output/textract_output/f3d3fe84-a2ca-11eb-9113-666251992ff6.pdf-analysis/page-12-tables.csv
  sim_mat = [[x.similarity(y) for x in to_map_pipe] for y in kb_pipe]
  sim_mat = [[x.similarity(y) for x in 

### For picking between Textract and Detectron, also consider the number of pages with results

### Apply header detection to Detectron and Textract

In [9]:
# Remove extraneous rows from both example tables
# (extra_row_remover drops rows that have at most one filled-in value)
df_detectron = extra_row_remover(detectron_example)
df_textract = extra_row_remover(textract_example)

In [10]:
# Test replace_header method on the Detectron example.
# NOTE(review): despite its name, header_start_end holds the result of
# replace_header(), not of get_header_start_end().
header_start_end = nlp_proc.header_detector(df_detectron, mf_rr_kb).replace_header()
header_start_end

  sim_mat = [[x.similarity(y) for x in to_map_pipe] for y in kb_pipe]


Unnamed: 0,Unit,Unit Type,Unit Resident Sq Ft,Name\nMarket Rent,Actual Rent,Resident Deposit,Other Move In Deposit,Lease Expiration,Move Out,Balance
0,6315,3698a1,592.00 t2665776,"Keilee Green\n1,086.00",1078.0,629.5,0.00 8/9/2019,8/8/2021,,132.34
1,6316,3698a1,592.00 t2649445,"Rodney Oatts\n1,086.00",1139.0,100.0,0.00 6/21/2019,6/20/2021,,0.0
2,6317,3698a1,592.00 t3732813,"Nicholas Gugliemelli\n1,086.00",1041.0,100.0,0.00 3/11/2019,3/10/2021,,0.0
3,6318,3698b1,"1,169.00 t2647017","Tyeesha Smith\n1,632.00",1459.0,200.0,0.00 5/10/2019,5/9/2021,,0.0
4,7101,3698b2,"1,259.00 t2724209","Alexandre Claro Bitencourt De Sousa\n1,823.00",1521.0,200.0,0.00 4/2/2020,2/1/2021,,0.0
5,7102,3698a3g,737.00 t2666531,"Sylvia Freeman\n1,296.00",1210.0,693.5,0.00 8/16/2019,8/15/2021,,0.0
6,7103,3698a3g,737.00 t2871216,"Mark Woods\n1,296.00",1204.0,100.0,0.00 1/19/2018,2/18/2021,,681.36
7,7104,3698a3g,737.00 t2742020,"Kayode Holbrook\n1,296.00",1273.0,100.0,0.00 9/15/2020,9/14/2021,,0.0
8,7105,3698a2g,826.00 t2738920,"Jonathan Robertson Jr\n1,483.00",1519.0,0.0,0.00 6/18/2020,6/17/2021,,-247.58
9,7106,3698a5g,791.00 t3421286,"William Fulton\n1,408.00",1530.0,100.0,0.00 10/27/2018,10/26/2020,,118.0


In [None]:
df_detectron.head()

In [None]:
# Tabulate the second return value of get_header_candidates and add
# the row-to-row difference of the z column as deltaz
_, zscore = nlp_proc.header_detector(df_detectron, mf_rr_kb).get_header_candidates()
z_df = pd.DataFrame(zscore, columns=['idx', 'v', 'z']).assign(
    deltaz=lambda frame: frame['z'].diff()
)

In [None]:
z_df

In [None]:
df_detectron.head()

In [None]:
# Test get_header_start_end method on the Textract example
start_end = nlp_proc.header_detector(df_textract, mf_rr_kb).get_header_start_end()
start_end

In [None]:
# Create the final data frame with organized header (Textract example only)
# NOTE(review): `tmp` is read by the following cells; give it a descriptive name when those are updated too.
tmp = nlp_proc.header_detector(df_textract, mf_rr_kb).replace_header()

In [None]:
tmp.head()

In [None]:
tmp.columns

In [None]:
# Test coercion to numeric or date: if it fails, set the column as string; if it doesn't, set the data type
num_table = df_detectron.apply(organize.numeric_cleaner)

In [None]:
# Drop every row of num_table that is entirely NaN (not only the leading ones)
num_table = num_table.dropna(how='all')
# Index label of the first row that is not entirely NaN
data_start = num_table.index[0]
# Calculate share of NaN values per column, then keep only columns below 75% NaN
na_share = num_table.isnull().sum() / len(num_table)
na_share = na_share[na_share < 0.75]
# Disabled experiment, kept as comments so the cell does not echo it as output:
# edited_table = orig_table.copy()
# for col in na_share.index:
#     edited_table[[col]] = num_table[[col]]

In [None]:
na_share

In [None]:
num_table.index[0]

### Run the function across all PDF output and save the full z-score table, including its diffs, so the delta threshold can be fine-tuned using data

### Apply numeric-cleaner ($ and - handling)

### TODO: Apply DITTO to table reading 
 1. Read list of manually labelled rent rolls
 2. Apply similarity based method for predicted vs. actual labelling
 3. Leverage misclassified examples via similarity score as negative examples in language model fine tuning
 4. Structure data to send through transformer model (potentially just use DITTO out of the box to begin)