# Bill info extractor

This notebook provides a demo showing how one can process a document (pdf or image) using the Parsr pipeline's API interface to generate its various outputs.

## Module Import

In [1]:
from parsr_client import ParsrClient as client
from output_renderer import RenderMarkdown, RenderJSON, RenderHTML
import json
import pandas as pd
import numpy as np

In [2]:
REGEX_TYPE = "regex"

## Initialize the client object

In [3]:
parsr = client('localhost:3001')

## Send document for processing

In [4]:
job = parsr.send_document(
    file='./examples/Invoice_modified.pdf',
    config='./defaultConfig.json',
    document_name='Sample File2',
    wait_till_finished=True,
    save_request_id=True,
)

RenderJSON(job)

> Polling server for the job 1f711b9dd961d6d94592464e290a59...
>> Job done!
{
    [94m"config"[39;49;00m: [33m"./defaultConfig.json"[39;49;00m,
    [94m"file"[39;49;00m: [33m"./examples/Invoice_modified.pdf"[39;49;00m,
    [94m"server_response"[39;49;00m: [33m"1f711b9dd961d6d94592464e290a59"[39;49;00m,
    [94m"status_code"[39;49;00m: [34m202[39;49;00m
}



<output_renderer.RenderJSON at 0x7fbdc46b6e20>

## Get the full JSON output

In [None]:
# Careful - it can be long!
# Uncomment the following:

# RenderJSON(
#    parsr.get_json()
# )

In [6]:
def flatten_list_of_arrays(list_of_arrays):
    return [el for item in list_of_arrays for el in item]

In [7]:

json_object = parsr.get_json()
metadata = pd.read_json(json.dumps(json_object['metadata']))

In [8]:
metadata

Unnamed: 0,id,elements,type,data
0,1,[8],regex,"{'name': 'Facture', 'regex': 'facture', 'fullM..."
1,2,[123],regex,"{'name': 'Postal code', 'regex': '\d{5}', 'ful..."
2,3,[123],regex,"{'name': 'Postal code', 'regex': '\d{5}', 'ful..."
3,4,[291],regex,"{'name': 'Postal code', 'regex': '\d{5}', 'ful..."
4,5,[151],regex,"{'name': 'Facture', 'regex': 'facture', 'fullM..."
5,6,[197],regex,"{'name': 'Facture', 'regex': 'facture', 'fullM..."
6,7,[225],regex,"{'name': 'Postal code', 'regex': '\d{5}', 'ful..."
7,8,[225],regex,"{'name': 'Postal code', 'regex': '\d{5}', 'ful..."
8,9,"[245, 246]",regex,"{'name': 'Bill amount', 'regex': '\d+([\.,]\d{..."
9,10,[484],regex,"{'name': 'Postal code', 'regex': '\d{5}', 'ful..."


In [9]:
metadata = metadata[metadata['type'] == REGEX_TYPE]
# metadata = metadata[metadata['data']['name'] == 'Bill amount']
regexp_metadata = pd.DataFrame(list(metadata['data']))
# regexp_metadata = regexp_metadata[regexp_metadata['name'] == 'Bill amount']
regexp_metadata
regexp_metadata.drop(columns=['regex', 'groups'], inplace=True)

metadata.join(regexp_metadata)
metadata.drop(columns=['data'], inplace=True)
metadata

Unnamed: 0,id,elements,type
0,1,[8],regex
1,2,[123],regex
2,3,[123],regex
3,4,[291],regex
4,5,[151],regex
5,6,[197],regex
6,7,[225],regex
7,8,[225],regex
8,9,"[245, 246]",regex
9,10,[484],regex


In [10]:
elements_to_inspect = np.array(flatten_list_of_arrays(metadata['elements']))
# [el for element in metadata['elements'] for el in element ]
elements_to_inspect

array([   8,  123,  123,  291,  151,  197,  225,  225,  245,  246,  484,
        532,  701,  722,  722,  825,  825,  847, 1159,  998,  999, 1003,
       1004, 1010, 1011, 1017, 1018, 1172, 1179, 1180, 1206, 1207, 1213,
       1214, 1220, 1221, 1233, 1234, 1240, 1241])

In [11]:
elements_to_inspect

array([   8,  123,  123,  291,  151,  197,  225,  225,  245,  246,  484,
        532,  701,  722,  722,  825,  825,  847, 1159,  998,  999, 1003,
       1004, 1010, 1011, 1017, 1018, 1172, 1179, 1180, 1206, 1207, 1213,
       1214, 1220, 1221, 1233, 1234, 1240, 1241])

In [12]:
pages = pd.read_json(json.dumps(json_object['pages']))
pages_element = pd.DataFrame(list(pages['elements']))

In [13]:
page_element = pd.DataFrame(list(pages_element.T.get(0)))
mask = page_element['id'].isin(elements_to_inspect)
page_element


Unnamed: 0,id,type,properties,metadata,box,src,refId,xObjId,xObjExt,content,level
0,1254,image,"{'order': 0, 'cr': 574.38, 'cl': 30.61}",[],"{'l': 30.61, 't': 20.21, 'w': 47.96, 'h': 28.77}",/tmp/274f007f7b0ce68b4c016414409f72/img-0010.jpg,Im4,10.0,jpg,,
1,4577,heading,{'order': 1},[],"{'l': 516.39, 't': 29.26, 'w': 53.92, 'h': 14.29}",,,,,"[{'id': 4433, 'type': 'line', 'properties': {'...",1.0
2,4559,paragraph,{'order': 2},[],"{'l': 342, 't': 73.24, 'w': 217.16, 'h': 52.31}",,,,,"[{'id': 4434, 'type': 'line', 'properties': {'...",
3,4560,paragraph,{'order': 6},[],"{'l': 62.7, 't': 150.99, 'w': 90.82, 'h': 36.03}",,,,,"[{'id': 4444, 'type': 'line', 'properties': {'...",
4,4575,heading,{'order': 9},[],"{'l': 62.7, 't': 191.5, 'w': 11.46, 'h': 9.01}",,,,,"[{'id': 4460, 'type': 'line', 'properties': {'...",1.0
5,4561,paragraph,{'order': 10},[],"{'l': 344.3, 't': 145.76, 'w': 215.92, 'h': 45...",,,,,"[{'id': 4442, 'type': 'line', 'properties': {'...",
6,4574,paragraph,{'order': 17},[],"{'l': 34.41, 't': 230.55, 'w': 337.74, 'h': 7.51}",,,,,"[{'id': 4462, 'type': 'line', 'properties': {'...",
7,4562,paragraph,{'order': 18},[],"{'l': 35.09, 't': 261.79, 'w': 110.71, 'h': 49...",,,,,"[{'id': 4466, 'type': 'line', 'properties': {'...",
8,4563,paragraph,{'order': 22},[],"{'l': 210.61, 't': 261.79, 'w': 98.42, 'h': 49...",,,,,"[{'id': 4467, 'type': 'line', 'properties': {'...",
9,4564,paragraph,{'order': 26},[],"{'l': 35.09, 't': 261.79, 'w': 539.29, 'h': 90...",,,,,"[{'id': 4468, 'type': 'line', 'properties': {'...",


In [14]:
page_element.sort_values('type')

Unnamed: 0,id,type,properties,metadata,box,src,refId,xObjId,xObjExt,content,level
1,4577,heading,{'order': 1},[],"{'l': 516.39, 't': 29.26, 'w': 53.92, 'h': 14.29}",,,,,"[{'id': 4433, 'type': 'line', 'properties': {'...",1.0
4,4575,heading,{'order': 9},[],"{'l': 62.7, 't': 191.5, 'w': 11.46, 'h': 9.01}",,,,,"[{'id': 4460, 'type': 'line', 'properties': {'...",1.0
14,4576,heading,{'order': 48},[],"{'l': 298.4, 't': 566.95, 'w': 266.45, 'h': 11...",,,,,"[{'id': 4523, 'type': 'line', 'properties': {'...",3.0
0,1254,image,"{'order': 0, 'cr': 574.38, 'cl': 30.61}",[],"{'l': 30.61, 't': 20.21, 'w': 47.96, 'h': 28.77}",/tmp/274f007f7b0ce68b4c016414409f72/img-0010.jpg,Im4,10.0,jpg,,
17,4569,paragraph,{'order': 53},[],"{'l': 470.21, 't': 616.86, 'w': 90.05, 'h': 7.51}",,,,,"[{'id': 4531, 'type': 'line', 'properties': {'...",
16,4568,paragraph,{'order': 52},[],"{'l': 301.41, 't': 638.65, 'w': 18.99, 'h': 7.51}",,,,,"[{'id': 4536, 'type': 'line', 'properties': {'...",
15,4573,paragraph,{'order': 50},[],"{'l': 373.21, 't': 595.85, 'w': 184.24, 'h': 2...",,,,,"[{'id': 4526, 'type': 'line', 'properties': {'...",
13,4567,paragraph,{'order': 46},[],"{'l': 369.41, 't': 506.64, 'w': 190.85, 'h': 8...",,,,,"[{'id': 4513, 'type': 'line', 'properties': {'...",
12,4566,paragraph,{'order': 39},[],"{'l': 38.1, 't': 475.15, 'w': 519.31, 'h': 79.51}",,,,,"[{'id': 4505, 'type': 'line', 'properties': {'...",
11,4572,paragraph,{'order': 38},[],"{'l': 38.1, 't': 451.16, 'w': 114.48, 'h': 11.31}",,,,,"[{'id': 4502, 'type': 'line', 'properties': {'...",


In [15]:
heading_element = page_element[page_element['type'] == 'heading']

In [16]:
# page_element =  page_element[page_element['type'] != 'image']
# page_element.dropna( inplace=True)
page_element = page_element[page_element['content'].notna()]

In [17]:
page_element

Unnamed: 0,id,type,properties,metadata,box,src,refId,xObjId,xObjExt,content,level
1,4577,heading,{'order': 1},[],"{'l': 516.39, 't': 29.26, 'w': 53.92, 'h': 14.29}",,,,,"[{'id': 4433, 'type': 'line', 'properties': {'...",1.0
2,4559,paragraph,{'order': 2},[],"{'l': 342, 't': 73.24, 'w': 217.16, 'h': 52.31}",,,,,"[{'id': 4434, 'type': 'line', 'properties': {'...",
3,4560,paragraph,{'order': 6},[],"{'l': 62.7, 't': 150.99, 'w': 90.82, 'h': 36.03}",,,,,"[{'id': 4444, 'type': 'line', 'properties': {'...",
4,4575,heading,{'order': 9},[],"{'l': 62.7, 't': 191.5, 'w': 11.46, 'h': 9.01}",,,,,"[{'id': 4460, 'type': 'line', 'properties': {'...",1.0
5,4561,paragraph,{'order': 10},[],"{'l': 344.3, 't': 145.76, 'w': 215.92, 'h': 45...",,,,,"[{'id': 4442, 'type': 'line', 'properties': {'...",
6,4574,paragraph,{'order': 17},[],"{'l': 34.41, 't': 230.55, 'w': 337.74, 'h': 7.51}",,,,,"[{'id': 4462, 'type': 'line', 'properties': {'...",
7,4562,paragraph,{'order': 18},[],"{'l': 35.09, 't': 261.79, 'w': 110.71, 'h': 49...",,,,,"[{'id': 4466, 'type': 'line', 'properties': {'...",
8,4563,paragraph,{'order': 22},[],"{'l': 210.61, 't': 261.79, 'w': 98.42, 'h': 49...",,,,,"[{'id': 4467, 'type': 'line', 'properties': {'...",
9,4564,paragraph,{'order': 26},[],"{'l': 35.09, 't': 261.79, 'w': 539.29, 'h': 90...",,,,,"[{'id': 4468, 'type': 'line', 'properties': {'...",
10,4565,paragraph,{'order': 33},[],"{'l': 34.41, 't': 379.71, 'w': 206.28, 'h': 38...",,,,,"[{'id': 4493, 'type': 'line', 'properties': {'...",


In [18]:
heading_element = pd.DataFrame(flatten_list_of_arrays(heading_element['content']))
heading_element


Unnamed: 0,id,type,properties,metadata,box,content
0,4433,line,"{'order': 1, 'cr': 574.38, 'cl': 30.61}",[],"{'l': 516.39, 't': 29.26, 'w': 53.92, 'h': 14.29}","[{'id': 8, 'type': 'word', 'properties': {'ord..."
1,4460,line,"{'order': 9, 'cr': 153.52, 'cl': 62.7}",[],"{'l': 62.7, 't': 191.5, 'w': 11.46, 'h': 9.01}","[{'id': 294, 'type': 'word', 'properties': {'o..."
2,4523,line,"{'order': 48, 'cr': 574.38, 'cl': 30.61}",[],"{'l': 298.4, 't': 566.95, 'w': 74.91, 'h': 11.31}","[{'id': 1172, 'type': 'word', 'properties': {'..."
3,4524,line,"{'order': 49, 'cr': 574.38, 'cl': 30.61}",[],"{'l': 528.89, 't': 566.95, 'w': 35.97, 'h': 11...","[{'id': 1179, 'type': 'word', 'properties': {'..."


In [19]:
page_element[page_element['id'].isin(list(elements_to_inspect))]


Unnamed: 0,id,type,properties,metadata,box,src,refId,xObjId,xObjExt,content,level


In [20]:
for el in heading_element['content']:
    print(el)

[{'id': 8, 'type': 'word', 'properties': {'order': 0}, 'metadata': [1], 'box': {'l': 516.39, 't': 29.26, 'w': 53.92, 'h': 14.29}, 'content': 'Facture', 'font': 1, 'fontSize': 14.287}]
[{'id': 294, 'type': 'word', 'properties': {'order': 45}, 'metadata': [], 'box': {'l': 62.7, 't': 191.5, 'w': 11.46, 'h': 9.01}, 'content': 'FR', 'font': 4, 'fontSize': 9.014}]
[{'id': 1172, 'type': 'word', 'properties': {'order': 185}, 'metadata': [23], 'box': {'l': 298.4, 't': 566.95, 'w': 42.95, 'h': 11.31}, 'content': 'Facture', 'font': 2, 'fontSize': 11.31}, {'id': 1173, 'type': 'word', 'properties': {'order': 186}, 'metadata': [], 'box': {'l': 344.99, 't': 566.95, 'w': 28.32, 'h': 11.31}, 'content': 'Total', 'font': 2, 'fontSize': 11.31}]
[{'id': 1179, 'type': 'word', 'properties': {'order': 187}, 'metadata': [24], 'box': {'l': 528.89, 't': 566.95, 'w': 25.18, 'h': 11.31}, 'content': '5,99', 'font': 2, 'fontSize': 11.31}, {'id': 1180, 'type': 'word', 'properties': {'order': 188}, 'metadata': [24], '

In [21]:
def get_line(array):
    return " ".join([el['content'] for el in array]).strip()

In [22]:
heading_string = heading_element['content'].apply(lambda x: get_line(x))
heading_string = heading_string.rename('string format')
heading_string

0          Facture
1               FR
2    Facture Total
3           5,99 €
Name: string format, dtype: object

In [23]:
heading_element.join(heading_string)

Unnamed: 0,id,type,properties,metadata,box,content,string format
0,4433,line,"{'order': 1, 'cr': 574.38, 'cl': 30.61}",[],"{'l': 516.39, 't': 29.26, 'w': 53.92, 'h': 14.29}","[{'id': 8, 'type': 'word', 'properties': {'ord...",Facture
1,4460,line,"{'order': 9, 'cr': 153.52, 'cl': 62.7}",[],"{'l': 62.7, 't': 191.5, 'w': 11.46, 'h': 9.01}","[{'id': 294, 'type': 'word', 'properties': {'o...",FR
2,4523,line,"{'order': 48, 'cr': 574.38, 'cl': 30.61}",[],"{'l': 298.4, 't': 566.95, 'w': 74.91, 'h': 11.31}","[{'id': 1172, 'type': 'word', 'properties': {'...",Facture Total
3,4524,line,"{'order': 49, 'cr': 574.38, 'cl': 30.61}",[],"{'l': 528.89, 't': 566.95, 'w': 35.97, 'h': 11...","[{'id': 1179, 'type': 'word', 'properties': {'...","5,99 €"
