# Parsr: Jupyter Notebook Demo

This notebook demonstrates how to process a document (PDF or image) through the Parsr pipeline's API and generate its various outputs.

## Module Import

In [None]:
import json
from io import StringIO

import numpy as np
import pandas as pd

from output_renderer import RenderMarkdown, RenderJSON, RenderHTML
from parsr_client import ParsrClient as client

In [None]:
# Value of the metadata `type` column used to select the regex entries.
REGEX_TYPE = "regex"

## Initialize the client object

In [None]:
# Create the client used by every cell below.
# NOTE(review): assumes a Parsr API server is reachable at localhost:3001 — adjust if yours differs.
parsr = client('localhost:3001')

## Send document for processing

In [None]:
# Submit the sample invoice together with the pipeline configuration file.
# NOTE(review): the meaning of the keyword arguments is inferred from their
# names — confirm against the parsr_client documentation.
job = parsr.send_document(
    file='./Invoice_modified.pdf',
    config='./defaultConfig.json',
    document_name='Sample File2',
    wait_till_finished=True,
    save_request_id=True,
)

# Render the value returned by send_document.
RenderJSON(job)

## Get the full JSON output

In [None]:
# Careful - the full JSON output can be long!
# Comment out the call below to skip rendering it.

RenderJSON(
   parsr.get_json()
)

In [None]:
def flatten_list_of_arrays(list_of_arrays):
    """Return one flat list holding the items of every inner iterable.

    Only a single level of nesting is removed: items that are themselves
    containers are copied over unchanged.
    """
    flattened = []
    for inner in list_of_arrays:
        flattened.extend(inner)
    return flattened

In [None]:

# Fetch the pipeline's JSON output; the cells below read from `json_object`.
json_object = parsr.get_json()
# read_json expects a path or a file-like object: passing a raw JSON string
# is deprecated since pandas 2.1, so the serialized metadata is wrapped in
# a StringIO buffer.
metadata = pd.read_json(StringIO(json.dumps(json_object['metadata'])))

In [None]:
metadata

In [None]:
# Keep only the regex entries. The index is reset so that the rows line up
# with `regexp_metadata`, which is built below with a fresh 0..n-1 index;
# without this the index-based join would pair up the wrong rows.
metadata = metadata[metadata['type'] == REGEX_TYPE].reset_index(drop=True)

# Expand each entry's `data` dict into its own columns and discard the
# pattern details that are not needed for inspection.
regexp_metadata = pd.DataFrame(list(metadata['data'])).drop(columns=['regex', 'groups'])

# `drop` and `join` return new frames, so the result must be assigned back
# for the expanded columns to actually appear in `metadata`.
metadata = metadata.drop(columns=['data']).join(regexp_metadata)
# To focus on a single pattern, filter further, e.g. (assuming the regex
# data carries a `name` field): metadata[metadata['name'] == 'Bill amount']
metadata

In [None]:
# Gather the `elements` values of every remaining metadata row into a single
# flat NumPy array (they are matched against element ids further down).
referenced_ids = flatten_list_of_arrays(metadata['elements'])
elements_to_inspect = np.array(referenced_ids)
elements_to_inspect

In [None]:
elements_to_inspect

In [None]:
# read_json expects a path or a file-like object: passing a raw JSON string
# is deprecated since pandas 2.1, so the serialized pages are wrapped in a
# StringIO buffer.
pages = pd.read_json(StringIO(json.dumps(json_object['pages'])))
# One row per page, one column per element position on that page.
pages_element = pd.DataFrame(list(pages['elements']))

In [None]:
# Row 0 of `pages_element` (reached through the transpose) holds the elements
# of the first page; build a table with one row per element.
first_page_elements = pages_element.T.get(0)
page_element = pd.DataFrame(list(first_page_elements))
# Flags the rows whose id is listed in `elements_to_inspect`.
mask = page_element['id'].isin(elements_to_inspect)
page_element


In [None]:
# Show the elements ordered by their type.
page_element.sort_values(by='type')

In [None]:
# Select the rows whose type is 'heading'.
is_heading = page_element['type'] == 'heading'
heading_element = page_element[is_heading]

In [None]:
# Keep only the elements whose `content` is not missing.
has_content = page_element['content'].notna()
page_element = page_element[has_content]

In [None]:
page_element

In [None]:
# Flatten the per-heading content lists and build one row per contained item.
heading_parts = flatten_list_of_arrays(heading_element['content'])
heading_element = pd.DataFrame(heading_parts)
heading_element


In [None]:
# Show the elements whose id is listed in `elements_to_inspect`.
wanted_ids = list(elements_to_inspect)
page_element[page_element['id'].isin(wanted_ids)]


In [None]:
# Print the `content` value of each heading row, one per line.
heading_contents = heading_element['content']
for el in heading_contents:
    print(el)

In [None]:
def get_line(array):
    """Build a single string from the 'content' value of each item in *array*.

    The values are separated by one space and surrounding whitespace is
    stripped from the result.
    """
    pieces = (item['content'] for item in array)
    joined = " ".join(pieces)
    return joined.strip()

In [None]:
# Turn every heading row's content into plain text, in a Series named
# 'string format'.
heading_string = heading_element['content'].apply(get_line).rename('string format')
heading_string

In [None]:
# Show the heading rows next to their plain-text form.
heading_element.join(other=heading_string)