In [62]:
import pandas as pd
import ipywidgets as widgets
import camelot
import re
import warnings
import voila
warnings.filterwarnings("ignore")

# Pulling Indigenous Information from CER Final Reports

Description of app: This app allows users to load a PDF they would like to extract tables from. The app has been specifically designed to focus on tables containing indigenous information in the CER final reports. It can still work for other types of tables, but it isn't guarenteed it will work properly.

## Caveats

The app was created with the consideration that all CER final reports are the same. If the structure changes, it may not extract the tables as well.

## Uploading a PDF of the CER Final Report

Please upload the PDF you are currently interested in. Afterwards, you will be asked to fill in some information and then the table you are looking at. You can only extract one table at a time. Give the page numbers of the table and it will be extracted.

In [63]:
btn_upload = widgets.FileUpload(accept='.pdf', multiple=True)
display(btn_upload)

FileUpload(value={}, accept='.pdf', description='Upload', multiple=True)

In [64]:
output_extract_table = widgets.Output()

In [65]:
def input_pages():
    global pages_str
    global pages_iter
    pages_widget = widgets.Text(
        value='ex: 131, 132, 133, 134',
        placeholder='Type pages to extract here!',
        description='Pages: ',
        disabled=False
    )
    display(pages_widget)

    def pages_callback(wdgt):
        global pages_str
        global pages_iter
        
        pages_str = str(wdgt.value)
        display(pages_str)
        pages_iter = [int(item) for item in pages_str.split(',')]

    pages_widget.on_submit(pages_callback)
    
def input_section():
    global section_name
    section_widget = widgets.Text(
        value='ex: 7.3 Environmental Issues Raised by Participants',
        placeholder='Type section name here!',
        description='Section: ',
        disabled=False
    )
    display(section_widget)

    def section_callback(wdgt):
        global section_name
        section_name = str(wdgt.value)
        display(section_name)

    section_widget.on_submit(section_callback)
    
def input_appendix():
    global appendix_name
    appendix_widget = widgets.Text(
        value='ex: Appendix II - Aboriginal Groups that Participated in the OH-02-2015 Proceeding and the Form of their Participation',
        placeholder='Type appendix name here!',
        description='Appendix: ',
        disabled=False
    )
    display(appendix_widget)

    def appendix_callback(wdgt):
        global appendix_name
        appendix_name = str(wdgt.value)
        display(appendix_name)

    appendix_widget.on_submit(appendix_callback)

In [72]:
def extract_table(b):
    global df
    global pages_iter
    global name_of_document
    with output_extract_table:
        print('start extracting')        

    [uploaded_file] = btn_upload.value
    file_path = uploaded_file
    
    with output_extract_table:
        print('file path:', file_path)

#     for name, file_info in btn_upload.value.items():
#         with open (name, 'wb') as file:
#             file.write(file_info['content'])

    name_pattern = '([^//]+)$'
    name_of_document = re.findall(name_pattern, file_path)

    with output_extract_table:
        print('pages:', pages_str)
    
    ## Pulling tables with Camelot

    tables = camelot.read_pdf(uploaded_file, copy_text=['v'], strip_text = '\n', line_scale=40, f = 'excel', flavour = 'lattice', pages=str(pages_str))
#     print(tables[0].df)
#     print(tables[1].df)

    with output_extract_table:
        print('tables:', len(tables))

    for i, page in enumerate(pages_iter):
#         print(page)
        tables[i].df.rename(columns=tables[i].df.iloc[0], inplace = True)
        tables[i].df.drop([0], inplace = True)
        tables[i].df['page_number'] = page
        tables[i].df['name_of_document'] = name_of_document[0][9:-18]
        # NEB_-_Report_-_Enbridge_-_Line_3_Replacement_Detailed_Assessment
        tables[i].df['date_of_document'] = file_path[-8:-4]
        tables[i].df['name_of_section'] = section_name
        tables[i].df['name_of_appendix'] = appendix_name

    with output_extract_table:
        print('Finished loop')
    
    df = tables[0].df.append([tables[x].df for x in range(1, len(tables))]).reset_index(drop=True)
    
    with output_extract_table:
        print('df.shape:', df.shape)

In [73]:
def input_multi_point_col():
    global multi_point_col_name
    multi_point_col_widget = widgets.Text(
        value='ex: Environmental Issue(s) Raised',
        placeholder='Type column name here!',
        #What is the exact column title for the column with multiple information points?
        description='Column: ',
        disabled=False
    )
    display(multi_point_col_widget)

    def multi_point_col_callback(wdgt):
        global multi_point_col_name
        multi_point_col_name = str(wdgt.value)
        display(multi_point_col_name)

    multi_point_col_widget.on_submit(multi_point_col_callback)

In [74]:
def seperate_bullets_button():
    with output_extract_table:
        print('Separating column with multiple points')
    
    def separate_bullets_callback(wdgt):
        s = df[multi_point_col_name].str.split('  ').apply(pd.Series, 1).stack()
        s.index = s.index.droplevel(-1)
        s.name = multi_point_col_name

        del df[multi_point_col_name]
        df = df.join(s)

        df[multi_point_col_name] = df[multi_point_col_name].str.replace('', '')
        df[multi_point_col_name] = df[multi_point_col_name].str.replace('  ', '')
        df.drop(df.columns[[0]], axis=1, inplace=True)

    multi_point_col_widget.on_submit(multi_point_col_callback)

In [81]:
def save_table_to_excel(b):
    df.to_excel(name_of_document[0][9:-18] + '_' + str(pages_iter[0]) + '_' + str(pages_iter[-1]) + '.xlsx', index=False)

In [76]:
input_pages()
input_section()
input_appendix()

Text(value='ex: 131, 132, 133, 134', description='Pages: ', placeholder='Type pages to extract here!')

Text(value='ex: 7.3 Environmental Issues Raised by Participants', description='Section: ', placeholder='Type s…

Text(value='ex: Appendix II - Aboriginal Groups that Participated in the OH-02-2015 Proceeding and the Form of…

'131, 132, 133, 134'

'7.3 Environmental Issues Raised by Participants'

'Appendix II - Aboriginal Groups that Participated in the OH-02-2015 Proceeding and the Form of their Participation'

In [82]:
extract_button = widgets.Button(description="Extract Table")
display(extract_button)
display(output_extract_table)

extract_button.on_click(extract_table)

save_button = widgets.Button(description="Save Table")
display(save_button)
save_button.on_click(save_table_to_excel)

Button(description='Extract Table', style=ButtonStyle())

Output(outputs=({'output_type': 'stream', 'text': 'start extracting\nfile path: A76575-3_NEB_-_Report_-_Enbrid…

Button(description='Save Table', style=ButtonStyle())

In [None]:
# # A button example
# from IPython.display import display
# button = widgets.Button(description="Click Me!")
# output = widgets.Output()

# display(button, output)

# def on_button_clicked(b):
#     with output:
#         print("Button clicked.")

# button.on_click(on_button_clicked)