In [2]:
import pandas as pd
import ipywidgets as widgets
import camelot
import re
import warnings
import voila
warnings.filterwarnings("ignore")

# Pulling Indigenous Information from CER Final Reports

Description of app: This app allows users to load a PDF they would like to extract tables from. The app has been specifically designed to focus on tables containing indigenous information in the CER final reports. It can still work for other types of tables, but it isn't guarenteed it will work properly.

## Caveats

The app was created with the consideration that all CER final reports are the same. If the structure changes, it may not extract the tables as well.

## Uploading a PDF of the CER Final Report

Please upload the PDF you are currently interested in. Afterwards, you will be asked to fill in some information and then the table you are looking at. You can only extract one table at a time. Give the page numbers of the table and it will be extracted.

In [3]:
btn_upload = widgets.FileUpload(accept='.pdf', multiple=True)
display(btn_upload)

FileUpload(value={}, accept='.pdf', description='Upload', multiple=True)

In [17]:
def extract_table(btn_upload):
    [uploaded_file] = btn_upload.value
    file_path = uploaded_file

    for name, file_info in btn_upload.value.items():
        with open (name, 'wb') as file:
            file.write(file_info['content'])
            
    name_pattern = '([^//]+)$'
    name_of_document = re.findall(name_pattern, file_path)
    
    pages_iter = [int(item) for item in input("Enter the list of pages separated by space (e.g. 129 130 131): ").split(',')]
    pages = str(str(pages_iter).strip('[]').replace("'",""))
#     print(pages_iter)
#     print(pages_iter[-1])
    
    name_of_section = input("Enter section name: ")
    name_of_appendix = input("Enter appendix name: ")

    tables = camelot.read_pdf(uploaded_file, copy_text=['v'], strip_text = '\n', line_scale=40, f = 'excel', flavour = 'lattice', pages=str(pages))
#     print(tables[0].df)
#     print(tables[1].df)
    
    for i, page in enumerate(pages_iter):
#         print(page)
        tables[i].df.rename(columns=tables[i].df.iloc[0], inplace = True)
        tables[i].df.drop([0], inplace = True)
        tables[i].df['page_number'] = page
        tables[i].df['name_of_document'] = name_of_document[0][9:-18]
        # NEB_-_Report_-_Enbridge_-_Line_3_Replacement_Detailed_Assessment
        tables[i].df['date_of_document'] = file_path[-8:-4]
        tables[i].df['name_of_section'] = name_of_section
        tables[i].df['name_of_appendix'] = name_of_appendix
        
    df = tables[0].df.append([tables[x].df for x in range(1, len(tables))]).reset_index(drop=True)
    
    multi_point_col_name = input("What is the exact column title for the column with multiple information points?")
    
    s = df[multi_point_col_name].str.split('  ').apply(pd.Series, 1).stack()
    s.index = s.index.droplevel(-1)
    s.name = multi_point_col_name
    
    del df[multi_point_col_name]
    df = df.join(s)
    
    df[multi_point_col_name] = df[multi_point_col_name].str.replace('', '')
    df[multi_point_col_name] = df[multi_point_col_name].str.replace('  ', '')
    df.drop(df.columns[[0]], axis=1, inplace=True)
#     print(df)
    df.to_excel(name_of_document[0][9:-18] + '_' + str(pages_iter[0]) + '_' + str(pages_iter[-1]) + '.xlsx', index=False)


In [18]:
extract_button = widgets.Button(description="Extract Table")
extract_button.on_click(extract_table(btn_upload))

Enter the list of pages separated by space (e.g. 129 130 131): 131,132,133,134
[131, 132, 133, 134]
134
Enter section name: 
Enter appendix name: 
131
132
133
134
What is the exact column title for the column with multiple information points?Environmental Issue(s) Raised
                               Participant date_of_document date_of_section  \
0                              Intervenors             2015                   
1   Asini Wachi Nehiyawak Traditional Band             2015                   
1   Asini Wachi Nehiyawak Traditional Band             2015                   
1   Asini Wachi Nehiyawak Traditional Band             2015                   
1   Asini Wachi Nehiyawak Traditional Band             2015                   
..                                     ...              ...             ...   
27                  Onion Lake Cree Nation             2015                   
28                                     NaN              NaN             NaN   
29               