## Routine to extract structured STC fields/paragraphs

In [28]:
import re

def get_block(text, pattern_start, pattern_end):
    """Return the part of ``text`` enclosed by two regex delimiters.

    The block begins right after the first match of ``pattern_start`` and
    stops right before the first match of ``pattern_end`` that occurs after
    that point. Neither delimiter is included in the result.

    Parameters
    ----------
    text : str
        Text to search. Any non-string value (e.g. ``None``) yields ``''``.
    pattern_start : str
        Regular expression marking the start of the block.
    pattern_end : str
        Regular expression marking the end of the block; it is only searched
        in the text that follows the start delimiter.

    Returns
    -------
    str
        The enclosed text, or ``''`` when either delimiter is not found.

    Raises
    ------
    re.error
        If one of the patterns is not a valid regular expression. This is no
        longer swallowed, so that a broken pattern does not silently produce
        empty results for every document.
    """
    if not isinstance(text, str):
        return ''

    match_start = re.search(pattern_start, text) # re.IGNORECASE
    if match_start is None:
        return ''

    block_begin = match_start.end()
    # The end delimiter is looked up in the remainder only, so its offsets are
    # relative to ``block_begin``.
    match_end = re.search(pattern_end, text[block_begin:]) # re.IGNORECASE
    if match_end is None:
        return ''

    return text[block_begin:block_begin + match_end.start()]

In [29]:
def get_all_descriptions(pdf_pages):
    """Extract the "Description of Type Design Change" block of every page.

    Parameters
    ----------
    pdf_pages : list of str
        Text of each page, in page order. Must contain at least one page.

    Returns
    -------
    list of str
        One entry per page, as returned by ``get_block``. The first page is
        searched for the main heading; the following pages are searched for
        the continuation heading ("... (Continued)" and similar).

    Raises
    ------
    IndexError
        If ``pdf_pages`` is empty.
    """
    descriptions = []
    # All patterns are raw strings so that regex escapes such as ``\(`` or
    # ``\.`` are not parsed as (invalid) Python string escapes.
    # Separator expected around a heading: line break, "(" or a space.
    new_line_pattern = r'(\n|\n\r|\r|\(| )'

    # First page
    first_page_start = (new_line_pattern + r'Description of (the )?Type'
                        + new_line_pattern + r'Design Change:?\.?' + new_line_pattern)
    first_page_end = r'(Make\:|Date of |Limitations |[a-z ]{5,10}ions and Cond|\(Description |\(See continuation)'
    descriptions.append(get_block(pdf_pages[0], first_page_start, first_page_end))

    # Other pages
    final_del = r'Any alteration |United States of America|…|- - -|\. \. \. |\* \* \* |END|---|\.\.\.|\*\*\*|Certification Basis|\(See continuation|\(cont'
    other_pages_start = (new_line_pattern
                         + r'Description of (the )?Type Design Change:?\.? (\([Cc]on[a-z ]+\)|[Cc]on[a-z ]+):?'
                         + new_line_pattern)
    # A continuation block ends at the continued limitations heading or at one
    # of the closing markers listed in ``final_del``.
    other_pages_end = (new_line_pattern
                       + r'(Limitation(s)? and Condition(s)?:?\.? (\(?[Cc]on[a-z \.]+\)?):?'
                       + r'|' + final_del + r')')
    for n in range(1, len(pdf_pages)):
        descriptions.append(get_block(pdf_pages[n], other_pages_start, other_pages_end))

    return descriptions

In [30]:
def get_all_limitations(pdf_pages):
    """Extract the "Limitations and Conditions" block of every page.

    Parameters
    ----------
    pdf_pages : list of str
        Text of each page, in page order. Must contain at least one page.

    Returns
    -------
    list of str
        One entry per page, as returned by ``get_block``. The first page is
        searched for the main heading; the following pages are searched for
        the continuation heading ("... (Continued)" and similar).

    Raises
    ------
    IndexError
        If ``pdf_pages`` is empty.
    """
    limitations = []
    # All patterns are raw strings so that regex escapes such as ``\(`` or
    # ``\.`` are not parsed as (invalid) Python string escapes.
    # Separator expected around a heading: line break, "(" or a space.
    new_line_pattern = r'(\n|\n\r|\r|\(| )'

    # First page
    # The second alternative accepts a heading whose first word is not exactly
    # "Limitations" (presumably to tolerate extraction errors - TODO confirm).
    first_page_start = (new_line_pattern
                        + r'(Limitations and Conditions|[a-z ]{5,10}ions and Cond[a-z ]{5,10}):?\.?'
                        + new_line_pattern)
    first_page_end = r'(\(Limitations |' + new_line_pattern + r'This certificat|\(See )'
    limitations.append(get_block(pdf_pages[0], first_page_start, first_page_end))

    # Other pages
    final_del = r'Any alteration |United States of America|…|- - -|\. \. \. |\* \* \* |---|\.\.\.|\*\*\*|Certification Basis|\(See continuation|\(cont'
    other_pages_start = (new_line_pattern
                         + r'Limitation(s)? and Condition(s)?:?\.? (\([Cc]on[a-z \.]+\)|[Cc]on[a-z \.]+|Con[a-z \.]+):?'
                         + new_line_pattern)
    other_pages_end = new_line_pattern + r'(' + final_del + r')'
    for n in range(1, len(pdf_pages)):
        limitations.append(get_block(pdf_pages[n], other_pages_start, other_pages_end))

    return limitations

## Decoding examples

In [20]:
# Base file name (without extension) of the example document; the cells below
# append '.txt' or '.pdf' to it to locate the extracted text and the source PDF.
example_path = 'SA2612SW-D__Current__044399BBA0D6080285256CC20008C674'

### On Google Vision-extracted text

In [31]:
import os

# Load the pre-extracted text of the example document and split it into pages
# (pages are separated by a blank line in the .txt file).
example_txt_path = os.path.join(os.getcwd(),'database','data','stc','text-from-pdf',example_path+'.txt')
with open(example_txt_path, "r", encoding="utf8") as my_text:
    pdf_pages = my_text.read().split("\n\n")

# Show the extracted description and limitation blocks, one list entry per page.
print(get_all_descriptions(pdf_pages))
print("=======================")
print(get_all_limitations(pdf_pages))
# To inspect the raw page texts as well:
# print("=======================")
# print(pdf_pages)

["1.\nInstall dual wiring provisions in accordance with ARINC 580/599, ECO R2847\nRev. A dated 11/21/77, and ECO R3111 Rev. A dated 6/18/80.\nInstall a single Litton LTN-201 ONS or LTN-211 ONS approved under TSO C94\nin accordance with ECO's R2847 Rev. B thru G dated 3/28/78, 7/28/78,\n9/18/78, 2/16/79, 3/20/79, and 9/26/79 respectively, and in accordance\nwith the general guidelines as outlined in AC120-37. (Continued)\nFEDERAL\n", '3. Install a single or dual Litton LTN-211 ONS/VLF system approved under TSO C94\nin accordance with ECO R2847 Rev. K, N and P dated 8/31/80, 12/19/80, and\n2/16/81, respectively, or ECO R4047 dated 11/25/83 in accordance with the\ngeneral guidelines as outlined in AC 120-37.\nSA2612SW-D\n4. Relocate No. 1 and No. 2 LTN-211 ONS/VLF system antenna in accordance with ECO\nR3230 dated 8/7/81 following the guidelines as outlined in AC 120-37.\n6.\n5. Replace LTN-211 ONS/VLF system containing computer program No. 211-32-02 with\nsystem containing computer progr

### On pdfplumber-extracted text

In [7]:
import pdfplumber

# Extract the text of every page of the example PDF with pdfplumber and run the
# same description/limitation extraction on it.
with pdfplumber.open(os.path.join(os.getcwd(),'database','data','stc','pdf',example_path+'.pdf')) as pdf:
    # str() keeps the concatenation valid whatever extract_text() returns
    # (presumably it can return None for a page without text - TODO confirm).
    pdf_pages = [str(this_page.extract_text())+'\n' for this_page in pdf.pages]

    print(get_all_descriptions(pdf_pages))
    print("=======================")
    print(get_all_limitations(pdf_pages))
    print("=======================")
    print(pdf_pages)


['Install a Continental W670-23 engine in the above model\nairplane in accordance with Olde Thyme Aviation Pictures 1 through 36, and Olde Thyme Aviation\nInstallation Report OTA-3-1-06 dated March 20, 2006.\n']
['Data pertaining to this modification are considered inadequate for\nduplication in other aircraft. This approval is therefore limited to the installation in Waco UKC airplane\nserial number 3978, registration N14611 only. A copy of this certificate, and the Airplane Flight Manual\nSupplement dated March 31, 2006 shall be maintained as part of the permanent records for the\nmodified aircraft.']
['UnitedStatesofAmerica\nDepartmentofTransportation_FederalAviationAdministration\nSupplementalTypeCertificate\nNumber\nSA01661SE\nThis certificate, issued to Olde Thyme Aviation\n21704 141st Ave. SW\nVashon Island, WA 98070\ncertifies that the change in the type design for the following product with the limitations and conditions\ntherefor as specified hereon meets the airworthiness re

## Apply routines for fields/paragraphs on all text

In [7]:
def list2text_frompdf(mylist):
    """Merge the non-trivial entries of a list of strings into a single text.

    Entries with a length of 2 or less are discarded; the remaining ones are
    concatenated in their original order, separated by ``' [SEP] '``.

    Parameters
    ----------
    mylist : list of str
        Text fragments to merge.

    Returns
    -------
    str
        The merged text (``''`` when no entry is kept).
    """
    kept_entries = []
    for entry in mylist:
        if len(entry) > 2:
            kept_entries.append(entry)

    return ' [SEP] '.join(kept_entries)

In [27]:
# Preview a few extracted records.
# NOTE(review): in the visible notebook `decoded_pdf` is only assigned by the
# extraction loop in a later cell, so this cell needs that loop to have run first.
decoded_pdf[2:5]

[['0ECF3ED93D565BA98625836A004BAA0C',
  'Aero Fabricators, Inc.\n1216 North Road\nLyons, WI 53148\nInstall Aero Fabricators shoulder harness and seat belt assembly per Aero\nFabricators Installation Instructions AF-44, Revision A, dated January 12, 1993,\nor later FAA approved revisions.\nDate of application: July 2, 1992\nDate of issuance:\nFAA FORM 8110-2 (10-68)\n',
  'This approval should not be extended to other aircraft of this model on\nwhich other previously approved modifications are incorporated, unless it\nis determined by the installer that the interrelationship between this\nchange and any of those other previously approved modifications will\nintroduce no adverse effect upon the airworthiness of that aircraft.\nMake Piper Aircraft Corp.\nModel:',
  'O\nUnited States of America\nDepartment of Transportation-Federal Aviation Administration\nSupplemental Type Certificate\nNumber SA00001CH\nThis certificate, issued to\ncertifies that the change in the type design for the foll

In [41]:
import os
import glob
import pandas as pd
import logging
import pdfplumber
import PyMuPDF

# Failures are recorded in a log file so that one unreadable document does not
# stop the whole extraction.
logging.basicConfig(filename='17_stc_text_extractor.log', encoding='utf-8',format='%(asctime)s %(message)s', level=logging.INFO)

# Run the field extraction once per text source and store each result in its
# own parquet file.
for ocr_option in ['googlevision', 'pdfplumber']:
    text_source = 'text-from-pdf-'+ocr_option

    # pdfplumber reads the PDFs directly; the Google Vision text is read from
    # previously saved .txt files.
    if ocr_option=='pdfplumber':
        available_txts = glob.glob(os.path.join(os.getcwd(),'database','data','stc','pdf','*.pdf'))
    elif ocr_option=='googlevision':
        available_txts = glob.glob(os.path.join(os.getcwd(),'database','data','stc','text-from-pdf','*.txt'))

    # One record per document: [documentGuid, descriptions, limitations, rawtext].
    # Start from an empty list: seeding it with an empty record would add a
    # blank, all-null first row to the DataFrame below.
    decoded_pdf = []

    for path_file in available_txts:
        # The GUID is the last '__'-separated part of the file name, without
        # its 4-character extension ('.pdf' / '.txt').
        document_guid = path_file.split('__')[-1][:-4]

        try:
            if ocr_option=='pdfplumber':
                with pdfplumber.open(path_file) as opened_pdf:
                    raw_text_pagesplit = [str(this_page.extract_text())+'\n' for this_page in opened_pdf.pages]
            elif ocr_option=='googlevision':
                with open(path_file, 'r', encoding="utf8") as f:
                    raw_text = f.read()
                # Pages are separated by a blank line in the saved text.
                raw_text_pagesplit = raw_text.split("\n\n")

            decoded_pdf.append([document_guid,
                                list2text_frompdf(get_all_descriptions(raw_text_pagesplit)),
                                list2text_frompdf(get_all_limitations(raw_text_pagesplit)),
                                "\n\n".join(raw_text_pagesplit)
                                ])

        except Exception as e:
            # Keep a placeholder record so that every input file has a row.
            logging.warning('Failed for %s with Exception: %s', path_file, e)
            decoded_pdf.append([document_guid,
                                '',
                                '',
                                ''])

    df_pdf_content = pd.DataFrame(decoded_pdf,
                                columns=['documentGuid',
                                        'descriptions',
                                        'limitations',
                                        'rawtext'])
    df_pdf_content = df_pdf_content.astype("string")

    df_pdf_content.to_parquet(os.path.join(os.getcwd(),'database','decoded_pdf_using_'+text_source+'.parquet'))


  if _pandas_api.is_sparse(col):
