### Look-up table from FAA DRS

In [1]:
import pandas as pd
import numpy as np
import os

metadata_mapping = pd.read_excel(os.path.join(os.getcwd(),'data','DRS','index','FAA DRS','DRS Document Types Metadata Mapping.xlsx')) # from https://drs.faa.gov/help/helpdetails and https://drs.faa.gov/help/helpdetails

doc_types = pd.unique(metadata_mapping.loc[:, "Document Type Name in API request"])
doc_types = [val for val in doc_types if isinstance(val, str)]

common_fields = []
for n,doc_type in enumerate(doc_types):
    metadata_names = metadata_mapping.loc[metadata_mapping.loc[:, "Document Type Name in API request"]==doc_type, "Metadata Name in API Response "]

    if n>0:
        common_fields = list(set(metadata_names) & set(common_fields))
    else:
        common_fields = list(set(metadata_names))

### Collect indexed documents from DRS
See below to collec the index

In [2]:
# doc_types = [val for val in doc_types if val not in ['PMA','TSOI']]
#doc_types = [val for val in doc_types if val!='MMEL']

#doc_types = ['MMEL']
# doc_types = ['TSO']

In [3]:
import os
import logging
import glob
import pandas as pd
import random
from selenium import webdriver
from selenium.webdriver.firefox.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.wait import WebDriverWait
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.support import expected_conditions as EC
import time

logging.basicConfig(filename='90_gendoc_collection.log', encoding='utf-8',format='%(asctime)s %(message)s', level=logging.INFO)

tempo = 0.5
firefox_download_folder = r"C:\Users\victor\Downloads"
random.shuffle(doc_types)

options = Options()
options.set_preference("browser.download.panel.shown", False)
options.set_preference("browser.download.manager.showWhenStarting", False)

random.shuffle(doc_types)

def dl_pdf(tempo, driver, firefox_download_folder, doc_name_from_site):
    time_counter = 0
    while driver.find_element(By.CSS_SELECTOR, "#loadingBar").get_attribute("class")!="hidden" and time_counter<60:
        time.sleep(tempo)
        time_counter = time_counter + 1
    driver.find_element(By.CSS_SELECTOR, "#download").click()

    time_counter = 0      
    while (not os.path.exists(os.path.join(firefox_download_folder, doc_name_from_site))) and time_counter<60:
        time.sleep(tempo)
        time_counter = time_counter + 1
    time.sleep(tempo)


def rename_dled_doc(doc_name_from_site, firefox_download_folder, filename, dl_dir, suffix='', file_extension=".pdf"):
    list_of_files = glob.glob(os.path.join(firefox_download_folder,"*"+file_extension))
    latest_file = sorted(list_of_files, key=os.path.getctime, reverse=True)[:10]

    tempo_download = 0
    while tempo_download<10:
        if (doc_name_from_site in [os.path.basename(val) for val in latest_file]):
            if not os.path.exists(os.path.join(dl_dir, filename+suffix+file_extension)):
                os.rename(os.path.join(firefox_download_folder, doc_name_from_site), os.path.join(dl_dir, filename+suffix+file_extension))
            else:
                os.remove(os.path.join(firefox_download_folder, doc_name_from_site))
        tempo_download = tempo_download + 1


def close_pdf_window(driver, tempo):
    time.sleep(tempo)
    driver.switch_to.window(driver.window_handles[1])
    driver.close()
    driver.switch_to.window(driver.window_handles[0])
    time.sleep(tempo)


for doc_type in doc_types:
    logging.info("Start collecting "+doc_type)

    dl_dir = os.path.join(os.getcwd(), 'data', 'DRS', 'raw data', doc_type, 'pdf')
    options.set_preference("browser.download.dir", dl_dir)

    Document_Type_name_in_DRS = metadata_mapping[metadata_mapping["Document Type Name in API request"]==doc_type]["Document Type name in DRS "].unique()[0]
    Metadata_Names_in_DRS =  metadata_mapping[metadata_mapping["Document Type Name in API request"]==doc_type]["Metadata Name in DRS"]

    if len(glob.glob(os.path.join(os.getcwd(), 'data', 'DRS', 'index', doc_type+'_202*.parquet')))>0:
        logging.info("DRS PDF collector no optimization for "+doc_type)

        df_drs = pd.read_parquet(glob.glob(os.path.join(os.getcwd(), 'data', 'DRS', 'index', doc_type+'_202*.parquet'))[-1])

        driver = webdriver.Firefox(options=options)
        driver.maximize_window()

        for index, row in df_drs.sample(frac=1).iterrows():
            # DEBUG
            # row['documentURL'],row['drs:status'],row['documentGuid']='https://drs.faa.gov/browse/excelExternalWindow/F4A842936070975C862578B100680F1C.0001?modalOpened=true','Current','F4A842936070975C862578B100680F1C'
            # doc_type = 'STC'
            # Document_Type_name_in_DRS = 'Supplemental Type Certificates (STC)'
            # dl_dir = r'C:\Users\victor\Documents\DeepLearning\FAA NLP Project\data\DRS\raw data\STC\pdf'

            # 'An error occurred while loading the PDF'

            # List<WebElement> list = driver.findElements(By.xpath("//*[contains(text(),'" + text + "')]"));
            # Assert.assertTrue("Text not found!", list.size() > 0);

            # END DEBUG

            filename = row['drs:status']+"__"+row['documentGuid']

            main_doc_downloaded = (os.path.exists(os.path.join(dl_dir, row['drs:status']+"__"+row['documentGuid']+".pdf"))) or (os.path.exists(os.path.join(dl_dir, row['drs:status']+"__"+row['documentGuid']+".txt")))
            # Check if file already dowloaded. We only test for the first attachment (generaly html or doc file) since the pdf is always listed last (__1 or __2) and from the beginning I've started with pdf only.\\
            attachments_downloaded = (len(glob.glob(os.path.join(dl_dir, row['drs:status']+"__"+row['documentGuid']+"__*__0.pdf")))>0) or \
                                    (len(glob.glob(os.path.join(dl_dir, row['drs:status']+"__"+row['documentGuid']+"__*__-1.txt")))>0)

            if (not main_doc_downloaded) or (not attachments_downloaded):
                try:
                    url, driver, doc_type, Document_Type_name_in_DRS = row['documentURL']+"?modalOpened=true", driver, doc_type, Document_Type_name_in_DRS

                    driver.get(url)
                    driver.switch_to.window(driver.window_handles[0])

                    try:        
                        WebDriverWait(driver, 30).until(
                            EC.presence_of_element_located((By.CSS_SELECTOR, '[title*="'+Document_Type_name_in_DRS+'"]'))
                        )

                        is_text_only = len(driver.find_elements(By.ID, "printArea"))>0

                        if is_text_only:
                            with open(os.path.join(dl_dir, filename+'.txt'), 'w') as f:
                                f.write(driver.find_element(By.ID, "printArea").text)
                            with open(os.path.join(dl_dir, filename+'__None__-1.txt'), 'w') as f:
                                f.write('')
                        else:
                            has_documents = len(driver.find_elements(By.ID, "viewer"))>0
                            if has_documents:
                                WebDriverWait(driver, 30).until(
                                    EC.element_to_be_clickable((By.CSS_SELECTOR, "#download"))
                                )
                                # Main file when no attachment
                                if not main_doc_downloaded:
                                    doc_name_from_site = driver.find_element(By.CLASS_NAME, "doc-view").find_element(By.CLASS_NAME, "ng-star-inserted").text
                                    if doc_name_from_site[0:2]=="- ":
                                        doc_name_from_site = doc_name_from_site[2:]

                                    dl_pdf(tempo, driver, firefox_download_folder, doc_name_from_site)
                                    rename_dled_doc(doc_name_from_site, firefox_download_folder, filename, dl_dir)
                                    close_pdf_window(driver, tempo)

                                # Attachments (contains main file also)
                                if not attachments_downloaded:
                                    has_attachments = len(driver.find_elements(By.CSS_SELECTOR, "[aria-label='Attachments/Public Comments']"))>0
                                    if has_attachments:
                                        driver.find_element(By.CSS_SELECTOR, "[aria-label='Attachments/Public Comments']").click()
                                        nb_of_attachments = len(driver.find_elements(By.CLASS_NAME, "attachment-links"))

                                        for n in range(0, nb_of_attachments):
                                            driver.find_elements(By.CLASS_NAME, "attachment-links")[n].click()
                                            driver.find_element(By.CSS_SELECTOR, "[aria-label='Attachments/Public Comments']").click()

                                            doc_name_from_site = driver.find_elements(By.CLASS_NAME, "attachment-links")[n].text
                                            doc_name_from_site_extention = driver.find_elements(By.CLASS_NAME, "attachment-links")[n].text.split(".")[-1].lower()
                                            suffix = "__" + doc_name_from_site + "__" + str(n)       
                                            attachment_downloaded = os.path.exists(os.path.join(dl_dir, filename+suffix+"."+doc_name_from_site_extention))   
                                            is_nopreview_onlydownload = len(driver.find_elements(By.XPATH, "//*[contains(text(), 'Download')]"))>0

                                            if not attachment_downloaded:
                                                if doc_name_from_site_extention=='pdf':                                                    
                                                    dl_pdf(tempo, driver, firefox_download_folder, doc_name_from_site)
                                                    rename_dled_doc(doc_name_from_site, firefox_download_folder, filename, dl_dir, suffix, "."+doc_name_from_site_extention)
                                                    close_pdf_window(driver, tempo)

                                                elif doc_name_from_site_extention in ['htm', 'html']:
                                                    with open(os.path.join(dl_dir, filename+suffix+"."+doc_name_from_site_extention), 'w') as f:
                                                        f.write(driver.find_element(By.ID, "printArea").text)

                                                elif is_nopreview_onlydownload:
                                                    driver.find_elements(By.XPATH, "//*[contains(text(), 'Download')]")[0].click()
                                                                                                
                                                    rename_dled_doc(doc_name_from_site, firefox_download_folder, filename, dl_dir, suffix, "."+doc_name_from_site_extention)

                                    else: # no attachment
                                        with open(os.path.join(dl_dir, filename+'__None__-1.txt'), 'w') as f:
                                            f.write('')
                            else: # no documents
                                with open(os.path.join(dl_dir, filename+'.txt'), 'w') as f:
                                    f.write('')
                                with open(os.path.join(dl_dir, filename+'__None__-1.txt'), 'w') as f:
                                    f.write('')

                    except TimeoutException as ex:
                        pass
                    except:
                        pass   
                        
                    logging.info("Document download Pass for "+row['documentGuid']+", doc type="+doc_type+", url="+url)
                except Exception as inst:
                    logging.info("Document download Fail for "+row['documentGuid']+", doc type="+doc_type+", url="+url)
        driver.quit()
    logging.info("Stop collecting "+doc_type)
    #break

In [2]:
doc_types

array(['ADFRAWD', 'ADNPRM', 'AC', 'AB', 'POLICY', 'CAM', 'CAR', 'CANIC',
       'ADFREAD', 'ELOS', 'CFRFRSFAR', 'NORSEE', 'NPRM', 'PMA', 'SAIB',
       'SCFINAL', 'SCPROPOSED', 'SFAR', 'STC', 'TSOI', 'TSO', 'FAR',
       'TCDSMODEL', 'UNAPPROVED_PARTS_NOTIFICATIONS', 'INFO', 'SAFO',
       'ORDER_8900.1', 'CLARIFY_POLICY', 'AFS-1_MEMORANDUMS', 'AT_JTA',
       'AIRCRAFT_STANDARDIZED_CURRICULUM', 'AIRCRAFT_MASTER_SCHEDULE',
       'OTHER_AWO', 'ALERTS', 'OTHER_AWARDS_INFORMATION_GUIDES',
       '8900.1_EDITORIAL_CORRECTIONS', 'OTHER_EFB_RESEARCH_REPORTS',
       'OTHER_EFB_CHECKLISTS', 'OTHER_FAA_90_DAY_SAFETY_REVIEW',
       'OTHER_CPD_6.03', 'AFS_FFS_UPDATEPUB', 'AFS_FFS_UPDATES', 'FOEB',
       'FSB_REPORTS', 'AFS_POLICY_DEV_MEMOS',
       'OTHER_FLIGHT_STANDARDS_ORM_WORKSHEETS', 'AFS_FOCUS_TEAMS',
       'GA_JTA', 'BULLETINS', 'OTHER_PS_HANDBOOKS',
       'OTHER_INFORMATION_GUIDES', 'PILOT_QUALIFICATION_CURRICULUM',
       'OTHER_INTERNATIONAL_PUBLICATIONS', 'OTHER_JOB_AIDS',
      

In [6]:
metadata_mapping

Unnamed: 0,Service,Document Type name in DRS,Document Type Name in API request,Metadata Name in DRS,Metadata Name in API Response,Metadata Value Data Type,Deafult Sort By
0,AIR,AD Final Rules,ADFRAWD,AD Number,drs:documentNumber,TEXT,False
1,AIR,AD Final Rules,ADFRAWD,Status,drs:status,TEXT,False
2,AIR,AD Final Rules,ADFRAWD,Docket Number,drs:adfrawdDocketNo,TEXT,False
3,AIR,AD Final Rules,ADFRAWD,Amendment,drs:adfrawdAmendment,TEXT,False
4,AIR,AD Final Rules,ADFRAWD,Office of Primary Responsibility,drs:primaryRespOffice,ARRAY,False
...,...,...,...,...,...,...,...
1581,Regulations,Legal Interpretations,LEGAL_INTERPRETATIONS,AB Reference,drs:ABReference,ARRAY,False
1582,Regulations,Legal Interpretations,LEGAL_INTERPRETATIONS,AD Reference,drs:ADReference,ARRAY,False
1583,Regulations,Legal Interpretations,LEGAL_INTERPRETATIONS,CAR Reference,drs:CARReference,ARRAY,False
1584,Regulations,Legal Interpretations,LEGAL_INTERPRETATIONS,Exemption Reference,drs:EXReference,ARRAY,False


### Test if collect of aspecific DRS doc from FAA DRS using "Document Type Name in API request"

In [13]:
response.json()

JSONDecodeError: Expecting value: line 1 column 1 (char 0)

In [5]:
import requests
import pandas as pd

doc_type = 'AB'

url = "https://drs.faa.gov/api/drs/data-pull/"+doc_type

print(url)

headers = {'x-api-key':'3cc99314a05bcef0a82a3aeb7b95d031'}
response = requests.get(url, headers=headers)

print(response)

print(pd.DataFrame.from_dict(response.json()['documents']))
#print(response.json()['summary']['hasMoreItems'])
#print(response.json()['summary']['totalItems'])

https://drs.faa.gov/api/drs/data-pull/AB
<Response [403]>


JSONDecodeError: Expecting value: line 1 column 1 (char 0)

### Collect full DRS doc index from FAA DRS

In [2]:
import logging
import requests
import os
import datetime
import time
import glob
import numpy as np

time_init = time.monotonic()
logging.basicConfig(filename='90_gendoc_collection.log', encoding='utf-8',format='%(asctime)s %(message)s', level=logging.INFO)


for doc_type in doc_types:
    df_drs = None
    hasMoreItems = True
    offset = 0
    logging.info("FileType: "+doc_type)

    while hasMoreItems:
        try:
            url = "https://drs.faa.gov/api/drs/data-pull/"+doc_type+"?offset="+str(offset)
            headers = {'x-api-key':'3cc99314a05bcef0a82a3aeb7b95d031'}
            response = requests.get(url, headers=headers)

            if df_drs is None:
                df_drs = pd.DataFrame.from_dict(response.json()['documents'], dtype="string")
            else:
                df_drs = pd.concat([df_drs, 
                                    pd.DataFrame.from_dict(response.json()['documents'], dtype="string")])
            
            hasMoreItems = response.json()['summary']['hasMoreItems'] and (offset<response.json()['summary']['totalItems'])
            logging.info("Passed with Offset: "+str(offset)+"/"+str(response.json()['summary']['totalItems'])+"; hasMoreItems="+str(response.json()['summary']['hasMoreItems'])+"; url: "+url)
            offset = offset + response.json()['summary']['count']
        except:
            logging.info("Failed with Offset: "+str(offset)+". DRS Error: "+response.json()['errorMessage']+"; url: "+url)
            hasMoreItems = False

    if df_drs is not None:
        df_drs.to_parquet(os.path.join(os.getcwd(),'data', 'DRS', 'index', doc_type+'_'+datetime.datetime.now().strftime("%Y%m%d - %H%M%S")+'.parquet'))

            

KeyError: 'errorMessage'

In [None]:
df_drsmeta = None

for doc_type in doc_types:
    parquet_files = sorted(glob.glob(os.path.join(os.getcwd(),'data', 'DRS', 'index',doc_type+'_202*.parquet')), 
                        key=os.path.getctime, 
                        reverse=True)

    if len(parquet_files)>0:
        this_parquet = pd.read_parquet(parquet_files[0])
        this_parquet.insert(0, "doc_type", doc_type)

        if df_drsmeta is None:
            df_drsmeta = this_parquet
        else:
            df_drsmeta = pd.concat([this_parquet,
                                    df_drsmeta])

### Just for fun - Collect from DRS using generated doc_types API names in case FAA didn't provide a complete file

In [3]:
import logging
import requests
import os
import datetime
import glob
import string

logging.basicConfig(filename='90_gendoc_collection.log', encoding='utf-8',format='%(asctime)s %(message)s', level=logging.INFO)

generated_doc_types = []
for letter_1 in list(string.ascii_uppercase):
    for letter_2 in list(string.ascii_uppercase):
        for letter_3 in list(string.ascii_uppercase):
            for letter_4 in list(string.ascii_uppercase):
                generated_doc_types.append(letter_1+letter_2+letter_3+letter_4)

for doc_type in generated_doc_types:
    if len(glob.glob(os.path.join(os.getcwd(),'database', 'all drs', doc_type+'*.parquet')))==0:
        df_drs = None
        hasMoreItems = True
        offset = 0
        logging.info("FileType: "+doc_type)

        while hasMoreItems:
            try:
                url = "https://drs.faa.gov/api/drs/data-pull/"+doc_type+"?offset="+str(offset)
                headers = {'x-api-key':'3cc99314a05bcef0a82a3aeb7b95d031'}
                response = requests.get(url, headers=headers)

                if df_drs is None:
                    df_drs = pd.DataFrame.from_dict(response.json()['documents'])
                else:
                    df_drs = pd.concat([df_drs, pd.DataFrame.from_dict(response.json()['documents'])])
                
                hasMoreItems = response.json()['summary']['hasMoreItems'] and (offset<response.json()['summary']['totalItems'])
                logging.info("Passed with Offset: "+str(offset)+"/"+str(response.json()['summary']['totalItems'])+"; hasMoreItems="+str(response.json()['summary']['hasMoreItems'])+"; url: "+url)
                offset = offset + response.json()['summary']['count']
            except:
                logging.info("Failed with Offset: "+str(offset)+". DRS Error: "+response.json()['errorMessage']+"; url: "+url)
                hasMoreItems = False

        if df_drs is not None:
            df_drs.to_parquet(os.path.join(os.getcwd(),'database','all drs',doc_type+'_'+datetime.datetime.now().strftime("%Y%m%d - %H%M%S")+'.parquet'))

## [OBSOLETE]  Collecting all DRS docs from FAA DRS

#### [OBSOLETE] Main function for DRS PDF collector 

In [2]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.wait import WebDriverWait
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.support import expected_conditions as EC
import time
import datetime
import os.path

def collect_drs_pdf(url, driver, doc_type, Document_Type_name_in_DRS, filename):
    driver.get(url)
    dl_dir = os.path.join(os.getcwd(), 'database', 'data','all drs', doc_type, 'pdf')

    try:        
        WebDriverWait(driver, 30).until(
            EC.presence_of_element_located((By.CSS_SELECTOR, '[title*="'+Document_Type_name_in_DRS+'"]')) # '[title="'+Document_Type_name_in_DRS+'"]'
        )

        is_text_only = len(driver.find_elements(By.ID, "printArea"))>0

        if is_text_only:
            with open(os.path.join(dl_dir, filename+'.txt'), 'w') as f:
                f.write(driver.find_element(By.ID, "printArea").text)
        else:
            WebDriverWait(driver, 30).until(
                EC.element_to_be_clickable((By.CSS_SELECTOR, "#download"))
            )
            time_counter = 0
            while driver.find_element(By.CSS_SELECTOR, "#loadingBar").get_attribute("class")!="hidden" and time_counter<60:
                time.sleep(0.5)
                time_counter = time_counter + 1
            driver.find_element(By.CSS_SELECTOR, "#download").click()

            doc_name_from_site = driver.find_element(By.CLASS_NAME, "doc-view").find_element(By.CLASS_NAME, "ng-star-inserted").text
            if doc_name_from_site[0:2]=="- ":
                doc_name_from_site = doc_name_from_site[2:]

            time_counter = 0
            while (not os.path.exists(os.path.join(dl_dir, doc_name_from_site))) and time_counter<60:
                time.sleep(0.2)
                time_counter = time_counter + 1
            time.sleep(0.2)

            list_of_files = glob.glob(os.path.join(dl_dir,"*.pdf"))
            latest_file = sorted(list_of_files, key=os.path.getctime, reverse=True)[:10]

            if (doc_name_from_site in [os.path.basename(val) for val in latest_file]):
                os.rename(os.path.join(dl_dir, doc_name_from_site), os.path.join(dl_dir, filename+".pdf"))

    except TimeoutException as ex:
        pass
    except:
        pass        


#### [OBSOLETE] Collects all DRS PDF no optimization using the previously collected full STC list from FAA DRS

In [2]:
from selenium import webdriver
from selenium.webdriver.firefox.options import Options
import os
import logging
import re
import glob
import pandas as pd
import os

options = Options()
logging.basicConfig(filename='90_gendoc_collection.log', encoding='utf-8',format='%(asctime)s %(message)s', level=logging.INFO)


for doc_type in doc_types:
    doc_type = 'AC'

    dl_dir = os.path.join(os.getcwd(), 'database', 'data', 'all drs', doc_type, 'pdf')

    options.set_preference("browser.download.folderList", 2)
    options.set_preference("browser.download.manager.showWhenStarting", False)
    options.set_preference("browser.download.dir", dl_dir)
    options.set_preference("browser.helperApps.neverAsk.saveToDisk", "application/x-gzip")

    Document_Type_name_in_DRS = metadata_mapping[metadata_mapping["Document Type Name in API request"]==doc_type]["Document Type name in DRS "].unique()[0]
    Metadata_Names_in_DRS =  metadata_mapping[metadata_mapping["Document Type Name in API request"]==doc_type]["Metadata Name in DRS"]

    if len(glob.glob(os.path.join(os.getcwd(), 'database', 'all drs', doc_type+'*.parquet')))>0:
        logging.info("DRS PDF collector no optimization for "+doc_type)

        df_drs = pd.read_parquet(glob.glob(os.path.join(os.getcwd(),'database','all drs',doc_type+'*.parquet'))[-1])
        print(df_drs.shape)

        driver = webdriver.Firefox(options=options)
        driver.maximize_window()

        for index, row in df_drs.iterrows():
            if (not os.path.exists(os.path.join(dl_dir, row['drs:status']+"__"+row['documentGuid']+".pdf"))) and (not os.path.exists(os.path.join(dl_dir, row['drs:status']+"__"+row['documentGuid']+".txt"))):
                try:
                    filename = row['drs:status']+"__"+row['documentGuid']
                    collect_drs_pdf(row['documentURL']+"?modalOpened=true", driver, doc_type, Document_Type_name_in_DRS, filename)
                    #driver.close()
                        
                    logging.info("Document download Pass for "+row['documentGuid'])
                except Exception as inst:
                    logging.info("Document download Fail for "+row['documentGuid'])

        driver.quit()

(1725, 23)
(1725, 23)


### DEbug

Collect entries

In [2]:
import logging
import requests
import os
df_drs = None
logging.basicConfig(filename='drs_data_collection.log', encoding='utf-8',format='%(asctime)s %(message)s', level=logging.INFO)

for doc_type in doc_types:
    hasMoreItems = True
    offset = 0
    logging.info("FileType: "+doc_type)

    while hasMoreItems:
        url = "https://drs.faa.gov/api/drs/data-pull/"+doc_type+"?drs:offset="+str(offset)
        headers = {'x-api-key':'3cc99314a05bcef0a82a3aeb7b95d031'}
        response = requests.get(url, headers=headers)

        if df_drs is None:
            df_drs = pd.DataFrame.from_dict(response.json()['documents'])
        else:
            df_drs = pd.concat([df_drs, pd.DataFrame.from_dict(response.json()['documents'])])
        
        hasMoreItems = response.json()['summary']['hasMoreItems']
        offset = offset + response.json()['summary']['offset']
        logging.info("Offset: "+offset)

        df_drs.to_csv('drs.xlsx',index=False)


KeyboardInterrupt: 

Collect pdf

In [None]:
def get_next_candidate_headers(this_header_index, headers_pattern):
    candidate_headers = []
    for n in range(0,len(this_header_index)):
            if this_header_index[n]<0:
                candidate_headers.append([headers_pattern[n][this_header_index[n]+1],n])
                break
            else:
                candidate_headers.append([headers_pattern[n][this_header_index[n]+1],n])
    
    return candidate_headers

print(get_next_candidate_headers([1,1,-1,-1], headers_patterns['AC27']))
print(get_next_candidate_headers([0,-1,-1,-1], headers_patterns['AC27']))

In [None]:
import pdfplumber
import re

with pdfplumber.open(r'/home/victor/Desktop/FAA NLP Project/AC/AC 21-40A Guide for Obtaining a Supplemental Type Certificate.pdf') as pdf:
    text = ''.join([str(this_page.extract_text())+'\n' for n,this_page in enumerate(pdf.pages) if 4<n<7])

lines_breakdown = re.findall('\\n', text)

n_init=0
n_end=0
this_header_index = [-1,-1,-1]
candidate_headers = get_next_candidate_headers(this_header_index, headers_patterns['AC21-40A'])
headers = []
while n_init<len(text):
    this_match = re.search('\\n', text[n_init:])
    n_end = this_match.start()    
    if n_end>1:
        for m,candidate in enumerate(candidate_headers):
            x = re.search(candidate[0], text[n_init:n_init+n_end])
            if x is not None:
                headers.append([this_header_index,candidate,x,n_init,n_end])
                
                this_header_index[candidate[1]] = this_header_index[candidate[1]]+1
                this_header_index[(candidate[1]+1):] = [-1]*len(this_header_index[(candidate[1]+1):])
                candidate_headers = get_next_candidate_headers(this_header_index, headers_patterns['AC21-40A'])

    n_init = n_init+n_end+1
    
print(headers)
    




    