## I. Decoding STC using Python's library pdfplumber

### Example use of library pdfplumber on one STC pdf

In [10]:
import pdfplumber
import os

# Path of one sample STC certificate, relative to the notebook's working directory.
# Other sample documents mentioned by the author: SB04185CH  SR04557NY
sample_pdf_path = os.path.join(os.getcwd(),'data','STC','raw data','pdf','SA2322CE-D__Current__A174E8B29C3F2AA6862585D800611438.pdf')

with pdfplumber.open(sample_pdf_path) as pdf:
    # extract_text() may return None for a page without extractable text (depends on the
    # pdfplumber version); fall back to '' so the literal string "None" never ends up in the text.
    pdf_pages = [(this_page.extract_text() or '')+'\n' for this_page in pdf.pages]

# Show the first page; a PDF without pages prints an empty line instead of raising IndexError.
print(pdf_pages[0] if pdf_pages else '')

United States of America
Department of Transportation
Federal Aviation Administration
Supplemental Type Certificate
Number:
SA2322CE-D
This certificate issued to: Honeywell International Inc.
21111 N 19th Ave Phoenix
Arizona 85027
certifies that the change in the type design for the following product with the limitations and conditions therefore as
4b
specified hereon meets the airworthiness requirements of Part of the CivilAirRegulations.
OriginalProduct–TypeCertificateNumber: Make:AvionisMarcelDassault(AMD)
Model:
A7EU FanjetFalconSeriesC,D,E,F
Mystere-Falcon20-C5,20-D5,20-E5,20-F5
DescriptionofTypeDesignChange:
InstallationofdualBendix/KingKFC400AutomaticFlightControlSystems.
REQUIREDDATA:1.MasterDrawingList155-9513-01,Rev.7,dated5-90and2.AirplaneFlightManualSupplement
006-00499-0000,Rev.1,dated6-14-90orlaterFAAapprovedrevisionto1or2.
LimitationsandConditions:
1.CompliancemustbeshownwithapplicableServiceBulletinsandairplanemodificationsaslistedontheRequired
Aircraft Modifications Do

## II. Create STC text database using Google Vision

### 1. Upload local files to google storage
https://console.cloud.google.com/storage/browser/faa-drs;tab=objects?project=drs-stc&prefix=&forceOnObjectsSortingFiltering=false

In [12]:
from google.cloud import storage
# Storage client created with default settings (no project or credentials passed explicitly).
storage_client = storage.Client()
# Iterator over the objects of the 'faa-drs' bucket; it is consumed by the loop in the next cell.
blobs = storage_client.list_blobs('faa-drs')

In [13]:
# Consume the listing created in the previous cell and show each object name.
for blob in blobs:
    print(blob.name)

stc/
stc/SA2322CE-D__Current__A174E8B29C3F2AA6862585D800611438.pdf


In [15]:
from google.cloud import storage


def list_blobs(bucket_name, prefix=None):
    """Return the names of the objects stored in a Cloud Storage bucket.

    Args:
        bucket_name: Name of the bucket to list, e.g. "your-bucket-name".
        prefix: Optional object-name prefix (e.g. "pdf/"). When given, only
            objects whose name starts with it are listed, so callers do not
            need to fetch the whole listing and filter it themselves.
            The default (None) lists every object, exactly as before.

    Returns:
        A list of object names (str), in the order returned by the service.
    """
    storage_client = storage.Client()

    # Note: Client.list_blobs requires at least package version 1.17.0.
    # The call returns a response only when the iterator is consumed, which
    # the list comprehension below does in full.
    blobs = storage_client.list_blobs(bucket_name, prefix=prefix)

    return [blob.name for blob in blobs]

In [3]:
from google.cloud import storage


def upload_blob(bucket_name, source_file_name, destination_blob_name):
    """Upload one local file to a Cloud Storage bucket.

    Args:
        bucket_name: ID of the target GCS bucket, e.g. "your-bucket-name".
        source_file_name: Path of the local file, e.g. "local/path/to/file".
        destination_blob_name: ID of the GCS object to create,
            e.g. "storage-object-name".
    """
    target_blob = storage.Client().bucket(bucket_name).blob(destination_blob_name)

    # Generation-match precondition, used to avoid race conditions and data
    # corruption: 0 is the value to use for a destination object that does not
    # exist yet; the upload request is aborted when the precondition does not
    # hold. For an object that already exists, its generation number would have
    # to be passed instead.
    target_blob.upload_from_filename(source_file_name, if_generation_match=0)

    print(f"File {source_file_name} uploaded to {destination_blob_name}.")

In [16]:
# Display the object names currently stored in the "faa-drs" bucket.
list_blobs("faa-drs")

['SA2322CE-D__Current__A174E8B29C3F2AA6862585D800611438.pdf']

In [1]:
import os
import glob

# Local folder holding the downloaded STC PDFs.
dl_dir = os.path.join(os.getcwd(),'database','data','stc','pdf')

# Base names of the PDFs already present in the bucket under "pdf/".
# The bucket is listed once (the previous version listed it twice and threw the
# first result away).
uploaded_pdfs = [os.path.basename(file) for file in list_blobs("faa-drs") if file.startswith("pdf/")]
# Set copy for constant-time membership tests in the filter below.
uploaded_pdf_names = set(uploaded_pdfs)

# Keep local files that are larger than 200 bytes and not uploaded yet.
# NOTE(review): the 200-byte threshold presumably filters out failed/placeholder
# downloads — confirm against the download step.
list_of_real_pdfs = [file for file in glob.glob(os.path.join(dl_dir,"*.pdf"))
                     if os.path.getsize(file)>200 and os.path.basename(file) not in uploaded_pdf_names]

for filename in list_of_real_pdfs:
    upload_blob("faa-drs",
                filename,
                "pdf/"+os.path.basename(filename))

NameError: name 'list_blobs' is not defined

### 2. PDF to Text using Google Vision
From https://cloud.google.com/vision/docs/pdf#vision_text_detection_pdf_gcs-python:
* https://cloud.google.com/storage/pricing
* https://cloud.google.com/vision/docs/pdf
* gcloud projects create drs-stc
* gcloud config set project drs-stc
* gcloud services enable vision.googleapis.com
* gcloud projects add-iam-policy-binding drs-stc --member="user:victor.girondin@gmail.com" --role=roles/owner

In [43]:
def async_detect_document(gcs_source_uri, gcs_destination_uri):
    """OCR a PDF stored on GCS with Google Vision and return its full text.

    The Vision API writes its results as JSON files whose names start with the
    object prefix given by ``gcs_destination_uri``. Those files are then read
    back and the recognised text of every page is concatenated.

    Args:
        gcs_source_uri: ``gs://bucket/object`` URI of the PDF to process.
        gcs_destination_uri: ``gs://bucket/prefix`` URI under which the JSON
            output files are written.

    Returns:
        The text of every page that carries a ``fullTextAnnotation``, joined
        with blank lines, in page order.

    Raises:
        ValueError: if ``gcs_destination_uri`` does not look like
            ``gs://bucket/prefix``.
    """
    import json
    import re
    from google.cloud import vision
    from google.cloud import storage

    # Validate the destination first, so that a malformed URI fails before the
    # OCR request is submitted rather than after it has completed.
    match = re.match(r"gs://([^/]+)/(.+)", gcs_destination_uri)
    if match is None:
        raise ValueError(
            f"gcs_destination_uri must look like gs://bucket/prefix, got {gcs_destination_uri!r}"
        )
    bucket_name = match.group(1)
    prefix = match.group(2)

    # Supported mime_types are: 'application/pdf' and 'image/tiff'
    mime_type = "application/pdf"

    # How many pages should be grouped into each json output file.
    batch_size = 2

    client = vision.ImageAnnotatorClient()

    feature = vision.Feature(type_=vision.Feature.Type.DOCUMENT_TEXT_DETECTION)

    gcs_source = vision.GcsSource(uri=gcs_source_uri)
    input_config = vision.InputConfig(gcs_source=gcs_source, mime_type=mime_type)

    gcs_destination = vision.GcsDestination(uri=gcs_destination_uri)
    output_config = vision.OutputConfig(
        gcs_destination=gcs_destination, batch_size=batch_size
    )

    async_request = vision.AsyncAnnotateFileRequest(
        features=[feature], input_config=input_config, output_config=output_config
    )

    operation = client.async_batch_annotate_files(requests=[async_request])

    print("Waiting for the operation to finish.")
    # Block until the OCR job is done (waits at most 420 seconds).
    operation.result(timeout=420)

    # Once the request has completed and the output has been
    # written to GCS, we can list all the output files.
    storage_client = storage.Client()
    bucket = storage_client.get_bucket(bucket_name)

    def first_page_of(blob):
        # Output objects are named "<prefix>output-<first>-to-<last>.json".
        page_range = re.search(r"output-(\d+)-to-(\d+)\.json$", blob.name)
        return int(page_range.group(1)) if page_range else float("inf")

    # List objects with the given prefix, filtering out folders. The listing is
    # alphabetical ("output-11-to-12" sorts before "output-3-to-4"), so sort on
    # the numeric first page to keep long documents in reading order.
    blob_list = sorted(
        (blob for blob in bucket.list_blobs(prefix=prefix) if not blob.name.endswith("/")),
        key=lambda blob: (first_page_of(blob), blob.name),
    )
    print("Output files:")
    for blob in blob_list:
        print(blob.name)

    # Each output file holds the responses for up to `batch_size` pages. A page
    # response contains more than the text (pages/blocks/paragraphs/words/
    # symbols, confidence scores, bounding boxes); only the text is kept.
    # Pages without a fullTextAnnotation are skipped.
    outputs = []
    for blob in blob_list:
        response = json.loads(blob.download_as_bytes().decode("utf-8"))
        outputs.append("\n\n".join(page_response["fullTextAnnotation"]["text"]
                                   for page_response in response["responses"]
                                   if "fullTextAnnotation" in page_response))

    return "\n\n".join(outputs)

### A simple example to try

In [48]:
import os
import json
import re
from google.cloud import vision
from google.cloud import storage

# Smoke test: OCR a single PDF from the bucket; the JSON results are written to the
# same bucket under the object prefix "bbd1_".
toto = async_detect_document("gs://faa-drs/bbd1.pdf", "gs://faa-drs/bbd1_")

print(toto)

# NOTE(review): hard-coded, machine-specific absolute path. In this cell it is only
# referenced by the commented-out write below.
text_dir = r'C:\Users\victor\Downloads'#'C:\Users\victor\Documents\DeepLearning\FAA NLP Project\database\data\stc\text-from-pdf'

# Optional: save the extracted text next to the other text files.
# with open(os.path.join(text_dir, "SA2322CE-D__Current__A174E8B29C3F2AA6862585D800611438.txt"), 'w') as f:
#     f.write(toto)

Waiting for the operation to finish.
Output files:
bbd1_output-1-to-2.json
bbd1_output-3-to-3.json
FEDER
NOLL
This certificate issued to: Honeywell International Inc.
21111 N 19th Ave Phoenix
Arizona 85027
A7EU
United States of America
Department of Transportation
Federal Aviation Administration
Supplemental Type Certificate
certifies that the change in the type design for the following product with the limitations and conditions therefore as
specified hereon meets the airworthiness requirements of Part 46 of the Civil Air Regulations.
Original Product Type Certificate Number:
Description of Type Design Change:
Number: SA2322CE-D
Date of Application: 11-9-89
Make: Avionis Marcel Dassault (AMD)
Model:
Installation of dual Bendix/King KFC 400 Automatic Flight Control Systems.
REQUIRED DATA: 1. Master Drawing List 155-9513-01, Rev. 7, dated 5-90 and 2. Airplane Flight Manual Supplement
006-00499-0000, Rev. 1, dated 6-14-90 or later FAA approved revision to 1 or 2.
Date of Issuance: 4-19-8

### Go through the PDFs in Google Cloud, apply the Vision API and retrieve the text (download_as_string)

In [None]:
import os
import re
import glob
import random

# Local folder receiving one .txt file per OCR'd PDF; created if missing so the
# write at the end of each iteration cannot fail for that reason.
text_dir = os.path.join(os.getcwd(),'database','data','stc','text-from-pdf')
os.makedirs(text_dir, exist_ok=True)

# Base names (without ".txt") of the documents that already have a text file.
list_of_texts = [os.path.basename(file)[:-4] for file in glob.glob(os.path.join(text_dir,"*.txt"))]
all_blobs = list_blobs("faa-drs")

# PDFs under "pdf/" in the bucket that do not have a local text file yet.
# The filter is written once and reused for both the count and the loop.
already_extracted = set(list_of_texts)
pdfs_to_process = [blob for blob in all_blobs
                   if blob.startswith("pdf/") and blob.endswith(".pdf")
                   and os.path.basename(blob)[:-4] not in already_extracted]

print(str(len(pdfs_to_process))+" missing over "+str(len(all_blobs)))

# Process in random order.
# NOTE(review): presumably so that several runs started in parallel do not work
# on the same documents in the same order — confirm.
random.shuffle(pdfs_to_process)

for blob in pdfs_to_process:
    print(blob)
    try:
        # "pdf/<name>.pdf" -> OCR output prefix "text-from-pdf/<name>" in the same bucket.
        extracted_text = async_detect_document("gs://faa-drs/"+blob,
                                               "gs://faa-drs/text-from-pdf/"+blob[len("pdf/"):-len(".pdf")])
        # Explicit UTF-8: with the platform default encoding, OCR text containing
        # characters outside that code page makes the write fail after the OCR
        # has already been performed.
        with open(os.path.join(text_dir, os.path.basename(blob[:-4])+".txt"), 'w', encoding='utf-8') as f:
            f.write(extracted_text)
    except Exception as error:
        # Best effort: report the cause and carry on with the next document.
        # Unlike a bare `except:`, this lets KeyboardInterrupt stop the loop.
        print("it failed :( "+repr(error))

### Retrieve the json from Google cloud

In [None]:
import os
import glob
from google.cloud import storage

# Local folder receiving the raw Vision JSON outputs; created if missing.
json_dir = os.path.join(os.getcwd(),'database','data','stc','json-from-pdf')
os.makedirs(json_dir, exist_ok=True)
# Base names (without ".json") of the outputs that are already downloaded.
list_of_jsons = [os.path.basename(file)[:-5] for file in glob.glob(os.path.join(json_dir,"*.json"))]
downloaded_jsons = set(list_of_jsons)

storage_client = storage.Client()
bucket = storage_client.bucket("faa-drs")

# List the OCR outputs now instead of reusing `all_blobs` from the OCR cell:
# that listing is taken before the OCR runs, so it does not contain the JSON
# files produced by that run, and it is undefined on a fresh kernel.
for blob in bucket.list_blobs(prefix="text-from-pdf/"):
    blob_name = blob.name
    if blob_name.endswith(".json") and os.path.basename(blob_name)[:-5] not in downloaded_jsons:
        print(blob_name)
        try:
            blob.download_to_filename(os.path.join(json_dir, os.path.basename(blob_name)))
        except Exception as error:
            # Best effort: report the cause and continue with the next file.
            print("it failed :( "+repr(error))

### Go through the locally stored pdfs and retrieve the text (alternative using download_as_text)

In [7]:
import os
import pandas as pd
# Load the STC metadata table and drop exact duplicate rows.
df_stc = pd.read_excel(os.path.join(os.getcwd(),'database','stc.xlsx'))
df_stc = df_stc.drop_duplicates()
# Normalise holder names by removing the company suffix "Inc" in its variants
# (", Inc.", ", Inc", " Inc.", " Inc"). The word boundary keeps words that merely
# start with "Inc" intact: the previous chained str.replace turned
# "X Incorporated" into "Xorporated". Missing holders stay missing instead of
# raising AttributeError.
df_stc['drs:stcHolder'] = df_stc['drs:stcHolder'].str.replace(r",? Inc\b\.?", "", regex=True)

print(df_stc.shape)

(77661, 26)


In [None]:
from google.cloud import storage

# Handle on the "faa-drs" bucket.
storage_client = storage.Client()
bucket = storage_client.bucket("faa-drs")

# Every object whose name starts with 'text-from-pdf', leaving out "folder"
# placeholders (names ending in '/').
blob_list = [
    candidate
    for candidate in bucket.list_blobs(prefix='text-from-pdf')
    if not candidate.name.endswith('/')
]

In [None]:
import os
import glob
import json

# Local folder receiving one .txt file per document; created if missing.
text_dir = os.path.join(os.getcwd(),'database','data','stc','text-from-pdf-alternate')
os.makedirs(text_dir, exist_ok=True)
# Text files that already exist; a document whose GUID appears in one of these paths is skipped.
processed_pdfs = glob.glob(os.path.join(text_dir,'*.txt'))


def _first_page(blob):
    """First page number encoded in an OCR output name '<stem>output-<first>-to-<last>.json'."""
    first = blob.name.rsplit("output-", 1)[-1].split("-to-")[0]
    # Names that do not follow the pattern sort last.
    return int(first) if first.isdigit() else float("inf")


for documentGuid in list(df_stc['documentGuid']):
    # Skip missing identifiers: NaN is not a string, and an empty string would
    # match every blob name.
    if not isinstance(documentGuid, str) or not documentGuid:
        continue
    if any(documentGuid in val for val in processed_pdfs):
        continue

    # OCR output files of this document, in page order. The bucket listing is
    # alphabetical ("output-11-to-12" comes before "output-3-to-4"), hence the
    # numeric sort.
    matching_blobs = sorted((blob for blob in blob_list if documentGuid in blob.name),
                            key=lambda blob: (_first_page(blob), blob.name))
    if not matching_blobs:
        # No OCR output for this document. The previous version fell through
        # and wrote an empty string to the file named after the last blob of an
        # earlier document, overwriting that document's text.
        continue

    page_texts = []
    for blob in matching_blobs:
        response = json.loads(blob.download_as_text())
        page_texts.append("\n\n".join([page_response['fullTextAnnotation']['text'] for page_response in response['responses'] if ('fullTextAnnotation' in page_response)]))

    outputs = "\n\n".join(page_texts)

    # File name: blob name without the "text-from-pdf/" prefix (14 characters)
    # and without the "output-<first>-to-<last>.json" tail.
    output_name = matching_blobs[0].name[14:-5].split("output")[0]+".txt"
    # Explicit UTF-8 so OCR text never fails on the platform default encoding.
    with open(os.path.join(text_dir, output_name), 'w', encoding='utf-8') as f:
        f.write(outputs)

## III. Google Vision OCR on all DRS

### 1. Upload local files to google storage
https://console.cloud.google.com/storage/browser/faa-drs;tab=objects?project=drs-stc&prefix=&forceOnObjectsSortingFiltering=false

In [1]:
from google.cloud import storage


def list_blobs(bucket_name, prefix=None):
    """Return the names of the objects stored in a Cloud Storage bucket.

    Args:
        bucket_name: Name of the bucket to list, e.g. "your-bucket-name".
        prefix: Optional object-name prefix (e.g. "pdf/"). When given, only
            objects whose name starts with it are listed, so callers do not
            need to fetch the whole listing and filter it themselves.
            The default (None) lists every object, exactly as before.

    Returns:
        A list of object names (str), in the order returned by the service.
    """
    storage_client = storage.Client()

    # Note: Client.list_blobs requires at least package version 1.17.0.
    # The call returns a response only when the iterator is consumed, which
    # the list comprehension below does in full.
    blobs = storage_client.list_blobs(bucket_name, prefix=prefix)

    return [blob.name for blob in blobs]

: 

In [None]:
from google.cloud import storage


def upload_blob(bucket_name, source_file_name, destination_blob_name):
    """Upload one local file to a Cloud Storage bucket.

    Args:
        bucket_name: ID of the target GCS bucket, e.g. "your-bucket-name".
        source_file_name: Path of the local file, e.g. "local/path/to/file".
        destination_blob_name: ID of the GCS object to create,
            e.g. "storage-object-name".
    """
    target_blob = storage.Client().bucket(bucket_name).blob(destination_blob_name)

    # Generation-match precondition, used to avoid race conditions and data
    # corruption: 0 is the value to use for a destination object that does not
    # exist yet; the upload request is aborted when the precondition does not
    # hold. For an object that already exists, its generation number would have
    # to be passed instead.
    target_blob.upload_from_filename(source_file_name, if_generation_match=0)

    print(f"File {source_file_name} uploaded to {destination_blob_name}.")

In [None]:
import os
import glob

# Local folder holding the downloaded STC PDFs.
dl_dir = os.path.join(os.getcwd(),'database','data','stc','pdf')

# Base names of the PDFs already present in the bucket under "pdf/".
# The bucket is listed once (the previous version listed it twice and threw the
# first result away).
uploaded_pdfs = [os.path.basename(file) for file in list_blobs("faa-drs") if file.startswith("pdf/")]
# Set copy for constant-time membership tests in the filter below.
uploaded_pdf_names = set(uploaded_pdfs)

# Keep local files that are larger than 200 bytes and not uploaded yet.
# NOTE(review): the 200-byte threshold presumably filters out failed/placeholder
# downloads — confirm against the download step.
list_of_real_pdfs = [file for file in glob.glob(os.path.join(dl_dir,"*.pdf"))
                     if os.path.getsize(file)>200 and os.path.basename(file) not in uploaded_pdf_names]

for filename in list_of_real_pdfs:
    upload_blob("faa-drs",
                filename,
                "pdf/"+os.path.basename(filename))

### 2. PDF to Text using Google Vision
From https://cloud.google.com/vision/docs/pdf#vision_text_detection_pdf_gcs-python:
* https://cloud.google.com/storage/pricing
* https://cloud.google.com/vision/pricing
* https://cloud.google.com/vision/docs/pdf
* gcloud projects create drs-stc
* gcloud config set project drs-stc
* gcloud services enable vision.googleapis.com
* gcloud projects add-iam-policy-binding drs-stc --member="user:victor.girondin@gmail.com" --role=roles/owner

In [None]:
def async_detect_document(gcs_source_uri, gcs_destination_uri):
    """OCR a PDF stored on GCS with Google Vision and return its full text.

    The Vision API writes its results as JSON files whose names start with the
    object prefix given by ``gcs_destination_uri``. Those files are then read
    back and the recognised text of every page is concatenated.

    Args:
        gcs_source_uri: ``gs://bucket/object`` URI of the PDF to process.
        gcs_destination_uri: ``gs://bucket/prefix`` URI under which the JSON
            output files are written.

    Returns:
        The text of every page that carries a ``fullTextAnnotation``, joined
        with blank lines, in page order.

    Raises:
        ValueError: if ``gcs_destination_uri`` does not look like
            ``gs://bucket/prefix``.
    """
    import json
    import re
    from google.cloud import vision
    from google.cloud import storage

    # Validate the destination first, so that a malformed URI fails before the
    # OCR request is submitted rather than after it has completed.
    match = re.match(r'gs://([^/]+)/(.+)', gcs_destination_uri)
    if match is None:
        raise ValueError(
            f'gcs_destination_uri must look like gs://bucket/prefix, got {gcs_destination_uri!r}')
    bucket_name = match.group(1)
    prefix = match.group(2)

    # Supported mime_types are: 'application/pdf' and 'image/tiff'
    mime_type = 'application/pdf'

    # How many pages should be grouped into each json output file.
    batch_size = 2

    client = vision.ImageAnnotatorClient()

    feature = vision.Feature(
        type_=vision.Feature.Type.DOCUMENT_TEXT_DETECTION)

    gcs_source = vision.GcsSource(uri=gcs_source_uri)
    input_config = vision.InputConfig(
        gcs_source=gcs_source, mime_type=mime_type)

    gcs_destination = vision.GcsDestination(uri=gcs_destination_uri)
    output_config = vision.OutputConfig(
        gcs_destination=gcs_destination, batch_size=batch_size)

    async_request = vision.AsyncAnnotateFileRequest(
        features=[feature], input_config=input_config,
        output_config=output_config)

    operation = client.async_batch_annotate_files(
        requests=[async_request])

    # Block until the OCR job is done (waits at most 420 seconds).
    operation.result(timeout=420)

    # Once the request has completed and the output has been
    # written to GCS, we can list all the output files.
    storage_client = storage.Client()
    bucket = storage_client.get_bucket(bucket_name)

    def first_page_of(blob):
        # Output objects are named '<prefix>output-<first>-to-<last>.json'.
        page_range = re.search(r'output-(\d+)-to-(\d+)\.json$', blob.name)
        return int(page_range.group(1)) if page_range else float('inf')

    # List objects with the given prefix, filtering out folders. The listing is
    # alphabetical ('output-11-to-12' sorts before 'output-3-to-4'), so sort on
    # the numeric first page to keep long documents in reading order.
    blob_list = sorted(
        (blob for blob in bucket.list_blobs(prefix=prefix)
         if not blob.name.endswith('/')),
        key=lambda blob: (first_page_of(blob), blob.name))
    print('Output files:')
    for blob in blob_list:
        print(blob.name)

    # Each output file holds the responses for up to `batch_size` pages. A page
    # response contains more than the text (pages/blocks/paragraphs/words/
    # symbols, confidence scores, bounding boxes); only the text is kept.
    # Pages without a fullTextAnnotation are skipped.
    outputs = []
    for blob in blob_list:
        response = json.loads(blob.download_as_bytes().decode('utf-8'))
        outputs.append("\n\n".join(page_response['fullTextAnnotation']['text']
                                   for page_response in response['responses']
                                   if 'fullTextAnnotation' in page_response))

    return "\n\n".join(outputs)

### Go through the PDFs in Google Cloud, apply the Vision API and retrieve the text (download_as_string)

In [None]:
import os
import pandas as pd
# Load the STC metadata table and drop exact duplicate rows.
df_stc = pd.read_excel(os.path.join(os.getcwd(),'database','stc.xlsx'))
df_stc = df_stc.drop_duplicates()
# Normalise holder names by removing the company suffix "Inc" in its variants
# (", Inc.", ", Inc", " Inc.", " Inc"). The word boundary keeps words that merely
# start with "Inc" intact: the previous chained str.replace turned
# "X Incorporated" into "Xorporated". Missing holders stay missing instead of
# raising AttributeError.
df_stc['drs:stcHolder'] = df_stc['drs:stcHolder'].str.replace(r",? Inc\b\.?", "", regex=True)

print(df_stc.shape)

In [None]:
from google.cloud import storage

# Handle on the "faa-drs" bucket.
storage_client = storage.Client()
bucket = storage_client.bucket("faa-drs")

# Every object whose name starts with 'text-from-pdf', leaving out "folder"
# placeholders (names ending in '/').
blob_list = [
    candidate
    for candidate in bucket.list_blobs(prefix='text-from-pdf')
    if not candidate.name.endswith('/')
]

In [None]:
import os
import glob
import json

# Local folder receiving one .txt file per document; created if missing.
text_dir = os.path.join(os.getcwd(),'database','data','stc','text-from-pdf-alternate')
os.makedirs(text_dir, exist_ok=True)
# Text files that already exist; a document whose GUID appears in one of these paths is skipped.
processed_pdfs = glob.glob(os.path.join(text_dir,'*.txt'))


def _first_page(blob):
    """First page number encoded in an OCR output name '<stem>output-<first>-to-<last>.json'."""
    first = blob.name.rsplit("output-", 1)[-1].split("-to-")[0]
    # Names that do not follow the pattern sort last.
    return int(first) if first.isdigit() else float("inf")


for documentGuid in list(df_stc['documentGuid']):
    # Skip missing identifiers: NaN is not a string, and an empty string would
    # match every blob name.
    if not isinstance(documentGuid, str) or not documentGuid:
        continue
    if any(documentGuid in val for val in processed_pdfs):
        continue

    # OCR output files of this document, in page order. The bucket listing is
    # alphabetical ("output-11-to-12" comes before "output-3-to-4"), hence the
    # numeric sort.
    matching_blobs = sorted((blob for blob in blob_list if documentGuid in blob.name),
                            key=lambda blob: (_first_page(blob), blob.name))
    if not matching_blobs:
        # No OCR output for this document. The previous version fell through
        # and wrote an empty string to the file named after the last blob of an
        # earlier document, overwriting that document's text.
        continue

    page_texts = []
    for blob in matching_blobs:
        response = json.loads(blob.download_as_text())
        page_texts.append("\n\n".join([page_response['fullTextAnnotation']['text'] for page_response in response['responses'] if ('fullTextAnnotation' in page_response)]))

    outputs = "\n\n".join(page_texts)

    # File name: blob name without the "text-from-pdf/" prefix (14 characters)
    # and without the "output-<first>-to-<last>.json" tail.
    output_name = matching_blobs[0].name[14:-5].split("output")[0]+".txt"
    # Explicit UTF-8 so OCR text never fails on the platform default encoding.
    with open(os.path.join(text_dir, output_name), 'w', encoding='utf-8') as f:
        f.write(outputs)