## I. Filter and copy altered versions that were not properly decoded by google vision

In [12]:
import pandas as pd
import numpy as np
import os

df_stc = pd.read_parquet(os.path.join(os.getcwd(),'data','STC','stc_metadata_and_text.parquet'))

print(df_stc.shape)
df_stc = df_stc.drop(df_stc.index[[0]])
print(df_stc.shape)

start_dates = pd.to_datetime(pd.DataFrame({'year': np.linspace(1950, 2020, 8)})['year'], format="%Y").dt.date
end_dates = pd.to_datetime(pd.DataFrame({'year': np.linspace(1960, 2030, 8)})['year'], format="%Y").dt.date

(43818, 30)
(43808, 30)


In [13]:
import matplotlib.pyplot as plt
import numpy as np

plt.rcParams['figure.dpi'] = 100
plt.rcParams['figure.figsize'] = 6.4, 4.8

for content in ['descriptions', 'limitations']:
    df_stc[content+' totalwords'] = df_stc[content].str.split().str.len()
    print(df_stc[content+' totalwords'].describe())   

count    43808.000000
mean        45.886687
std         34.376243
min          0.000000
25%         28.000000
50%         39.000000
75%         58.000000
max       1285.000000
Name: descriptions totalwords, dtype: float64
count    43808.000000
mean        91.006232
std         54.277106
min          0.000000
25%         54.000000
50%         83.000000
75%        118.000000
max        927.000000
Name: limitations totalwords, dtype: float64


In [14]:
min_number_of_words = 5 # 20 'etait beaucoup trop severe

df_deficient_stc = df_stc.loc[(df_stc['descriptions totalwords']<min_number_of_words) | (df_stc['limitations totalwords']<min_number_of_words)]
df_deficient_stc.shape

(4049, 32)

In [19]:
import glob
import sys
import fitz
import os
from PIL import Image

# To get better resolution
zoom_x = 1.7
zoom_y = 1.7
mat = fitz.Matrix(zoom_x, zoom_y)

pdf_path = r'C:\Users\victor\Documents\DeepLearning\FAA NLP Project\data\STC\raw data\pdf'
pdf_path_deficient = r'C:\Users\victor\Documents\DeepLearning\FAA NLP Project\data\STC\raw data\pdf-deficient_minwords='+str(min_number_of_words)

for index, row in df_deficient_stc.iterrows():
    stc_filename = row['drs:chronicleId']+'__'+row['drs:status']+'__'+row['documentGuid']
    stc_path = os.path.join(pdf_path, stc_filename+'.pdf')
    if os.path.exists(stc_path):
        doc = fitz.open(stc_path)

        stc_image_paths = []

        for page in doc:
            stc_image_paths.append(os.path.join(r"C:\Users\victor\Downloads", stc_filename+"-%i.jpeg" % page.number))
            pix = page.get_pixmap(matrix=mat)
            pix.save(stc_image_paths[-1])

        images = [Image.open(f) for f in stc_image_paths]
        images[0].save(os.path.join(pdf_path_deficient, stc_filename+'.pdf'), "PDF" , resolution=100.0, save_all=True, append_images=images[1:])

        for image_path in stc_image_paths:
            os.remove(image_path)

        break

## II. Upload those deficient STCs to google cloud
* https://console.cloud.google.com/storage/browser/faa-drs/

In [3]:
from google.cloud import storage


def list_blobs(bucket_name):
    """Lists all the blobs in the bucket."""
    # bucket_name = "your-bucket-name"

    storage_client = storage.Client()

    # Note: Client.list_blobs requires at least package version 1.17.0.
    blobs = storage_client.list_blobs(bucket_name)

    # Note: The call returns a response only when the iterator is consumed.
    #for blob in blobs:
    #    print(blob.name)

    return [blob.name for blob in blobs]

In [4]:
from google.cloud import storage


def upload_blob(bucket_name, source_file_name, destination_blob_name):
    """Uploads a file to the bucket."""
    # The ID of your GCS bucket
    # bucket_name = "your-bucket-name"
    # The path to your file to upload
    # source_file_name = "local/path/to/file"
    # The ID of your GCS object
    # destination_blob_name = "storage-object-name"

    storage_client = storage.Client()
    bucket = storage_client.bucket(bucket_name)
    blob = bucket.blob(destination_blob_name)

    # Optional: set a generation-match precondition to avoid potential race conditions
    # and data corruptions. The request to upload is aborted if the object's
    # generation number does not match your precondition. For a destination
    # object that does not yet exist, set the if_generation_match precondition to 0.
    # If the destination object already exists in your bucket, set instead a
    # generation-match precondition using its generation number.
    generation_match_precondition = 0

    blob.upload_from_filename(source_file_name, if_generation_match=generation_match_precondition)

    print(
        f"File {source_file_name} uploaded to {destination_blob_name}."
    )

In [3]:
import os
import glob

dl_dir = os.path.join(os.path.join(os.getcwd(),'data','STC','raw data','pdf-deficient'))

uploaded_pdfs = [os.path.basename(file) for file in list_blobs("faa-drs") if file[:14]=="pdf-deficient/"]
list_of_real_pdfs = [file for file in glob.glob(os.path.join(dl_dir,"*.pdf")) if os.path.getsize(file)>200 and os.path.basename(file) not in uploaded_pdfs]

for filename in list_of_real_pdfs:
    upload_blob("faa-drs", 
                filename, 
                "pdf-deficient/"+os.path.basename(filename))

File c:\Users\victor\Documents\DeepLearning\FAA NLP Project\data\STC\raw data\pdf-deficient\SA2300CE__Current__C6F3368F5999BA59862580F000627A6A.pdf uploaded to pdf-deficient/SA2300CE__Current__C6F3368F5999BA59862580F000627A6A.pdf.
File c:\Users\victor\Documents\DeepLearning\FAA NLP Project\data\STC\raw data\pdf-deficient\SA2300CE__Historical__9B3BF77BD306A14A86256F7E0056CC97.pdf uploaded to pdf-deficient/SA2300CE__Historical__9B3BF77BD306A14A86256F7E0056CC97.pdf.
File c:\Users\victor\Documents\DeepLearning\FAA NLP Project\data\STC\raw data\pdf-deficient\SA2301SW__Current__46FC114950AB355E86258202006B5C0A.pdf uploaded to pdf-deficient/SA2301SW__Current__46FC114950AB355E86258202006B5C0A.pdf.
File c:\Users\victor\Documents\DeepLearning\FAA NLP Project\data\STC\raw data\pdf-deficient\SA2301WE__Current__E5B3964FEB66552986257D7B006C156C.pdf uploaded to pdf-deficient/SA2301WE__Current__E5B3964FEB66552986257D7B006C156C.pdf.
File c:\Users\victor\Documents\DeepLearning\FAA NLP Project\data\STC\r

## III. Retrieve STC text database using Google Vision

In [1]:
def async_detect_document(gcs_source_uri, gcs_destination_uri):
    """OCR with PDF/TIFF as source files on GCS"""
    import json
    import re
    from google.cloud import vision
    from google.cloud import storage

    # Supported mime_types are: 'application/pdf' and 'image/tiff'
    mime_type = "application/pdf"

    # How many pages should be grouped into each json output file.
    batch_size = 2

    client = vision.ImageAnnotatorClient()

    feature = vision.Feature(type_=vision.Feature.Type.DOCUMENT_TEXT_DETECTION) # DOCUMENT_TEXT_DETECTION

    gcs_source = vision.GcsSource(uri=gcs_source_uri)
    input_config = vision.InputConfig(gcs_source=gcs_source, mime_type=mime_type)

    gcs_destination = vision.GcsDestination(uri=gcs_destination_uri)
    output_config = vision.OutputConfig(
        gcs_destination=gcs_destination, batch_size=batch_size
    )

    async_request = vision.AsyncAnnotateFileRequest(
        features=[feature], input_config=input_config, output_config=output_config
    )

    operation = client.async_batch_annotate_files(requests=[async_request])

    print("Waiting for the operation to finish.")
    operation.result(timeout=420)

    # Once the request has completed and the output has been
    # written to GCS, we can list all the output files.
    storage_client = storage.Client()

    match = re.match(r"gs://([^/]+)/(.+)", gcs_destination_uri)
    bucket_name = match.group(1)
    prefix = match.group(2)

    bucket = storage_client.get_bucket(bucket_name)

    # List objects with the given prefix, filtering out folders.
    blob_list = [
        blob
        for blob in list(bucket.list_blobs(prefix=prefix))
        if not blob.name.endswith("/")
    ]
    print("Output files:")
    for blob in blob_list:
        print(blob.name)

    # Process the first output file from GCS.
    # Since we specified batch_size=2, the first response contains
    # the first two pages of the input file.
    output = blob_list[0]

    json_string = output.download_as_bytes().decode("utf-8")
    response = json.loads(json_string)

    # The actual response for the first page of the input file.
    first_page_response = response["responses"][0]
    annotation = first_page_response["fullTextAnnotation"]

    # Here we print the full text from the first page.
    # The response contains more information:
    # annotation/pages/blocks/paragraphs/words/symbols
    # including confidence scores and bounding boxes
    # print("Full text:\n")
    # print(annotation["text"])

    outputs = []
    for blob in blob_list:
        response = json.loads(blob.download_as_string())
        outputs.append("\n\n".join([page_response['fullTextAnnotation']['text'] 
                                    for page_response in response['responses'] 
                                    if ('fullTextAnnotation' in page_response)]))

    return "\n\n".join(outputs)

### Apply the Google async_detect_document function

In [5]:
import os
import re
import glob
import random

text_dir = os.path.join(os.getcwd(),'data','STC','raw data','json-from-deficient-pdf')

list_of_texts = [os.path.basename(file)[:-4] for file in glob.glob(os.path.join(text_dir,"*.txt"))]
all_blobs = list_blobs("faa-drs")
all_blobs = [val for val in all_blobs if val[:14]=='pdf-deficient/']

print(str(len([blob for blob in all_blobs if ((blob[:4]=="pdf-deficient/") and (blob[-4:]==".pdf") and (os.path.basename(blob)[:-4] not in list_of_texts))]))+" missing over "+str(len(all_blobs)))

random.shuffle(all_blobs)

for n, blob in enumerate(all_blobs):
    if (blob[:14]=="pdf-deficient/") and (blob[-4:]==".pdf") and (os.path.basename(blob)[:-4] not in list_of_texts):
        try:
            print(blob)
            extracted_text = async_detect_document("gs://faa-drs/"+blob,
                                                    re.sub("faa-drs/pdf-deficient",
                                                            "faa-drs/text-from-pdf-deficient",
                                                            "gs://faa-drs/"+blob[:-4]+'___')
                                                )
            with open(os.path.join(text_dir, os.path.basename(blob[:-4])+".txt"), 'w') as f:
                f.write(extracted_text)
        except:
            print("it failed :(")

    if n>200:
        break

0 missing over 2055
pdf-deficient/SA2058NM__Current__498A9EEBA9E02832862579ED005511ED.pdf
Waiting for the operation to finish.
Output files:
text-from-pdf-deficient/SA2058NM__Current__498A9EEBA9E02832862579ED005511ED___output-1-to-1.json
pdf-deficient/SA1363SW__Current__FB144426E87984B585256CC100820F9E.pdf
Waiting for the operation to finish.
Output files:
text-from-pdf-deficient/SA1363SW__Current__FB144426E87984B585256CC100820F9E___output-1-to-2.json
pdf-deficient/SA1126SW__Current__77C27002B7CE9F3C86257D41004F7601.pdf
Waiting for the operation to finish.
Output files:
text-from-pdf-deficient/SA1126SW__Current__77C27002B7CE9F3C86257D41004F7601___output-1-to-2.json
pdf-deficient/SA09433AC-D__Current__021AC1A22D6FCAB185256CC1007F3AFD.pdf
Waiting for the operation to finish.
Output files:
text-from-pdf-deficient/SA09433AC-D__Current__021AC1A22D6FCAB185256CC1007F3AFD___output-1-to-2.json
text-from-pdf-deficient/SA09433AC-D__Current__021AC1A22D6FCAB185256CC1007F3AFD___output-3-to-3.json
pd

## III. Check results

In [2]:
import pandas as pd
import numpy as np
df_stc = pd.read_parquet(r"C:\Users\victor\Documents\DeepLearning\FAA NLP Project\data\STC\stc_metadata_and_text.parquet")
for content in ['descriptions', 'limitations']:
    df_stc[content+' totalwords'] = df_stc[content].str.split().str.len()

In [11]:
import glob
import subprocess
import os

documentGuid = '261B0B34066C1DE9862586BD005BB581'
stc_path = r'C:\Users\victor\Documents\DeepLearning\FAA NLP Project\data\STC\raw data'

subprocess.Popen(glob.glob(os.path.join(stc_path, 'pdf','*'+documentGuid+'*'))[-1], shell=True)
subprocess.Popen(glob.glob(os.path.join(stc_path, 'pdf-deficient','*'+documentGuid+'*'))[-1], shell=True)

print('=================TEXT FROM ORIGINAL=====================')
print("Description length: %d" % (df_stc[df_stc['documentGuid']==documentGuid]['descriptions totalwords'].iloc[0]))
print("Limications length: %d" % (df_stc[df_stc['documentGuid']==documentGuid]['limitations totalwords'].iloc[0]))

print('=================TEXT FROM ORIGINAL=====================')
das_file = glob.glob(os.path.join(stc_path, 'text-from-pdf-alternate','*'+documentGuid+'*.json'))
if len(das_file)>0:
    das_file = das_file[-1]
    print(das_file)
    with open(das_file,'rb') as f:
        print(f.read())
else:
    print(df_stc[df_stc['documentGuid']==documentGuid]['rawtext'].iloc[0])

print('=================TEXT FROM BLURRED=====================')
das_file = glob.glob(os.path.join(stc_path, 'json-from-deficient-pdf','*'+documentGuid+'*'))[-1]
print(das_file)
with open(das_file,'r') as f:
    print(f.read())

Description length: 81
Limications length: 15
FEDERAL
ADMINIS
AVIATION
United States of America
Department of Transportation
Federal Aviation Administration
Supplemental Type Certificate
his certificate issued to: BHE & Associates, Ltd.
Number: SA11219SC
12002 Warfield St., Suite 250
San Antonio, Texas 78216
Certifies that the change in the type design for the following product with the limitations and conditions
therefore as specified hereon meets the airworthiness requirements of Part 23 of Code of Federal Regulations
Make: Textron Aviation Inc.
Original Product
A1WI
Type Certificate Number:
Model: 525B, 525A
Date of Issuance:
Description of Type Design Change:
Installation of Rockwell Collins Pro Line Fusion Ⓡ embedded display system (EDS) in accordance with
Master Data List 560-00-0001, revision IR dated 04/19/2017, or later FAA approved revision. Airplane
Flight Manual Supplement 560-00-0098 (525B), revision IR dated 04/19/2017 and/or Airplane Flight
Manual Supplement 598-02-0102 