#### Convert PDFs to text files

* Requirement: an AWS account and an Amazon Textract license
* This is to convert the documents related to land-grant universities

Useful links:
* https://github.com/aws-samples/amazon-textract-textractor
* https://aws-samples.github.io/amazon-textract-textractor/notebooks/simple_ocr.html
* https://aws-samples.github.io/amazon-textract-textractor/notebooks/layout_analysis_for_text_linearization.html

In [25]:
import boto3
from glob import glob
import os
import utils as ut
import time
from datetime import timedelta

In [2]:
# Root folder of the raw input; each direct child is treated as one state's folder.
states_main_dir = "./for_mlm_ae_corpus/ext_univs_raw/pdfs"
# Default output folder for the cleaned text files written by the conversion functions.
CLEAN_OUT_DIR = "./for_mlm_ae_corpus/ext_univs_converted"

#### 1. get some stats

In [167]:
# Walk every per-state folder and record which files go through which conversion path.
pdfs_dirpaths = []    # folders that contain at least one PDF
records_pdf = dict()  # state name -> list of *.pdf paths in that state's folder
records_txt = dict()  # state name -> list of *.txt paths (only for folders without PDFs)
# The loop variable is named `state_dir`, not `dir`: a top-level `dir` stays bound
# after the loop and shadows the builtin `dir()` for every later cell.
for i, state_dir in enumerate(glob(f"{states_main_dir}/*")):
    # The last path component is the state name (e.g. "New_Mexico").
    state = state_dir.split(os.path.sep)[-1]
    # NOTE(review): presumably returns (number of PDFs, total page count) -- confirm in utils.
    npdfs, total_pdfpages = ut.get_total_pdfpages_in_folder(state_dir)
    if npdfs > 0:
        pdfs_dirpaths.append(state_dir)
        records_pdf[state] = glob(f"{state_dir}/*.pdf")
    else:
        # Folders without PDFs are only scanned for plain-text files.
        records_txt[state] = glob(f"{state_dir}/*.txt")
    print(f"{i:2d}, {state:<18}, {npdfs = : < 10}, {total_pdfpages = }")

 0, Massachusetts     , npdfs =  0        , total_pdfpages = 0
 1, New_Hampshire     , npdfs =  0        , total_pdfpages = 0
 2, Pennsylvania      , npdfs =  0        , total_pdfpages = 0
 3, Maine             , npdfs =  0        , total_pdfpages = 0
 4, Nebraska          , npdfs =  0        , total_pdfpages = 0
 5, Maryland          , npdfs =  3        , total_pdfpages = 16
 6, North_Dakota      , npdfs =  5        , total_pdfpages = 52
 7, Wisconsin         , npdfs =  34       , total_pdfpages = 578
 8, South_Carolina    , npdfs =  0        , total_pdfpages = 0
 9, Tennessee         , npdfs =  31       , total_pdfpages = 378
10, New_Mexico        , npdfs =  1        , total_pdfpages = 22
11, Colorado          , npdfs =  23       , total_pdfpages = 129
12, Arkansas          , npdfs =  70       , total_pdfpages = 496
13, Mississipi        , npdfs =  9        , total_pdfpages = 26
14, Texas             , npdfs =  32       , total_pdfpages = 294
15, Virginia          , npdfs =  4       



30, Utah              , npdfs =  23       , total_pdfpages = 264
31, Arizona           , npdfs =  38       , total_pdfpages = 582
32, Illinois          , npdfs =  2        , total_pdfpages = 56
33, Montana           , npdfs =  3        , total_pdfpages = 49
34, Washington        , npdfs =  8        , total_pdfpages = 93
35, Missouri          , npdfs =  1        , total_pdfpages = 330
36, Nevada            , npdfs =  1        , total_pdfpages = 15
37, South_Dakota      , npdfs =  56       , total_pdfpages = 440
38, Ohio              , npdfs =  0        , total_pdfpages = 0
39, Delaware          , npdfs =  0        , total_pdfpages = 0


In [None]:
import re

def clean_text(input_text):
    """Return a cleaned copy of *input_text* with boilerplate lines removed.

    Processing is line based:

    * every line is stripped and blank lines are dropped;
    * lines starting with 'Author', 'Authors' or 'The authors' are skipped;
    * the first line containing a stop keyword (case-sensitive substring
      match) ends processing -- that line and everything after it is dropped;
    * lines containing a URL-like token, or starting with "www", are dropped;
    * every "- " occurrence is removed from the kept lines.
      NOTE(review): presumably meant to re-join words hyphenated across a
      line wrap; it also removes "- " used as a bullet or dash -- confirm.

    Args:
        input_text: Raw text, possibly spanning many lines.

    Returns:
        The kept lines joined with a newline; '' when nothing is kept.
    """
    stop_keywords = ('Reviewers', 'Acknowledgements', 'Reference', 'References', 'Sources')
    author_prefixes = ('Author', 'Authors', 'The authors')
    url_re = re.compile(r'\b(?:http|https|ftp|www)\S*|\S+\.\S+\/\S*')
    leading_www_re = re.compile(r'^www\b')

    cleaned_lines = []
    for raw_line in input_text.splitlines():
        line = raw_line.strip()
        if not line:
            continue
        # Author lines are skipped before the stop-keyword test, so they can
        # never trigger the cut-off themselves.
        if line.startswith(author_prefixes):
            continue
        # Nothing after a stop keyword is kept, so there is no reason to keep scanning.
        if any(keyword in line for keyword in stop_keywords):
            break
        # The previous condition was `not url or www`, which parses as
        # `(not url) or www` and therefore *kept* lines starting with "www".
        # Both kinds of line are dropped here.
        if url_re.search(line) or leading_www_re.search(line):
            continue
        cleaned_lines.append(line.replace("- ", ""))

    return '\n'.join(cleaned_lines)

#### 2. AWS Config

In [None]:
from textractor import Textractor
from textractor.visualizers.entitylist import EntityList
from textractor.data.constants import TextractFeatures

In [174]:
# Region shared by every AWS client created in this cell.
aws_region = 'us-east-1'
# boto3 S3 client; passed to create_s3_bucket to create the upload bucket.
s3_client = boto3.client('s3', region_name=aws_region)
# boto3 Textract client (not referenced by the other cells visible here).
textract_client = boto3.client('textract', region_name=aws_region)

# Textractor instance handed to the PDF conversion functions as `extractor_obj`.
extractor_obj = Textractor(region_name=aws_region)

Create an S3 bucket and upload pdfs to it

In [48]:
def create_s3_bucket(s3, bucket_name, aws_region):
    """Create the S3 bucket *bucket_name* in *aws_region* and report the outcome.

    Best-effort helper: any failure is printed rather than raised, so a
    notebook run continues when the bucket already exists.

    Args:
        s3: Object exposing a boto3-style ``create_bucket(**kwargs)`` method.
        bucket_name: Name of the bucket to create.
        aws_region: Target region. A falsy value or 'us-east-1' sends no
            location constraint.

    Returns:
        None. The result is reported on stdout.
    """
    request = {"Bucket": bucket_name}
    # S3 rejects 'us-east-1' as an explicit LocationConstraint, while every
    # other region must be named explicitly. Previously the region was never
    # sent, so the success message could report a region the bucket was not
    # created in.
    if aws_region and aws_region != "us-east-1":
        request["CreateBucketConfiguration"] = {"LocationConstraint": aws_region}

    try:
        s3.create_bucket(**request)
        print(f"Bucket '{bucket_name}' created successfully in region '{aws_region}'.")
    except Exception as e:
        print(f"Error creating bucket '{bucket_name}': {str(e)}")

In [44]:
# Bucket name later passed to the conversion functions as the S3 upload target.
new_bucket_name = "ae-corpora-bucket"
create_s3_bucket(s3_client, new_bucket_name, aws_region)

Bucket 'ae-corpora-bucket' created successfully in region 'us-east-1'.


In [163]:
from textractor import Textractor
from textractor.visualizers.entitylist import EntityList
from textractor.data.constants import TextractFeatures
from textractor.data.text_linearization_config import TextLinearizationConfig

def pdf2text_aws_textract(pdf_file_path, extractor_obj, s3_bucket_name, out_dir=CLEAN_OUT_DIR):
    """Convert one PDF into a cleaned text file via Textract layout analysis.

    The PDF is analysed with the LAYOUT feature (uploaded under
    ``s3://<s3_bucket_name>/``), linearized with figure, header, footer and
    page-number layouts hidden and tables not linearized, passed through
    ``clean_text`` and written to ``<out_dir>/<state>_<name>.txt``. ``<state>``
    is the name of the folder containing the PDF.

    Args:
        pdf_file_path: Path to the PDF; its parent folder name is used as the state.
        extractor_obj: Object exposing ``start_document_analysis`` (a Textractor).
        s3_bucket_name: Bucket used as the upload location for the analysis.
        out_dir: Folder for the text file; created if missing.

    Returns:
        The path of the written text file, or None when the input is not a
        PDF or when any step fails (the error is printed, not raised).
    """
    # Case-insensitive test on the real extension.
    if pdf_file_path is None or os.path.splitext(pdf_file_path)[1].lower() != ".pdf":
        print(f"Not a pdf file: {pdf_file_path}")
        return

    try:
        document = extractor_obj.start_document_analysis(
            file_source=pdf_file_path,
            s3_upload_path=f"s3://{s3_bucket_name}/",
            features=[TextractFeatures.LAYOUT],
        )

        config_postprocess = TextLinearizationConfig(
            hide_figure_layout=True,
            hide_header_layout=True,
            hide_footer_layout=True,
            hide_page_num_layout=True,
            linearize_table=False,
        )

        cleaned_text = document.get_text(config=config_postprocess)

        state = os.path.basename(os.path.dirname(pdf_file_path))
        # Swap only the extension. The previous `.replace("pdf", "txt")`
        # rewrote every "pdf" in the name and left ".PDF" files unchanged.
        stem = os.path.splitext(os.path.basename(pdf_file_path))[0]
        txt_outpath = os.path.join(out_dir, f"{state}_{stem}.txt")

        # A missing output folder should not cost a finished Textract analysis.
        if out_dir:
            os.makedirs(out_dir, exist_ok=True)
        with open(txt_outpath, 'w', encoding='utf-8') as text_file:
            text_file.write(clean_text(cleaned_text))

        print(f"Successfully created: {txt_outpath}")
        return txt_outpath

    except Exception as e:
        # Deliberately broad: one failing document must not abort a batch run.
        print(f"An error occurred: {str(e)}")


def pdf2text_aws_textract_in_batch(state, extractor_obj, s3_bucket_name, records, out_dir=CLEAN_OUT_DIR):
    """Convert every PDF recorded for *state* with ``pdf2text_aws_textract``.

    Args:
        state: Key into *records*.
        extractor_obj: Extractor forwarded to ``pdf2text_aws_textract``.
        s3_bucket_name: Bucket name forwarded to ``pdf2text_aws_textract``.
        records: Mapping of state name to a list of PDF paths.
        out_dir: Output folder forwarded to ``pdf2text_aws_textract``.

    Returns:
        None.
    """
    # An unknown state (or an explicit None entry) is treated as "no documents"
    # instead of failing on iteration over None.
    pdfs_list_state = records.get(state) or []
    for pdf_file_path in pdfs_list_state:
        # Forward out_dir; it was previously accepted but silently ignored.
        pdf2text_aws_textract(pdf_file_path, extractor_obj, s3_bucket_name, out_dir=out_dir)
    

def simple_txt2txt(txt_file_path, out_dir=CLEAN_OUT_DIR):
    """Clean one existing text file and write the result into *out_dir*.

    The output is named ``<state>_<file name>``, where ``<state>`` is the
    name of the folder containing the input file.

    Args:
        txt_file_path: Path to a ``.txt`` file.
        out_dir: Folder that receives the cleaned copy.

    Returns:
        The path of the written file, or None when the input is not a
        ``.txt`` file.
    """
    if os.path.splitext(txt_file_path)[1].lower() != ".txt":
        print(f"Not a txt file: {txt_file_path}")
        return

    fname = os.path.basename(txt_file_path)
    state = os.path.basename(os.path.dirname(txt_file_path))
    txt_outpath = os.path.join(out_dir, f"{state}_{fname}")

    # Read and clean before opening the output, so a failure here does not
    # leave an empty or truncated output file. The input is read as UTF-8 to
    # match the encoding used for writing, not the platform default.
    with open(txt_file_path, "r", encoding='utf-8') as fr:
        cleaned = clean_text(fr.read())

    with open(txt_outpath, 'w', encoding='utf-8') as text_file:
        text_file.write(cleaned)

    return txt_outpath

Begin extraction

In [168]:
# Order the states by number of PDFs, fewest first, so the small batches run first.
sorted_records_pdf = dict(sorted(records_pdf.items(), key=lambda item: len(item[1])))

In [172]:
# Convert each state's PDFs in turn and report the wall-clock time per state.
for state, pdfs_list_state in sorted_records_pdf.items(): 
    print(f"Will begin conversion of {len(pdfs_list_state):3} documents for: {state:<18}")
    # Monotonic clock: the measured duration is unaffected by system clock changes.
    start_time = time.monotonic()
    # The batch helper looks the state's file list up again in records_pdf.
    pdf2text_aws_textract_in_batch(state, extractor_obj, new_bucket_name, records_pdf, out_dir=CLEAN_OUT_DIR)
    duration = timedelta(seconds=time.monotonic() - start_time)
    print(f"Conversion took: {duration}")
    print("----------------------------------------------------------------\n")

Will begin conversion of   1 documents for: New_Mexico        
Will begin conversion of   1 documents for: Minnesota         
Will begin conversion of   1 documents for: Missouri          
Will begin conversion of   1 documents for: Nevada            
Will begin conversion of   2 documents for: Michigan          
Will begin conversion of   2 documents for: Illinois          
Will begin conversion of   3 documents for: Maryland          
Will begin conversion of   3 documents for: Montana           
Will begin conversion of   4 documents for: Virginia          
Will begin conversion of   5 documents for: North_Dakota      
Will begin conversion of   5 documents for: Wyoming           
Will begin conversion of   8 documents for: New_Jersey        
Will begin conversion of   8 documents for: Washington        
Will begin conversion of   9 documents for: Mississipi        
Will begin conversion of  13 documents for: North_Carolina    
Will begin conversion of  18 documents for: Oregon     