#### Convert PDFs to text files

* This is to convert the documents from the Journal of Extension to text

In [25]:
from textractor import Textractor
from textractor.visualizers.entitylist import EntityList
from textractor.data.constants import TextractFeatures
import time
from datetime import timedelta
from bs4 import BeautifulSoup
import re
import os
import sys
import boto3
from glob import glob
sys.path.append("..")
import utils as ut
import chardet

In [31]:
def get_encoding(file_path):
    """Guess the character encoding of the file at *file_path*.

    The whole file is read as raw bytes and handed to ``chardet.detect``;
    the ``'encoding'`` entry of the detection result is returned as-is.
    """
    with open(file_path, 'rb') as handle:
        raw_bytes = handle.read()

    detection = chardet.detect(raw_bytes)
    return detection['encoding']
    

def _has_skipped_ancestor(tag, skip_tags):
    """Return True if any ancestor of *tag* has a class or id listed in *skip_tags*."""
    for parent in tag.parents:
        if parent.has_attr('class') and any(skip_tag in parent['class'] for skip_tag in skip_tags):
            return True
        if parent.has_attr('id') and parent['id'] in skip_tags:
            return True
    return False


def html_to_text_file_joe(html_file, out_dir):
    """Convert a Journal of Extension HTML file to a one-sentence-per-line text file.

    The output is written to ``out_dir`` under the input's base name with its
    extension replaced by ``.txt`` (so both ``.html`` and ``.htm`` inputs map
    to ``<name>.txt``). If that file already exists, nothing is read or
    written and the function returns immediately.

    Only ``<p>`` elements are exported. A paragraph is dropped when any of its
    ancestors carries a class or id listed in the skip list, or when the
    paragraph itself contains an ``<a>`` element.

    Args:
        html_file: Path of the HTML file to convert.
        out_dir: Directory in which the ``.txt`` file is created.

    Returns:
        None.
    """
    # Swap only the real extension; a plain str.replace("html", "txt") would
    # miss ".htm" files and would also rewrite "html" inside the file name.
    stem = os.path.splitext(os.path.basename(html_file))[0]
    output_file = os.path.join(out_dir, f"{stem}.txt")

    if os.path.exists(output_file):
        print(f"\nhtml_to_text_file_joe(): {output_file} exists already. Skipping\n")
        return

    # get_encoding() returns chardet's guess, which can be None when detection
    # fails; fall back to UTF-8 instead of the platform default encoding.
    encoding = get_encoding(html_file) or 'utf-8'

    # Read the HTML file
    with open(html_file, 'r', encoding=encoding) as file:
        html_content = file.read()

    # Parse the HTML content using BeautifulSoup
    soup = BeautifulSoup(html_content, 'html.parser')

    # Class names / ids of containers whose paragraphs are not exported
    skip_tags = ['DIV-author', 'DIV-copyright-footer', 'footer', 'footer-container']

    # Split paragraphs into sentences and write each sentence on its own line
    with open(output_file, 'w', encoding='utf-8') as file:
        for p in soup.find_all('p'):
            if _has_skipped_ancestor(p, skip_tags):
                continue

            # Paragraphs containing links (<a> tags) are skipped entirely
            if p.find('a'):
                continue

            # Collapse all whitespace runs (including newlines) to single spaces
            paragraph_text = ' '.join(p.get_text().split())
            # Break after '.', '!' or '?' followed by whitespace
            sentences = re.split(r'(?<=[.!?])\s+', paragraph_text)
            for sentence in sentences:
                file.write(sentence + '\n')

In [30]:
def pdf_to_text_file_joe(pdf_file_path, extractor_obj, s3_bucket_name, out_dir):
    """Convert a Journal of Extension PDF to a one-sentence-per-line text file.

    The output is written to ``out_dir`` under the input's base name with its
    extension replaced by ``.txt`` (so ``.pdf`` and ``.PDF`` inputs both map
    to ``<name>.txt``). If that file already exists the function returns
    without extracting anything.

    Args:
        pdf_file_path: Path of the PDF file to convert.
        extractor_obj: Passed through to ``ut.pdf2text_aws_textract``.
        s3_bucket_name: Passed through to ``ut.pdf2text_aws_textract``.
        out_dir: Directory in which the ``.txt`` file is created; also passed
            through to ``ut.pdf2text_aws_textract``.

    Returns:
        None.
    """
    # Swap only the real extension; a plain str.replace("pdf", "txt") would
    # miss ".PDF" files and would also rewrite "pdf" inside the file name.
    stem = os.path.splitext(os.path.basename(pdf_file_path))[0]
    output_file = os.path.join(out_dir, f"{stem}.txt")

    # Check for an existing result BEFORE extracting, so already-converted
    # files do not trigger another extraction call.
    # NOTE(review): if ut.pdf2text_aws_textract has side effects that are
    # wanted even for already-converted files, confirm this ordering.
    if os.path.exists(output_file):
        print(f"\npdf_to_text_file_joe(): {output_file} exists already. Skipping\n")
        return

    got_text = ut.pdf2text_aws_textract(pdf_file_path, extractor_obj, s3_bucket_name, out_dir)

    # Collapse all whitespace runs (including newlines) to single spaces
    paragraph_text = ' '.join(got_text.split())
    # Break after '.', '!' or '?' followed by whitespace
    sentences = re.split(r'(?<=[.!?])\s+', paragraph_text)

    # Open the output only once the text is ready, so a failure above does not
    # leave behind an empty file that later runs would treat as "converted".
    with open(output_file, 'w', encoding='utf-8') as text_file:
        for sentence in sentences:
            text_file.write(sentence + '\n')

In [19]:
# Where converted text files go, and the S3 bucket handed to the PDF converter.
out_dir = r"./for_mlm_ae_corpus/joe_converted"
s3_bucket_name = "ae-corpora-bucket"

# Every AWS handle below is created for the same region.
aws_region = 'us-east-1'
textract_client = boto3.client('textract', region_name=aws_region)
s3_client = boto3.client('s3', region_name=aws_region)
extractor_obj = Textractor(region_name=aws_region)

In [None]:
# Input directory with the raw Journal of Extension files and the directory
# that receives the converted .txt files.
joe_raw_fpath = r"./for_mlm_ae_corpus/joe_raw"
joe_converted_outdir = r"./for_mlm_ae_corpus/joe_converted"

joe_raw_files_list = glob(f"{joe_raw_fpath}/*")

start_time = time.monotonic()
for file in joe_raw_files_list:
    # Extension without the leading dot ("" when the file has none)
    fext = os.path.splitext(file)[1][1:]
    # Write into the same directory that is counted below, so the
    # "Converted" figure always reflects this run's output location.
    if fext in ["pdf", "PDF"]:
        pdf_to_text_file_joe(file, extractor_obj, s3_bucket_name, joe_converted_outdir)
    elif fext in ["html", "htm"]:
        html_to_text_file_joe(file, joe_converted_outdir)
    else:
        # Report the actual extension and the file, not the last 3 characters
        print(f"File extension not recognized: {fext!r} ({file}). Skipping")

duration = timedelta(seconds=time.monotonic() - start_time)
print(f"Conversion took: {duration}")

joe_conv_files_list = glob(f"{joe_converted_outdir}/*")

print(f"To convert: {len(joe_raw_files_list)}")
print(f"Converted : {len(joe_conv_files_list)}")
