In [None]:
import io
import os
import warnings
from pathlib import Path
import pandas as pd
from tesserocr import OEM,PSM, PyTessBaseAPI
from tqdm.notebook import tqdm

import ocr_utils
import sys
# Add the 'table-transformer-tatr' folder (relative to the working directory) to
# the module search path. This has to run before the `src.inference` import
# below, which presumably resolves from that folder - TODO confirm.
sys.path.append('table-transformer-tatr')

from src.inference import TableExtractionPipeline

# Suppress every warning for the rest of the kernel session.
warnings.filterwarnings('ignore')

In [None]:
# Output configuration: the root output folder and the subfolders created inside it.
# pdf_directory = 'test'
directory = 'test'
subdirectories = [
    'csvs',            # one CSV per extracted table
    'processed_txts',  # text recognised from the preprocessed page images
    'txts',            # text recognised from the page images as-is
]

In [None]:
# Build the table extraction pipeline. It is configured with two models, one for
# table detection and one for table structure; both are set to run on the CPU.
detection_settings = dict(
    det_config_path='src/detection_config.json',
    det_model_path='pubtables1m_detection_detr_r18.pth',
    det_device='cpu',
)
structure_settings = dict(
    str_config_path='src/structure_config.json',
    str_model_path='TATR-v1.1-All-msft.pth',
    str_device='cpu',
)
pipe = TableExtractionPipeline(**detection_settings, **structure_settings)

In [None]:
# Create the output directory with its subfolders.
# exist_ok=True keeps this cell re-runnable: with exist_ok=False the first folder
# that already existed raised OSError, which aborted the loop and left any
# remaining subfolders uncreated. An OSError here now signals a real problem
# (e.g. permissions, or a file occupying the path), so its detail is printed.
try:
    for subdir in subdirectories:
        os.makedirs(f"./{directory}/{subdir}", exist_ok=True)
    print(f"Directory {directory} created successfully with subfolders")
except OSError as err:
    print(f"Directory {directory} can not be created: {err}")

In [None]:
# Get list of all pdf files
# NOTE(review): the search folder is a blank placeholder ("  "); point it at the
# folder that actually holds the input PDFs before running the notebook.
pdf_source_dir = Path("  ")
# glob() yields files in an OS-dependent order; sort so runs are reproducible.
pdf_files = sorted(pdf_source_dir.glob('*.pdf'))
if not pdf_files:
    # Make an empty result visible instead of letting the main loop silently do nothing.
    print(f"No PDF files found in {str(pdf_source_dir)!r}")

In [None]:
# OCR every PDF page twice (preprocessed and as-is), export detected tables to
# CSV, and write the two text variants of each PDF to their output folders.


def _ocr_text(api, image):
    """Recognise `image` with the given Tesseract API handle and return its UTF-8 text."""
    api.SetImage(image)
    api.Recognize()
    return api.GetUTF8Text()


def _export_tables(page_image, filename, page_num):
    """Detect tables on one page image and save each of them as a CSV file.

    Each table is written to
    ``{directory}/csvs/{filename}_page_{page_num}_table_{i}.csv`` where ``i`` is
    the index of the detected crop. A crop that cannot be turned into a
    DataFrame is reported and skipped; the remaining crops are still processed.
    """
    # Convert the image to a list of words
    page_tokens = ocr_utils.ocr_image_to_word_list(page_image)
    # Detect tables in the page
    detection = pipe.detect(page_image, tokens=page_tokens, out_crops=True, out_objects=True)
    # `or []` keeps the original behaviour of doing nothing when no crops are returned
    for i, crop in enumerate(detection['crops'] or []):
        # Convert the table image to a list of words
        table_tokens = ocr_utils.ocr_image_to_word_list(crop['image'])
        # Recognize the table and extract its contents
        extracted_table = pipe.recognize(crop['image'], table_tokens, out_objects=True, out_cells=True, out_csv=True, out_html=True)
        try:
            # Indexing is inside the try as well: an empty 'html' result must only
            # skip this crop, not abort the whole run.
            html_table = io.StringIO(extracted_table['html'][0])
            # Convert the HTML table to a list of pandas DataFrames
            tables = pd.read_html(html_table)
        except Exception as e:
            # `Exception` (not a bare except) so Ctrl-C still interrupts the run,
            # and the reason for the skip is reported instead of being swallowed.
            print(f"Skipping table {i} on page {page_num} due to {e}")
            continue
        # Save the table as a CSV file
        tables[0].to_csv(f'{directory}/csvs/{filename}_page_{page_num}_table_{i}.csv', index=False)


# One Tesseract engine is shared by all pages of all PDFs instead of being
# re-initialised twice per page; each SetImage call supplies the next image.
with PyTessBaseAPI(lang='eng', psm=PSM.AUTO, oem=OEM.DEFAULT) as api:
    # Iterate over all PDF files
    for pdf in tqdm(pdf_files):
        # Per-page text pieces, joined once per PDF instead of repeated string +=
        processed_chunks = []
        auto_chunks = []

        # Extract the filename from the pdf path
        filename = pdf.stem

        # Get all pages from the current PDF file
        PAGES = ocr_utils.get_pages(pdf)

        # Iterate over all pages, numbering them from 1
        for page_num, page in enumerate(tqdm(PAGES), start=1):
            try:
                # Correct the orientation and skew of the page
                PAGE = ocr_utils.orientation_and_deskew(page)
            except Exception as e:
                # Skip the current page if an error occurs and print the error message
                print(f"Skipping {page_num} due to {e}")
                continue

            # Text from the preprocessed page
            processed = ocr_utils.process_text_page(PAGE)
            next_text_processed = _ocr_text(api, processed)
            processed_chunks.append(f"\n\n PAGE: {page_num} \n\n {next_text_processed}")

            # Text from the unprocessed page
            next_text = _ocr_text(api, PAGE)
            auto_chunks.append(f"\n\n PAGE: {page_num} \n\n {next_text}")

            # Tables found on the page are written straight to CSV files
            _export_tables(PAGE, filename, page_num)

        text_auto = ''.join(auto_chunks)
        text_processed = ''.join(processed_chunks)

        # Write the unprocessed text to a file
        with open(f"{directory}/txts/{filename}.txt", 'w', encoding='utf-8') as f:
            f.write(text_auto)

        # Write the processed text to a file
        with open(f"{directory}/processed_txts/{filename}.txt", 'w', encoding='utf-8') as f:
            f.write(text_processed)