# Combo PDF Extractor

This file's purpose is to extract text from the course syllabi PDFs as accurately as possible into text files for further processing in the project.

Please note that the pytesseract library is only a wrapper: it requires the Tesseract OCR engine to be installed separately. See the link below for installation details:

https://github.com/h/pytesseract


### Input:  UniqueCourses folder
### Output:  TextFiles_Combo folder

In [None]:
import os
import pdfplumber
from pdf2image import convert_from_path
from pytesseract import pytesseract
from difflib import get_close_matches

In [None]:
# Tell pytesseract where the Tesseract executable lives.
# The value below is a Windows-style install path; edit it to match your machine.
# Only needed when pytesseract cannot locate the executable on its own.
pytesseract.tesseract_cmd = r'C:\Program Files\Tesseract-OCR\tesseract.exe'

In [None]:
# Define input/output locations (relative to the current working directory)
pdf_folder = './UniqueCourses'      # Folder containing the source PDFs to extract
output_folder = './TextFiles_Combo' # Folder where the extracted text files are written

In [None]:
# Tokenizer: break a string into whitespace-delimited tokens
def tokenize(text):
    """Return the list of tokens in *text*, split on runs of whitespace.

    Leading/trailing whitespace is ignored, so an empty or blank string
    yields an empty list.
    """
    tokens = text.split()
    return tokens

In [None]:
# Swap an OCR token for its closest counterpart among the reference (pdfplumber) tokens
def correct_token(token, reference_tokens):
    """Return the best close match for *token*, or *token* itself if none exists.

    Args:
        token: The (possibly misspelled) token produced by OCR.
        reference_tokens: List of trusted tokens to match against. When a
            match is found, its first occurrence is removed from this list
            IN PLACE so the same reference token cannot be matched twice.

    Returns:
        The closest reference token whose similarity is at least 0.8,
        otherwise the original *token* unchanged.
    """
    # Ask difflib for the single best candidate; raise/lower cutoff to tune strictness.
    candidates = get_close_matches(token, reference_tokens, n=1, cutoff=0.8)

    if not candidates:
        # No sufficiently similar reference token: keep the OCR token as-is.
        # This retains possible garbage, which is useful when OCR consistently
        # misreads a token as something the similarity measure cannot relate.
        # Alternative policy: return "" here instead to discard unmatched
        # tokens, which is useful when OCR hallucinates a lot of garbage but
        # genuine typos are reliably close to their correct form.
        return token

    best_match = candidates[0]
    reference_tokens.remove(best_match)  # consume it so it is not reused for a later token
    return best_match

In [None]:
# Iterate through all PDF files in the input folder and write one corrected text file per PDF.
#
# For every page, two extractions are combined:
#   * pytesseract (OCR): groups text well but introduces typos and stray tokens
#   * pdfplumber: near-perfect characters, used here as the reference vocabulary
# OCR tokens are kept in OCR order; each one is replaced by the matching
# (or closest) pdfplumber token when one is available.

# Make sure the output folder exists; otherwise open(..., 'w') below raises FileNotFoundError
os.makedirs(output_folder, exist_ok=True)

for filename in os.listdir(pdf_folder):
    # Accept ".pdf" in any letter case (e.g. "Syllabus.PDF")
    if not filename.lower().endswith(".pdf"):
        continue

    pdf_path = os.path.join(pdf_folder, filename)
    text_filename = os.path.splitext(filename)[0] + ".txt"
    text_path = os.path.join(output_folder, text_filename)

    page_texts = []     # One corrected, newline-terminated string per processed page

    with pdfplumber.open(pdf_path) as pdf:
        for page_number, page in enumerate(pdf.pages, start=1):
            # Step 1: Extract pdfplumber text (the reference).
            # Done first so pages without a text layer are skipped before the
            # expensive rasterize + OCR step is run for them.
            pdfp_text = page.extract_text()
            if not pdfp_text:
                # NOTE: such pages are omitted from the output entirely
                print(f"No text found on page {page_number} using pdfplumber.")
                continue
            pdfp_tokens = tokenize(pdfp_text)

            # Step 2: Extract pytesseract text from a rendered image of this single page
            image = convert_from_path(pdf_path, first_page=page_number, last_page=page_number)[0]
            pyt_text = pytesseract.image_to_string(image, lang='eng')
            pyt_tokens = tokenize(pyt_text)

            # Step 3: Sequential token correction
            corrected_tokens = []
            for token in pyt_tokens:
                if token in pdfp_tokens:
                    # Exact match in the reference: the OCR token is trusted as-is.
                    # Consume the reference token so it cannot be matched again.
                    corrected_tokens.append(token)
                    pdfp_tokens.remove(token)
                else:
                    # Likely an OCR typo: use the closest remaining reference token,
                    # or whatever correct_token falls back to when none is close enough.
                    corrected_tokens.append(correct_token(token, pdfp_tokens))

            # Step 4: Reconstruct the corrected text for this page
            page_texts.append(" ".join(corrected_tokens) + "\n")

    # Write the combined text of all processed pages to the output folder
    with open(text_path, 'w', encoding='utf-8') as text_file:
        text_file.write("".join(page_texts))

    print(f"Processed and saved: {text_filename}")