In [None]:
import os
import json
from PyPDF2 import PdfReader
from pdf2image import convert_from_path
import pytesseract
import re
import random

In [None]:
def clean_text(string):
    """Return *string* with ASCII control characters removed.

    Every character in the range U+0000-U+001F, plus U+007F (DEL), is
    deleted outright -- this includes tabs, carriage returns and newlines.
    Nothing is substituted in their place.
    """
    control_chars = re.compile(r'[\x00-\x1F\x7F]')
    return control_chars.sub('', string)

In [None]:
def is_garbled(text):
    """Report whether PDF-extracted text looks garbled.

    Text is considered garbled when it contains a ``(cid:<digits>)``
    marker, or when more than 30% of its characters are non-ASCII
    (code point above 127).

    Returns:
        bool: True if either heuristic fires, False otherwise.
    """
    # Any (cid:xxx) marker is enough to flag the text.
    has_cid_marker = re.search(r"\(cid:\d+\)", text) is not None
    if has_cid_marker:
        return True

    # Share of non-ASCII characters; an empty string counts as length 1
    # so the division is always defined.
    non_ascii_count = len([ch for ch in text if ord(ch) > 127])
    length = len(text) or 1
    return non_ascii_count / length > 0.3  # Adjust threshold as needed

In [None]:
def glued_text(text, long_word_threshold=30, glue_ratio_threshold=0.5):
    """
    Tell whether a text is dominated by glued words (missing spaces).

    A whitespace-separated token is counted as suspicious when any of the
    following holds:
      * its length is at least ``long_word_threshold``;
      * it contains a lowercase letter directly followed by an uppercase one;
      * it contains one of ``. , ; : ! ?`` directly followed by a letter.

    Args:
        text (str): The extracted text.
        long_word_threshold (int): Token length from which a token is suspicious.
        glue_ratio_threshold (float): Share of suspicious tokens that must be
            exceeded for the text to be flagged.

    Returns:
        bool: True if the suspicious share is strictly above the threshold.
            Text without any tokens is never flagged.
    """
    tokens = text.split()
    if not tokens:
        # Nothing to measure, so nothing to flag.
        return False

    def looks_glued(token):
        return (
            len(token) >= long_word_threshold
            or re.search(r'[a-z][A-Z]', token) is not None
            or re.search(r'[.,;:!?][A-Za-z]', token) is not None
        )

    flagged = sum(1 for token in tokens if looks_glued(token))
    return flagged / len(tokens) > glue_ratio_threshold

In [None]:
def spaced_text(text, short_word_threshold=1, spacing_ratio_threshold=0.5):
    """
    Tell whether a text looks excessively spaced (e.g. a space between every character).

    Args:
        text (str): The extracted text.
        short_word_threshold (int): Tokens of this length or shorter count as
            suspiciously short.
        spacing_ratio_threshold (float): Share of short tokens that must be
            exceeded for the text to be flagged.

    Returns:
        bool: True if the share of short tokens is strictly above the
            threshold. Text without any tokens is never flagged.
    """
    tokens = text.split()
    if not tokens:
        return False

    # Each comparison is a bool, so the sum is the number of short tokens.
    short_count = sum(len(token) <= short_word_threshold for token in tokens)
    return short_count / len(tokens) > spacing_ratio_threshold

In [None]:
def split_pages(text, min_words=400):
    """Split text into pages, cutting at the first blank line after min_words.

    Lines are accumulated until the running word count reaches
    ``min_words``; the page is then closed at the next blank (or
    whitespace-only) line. Whatever is left after the last cut becomes a
    final page, however short.

    Every page -- including the final one -- is built the same way: its
    lines are joined with single spaces, stripped, and passed through
    ``clean_text``. Pages that end up empty are dropped.

    Args:
        text (str): The full text to split.
        min_words (int): Minimum number of words a page must hold before a
            blank line is allowed to close it.

    Returns:
        list[str]: The pages, in order.
    """
    pages = []

    def emit(lines):
        # Join with spaces so the last word of one line is not fused with
        # the first word of the next.
        page_text = " ".join(lines).strip()
        page_text = clean_text(page_text)
        if page_text:
            pages.append(page_text)

    buffered_lines = []
    word_count = 0

    for line in text.splitlines():
        buffered_lines.append(line)
        word_count += len(line.split())

        # Only a blank line may close a page, and only once it is long enough.
        if word_count >= min_words and not line.strip():
            emit(buffered_lines)
            buffered_lines = []
            word_count = 0

    # Trailing lines that never reached a closing blank line.
    if buffered_lines:
        emit(buffered_lines)

    return pages

def extract_pdf_text(file_path):
    """Extract text from a PDF page by page, stopping at the first unusable page.

    Each page's text is cleaned with ``clean_text``; if ``spaced_text``
    flags it, all spaces are removed. The page is then checked with
    ``glued_text`` and ``is_garbled``; as soon as either check fires,
    extraction stops and the pages collected so far are returned.

    Args:
        file_path (str): Path to the PDF file.

    Returns:
        tuple: ``(text_pages, all_text_empty, garbage, glued)`` where
            ``text_pages`` is a list with one stripped string per processed
            page ("" for pages without text), ``all_text_empty`` is True if
            no processed page contained text, and ``garbage`` / ``glued``
            are the results of the checks on the last page examined.
    """
    text_pages = []
    all_text_empty = True
    garbage = False
    glued = False

    with open(file_path, "rb") as file:
        reader = PdfReader(file)

        for page in reader.pages:
            # Guard against a page yielding no text object at all
            # (NOTE(review): some PyPDF2 versions may return None -- confirm).
            raw_text = page.extract_text() or ""

            # clean_text() deletes control characters outright, so turn line
            # breaks and tabs into spaces first; otherwise the last word of
            # each line is fused with the first word of the next one.
            text = clean_text(re.sub(r"[\t\n\v\f\r]+", " ", raw_text))

            # NOTE(review): removing every space leaves the page as one long
            # token, which glued_text() will usually flag -- confirm intended.
            if spaced_text(text):
                text = text.replace(" ", "")

            glued = glued_text(text)
            garbage = is_garbled(text)
            if garbage or glued:
                break

            text = text.strip()
            if text:
                all_text_empty = False
            # Blank pages are kept as "" placeholders.
            text_pages.append(text)

    return text_pages, all_text_empty, garbage, glued

In [None]:
def get_book_info(filename):
    """Parse ``author_title_year`` out of a file name (without extension).

    The name is split on underscores and the first three fields are
    returned; any further fields are ignored.

    Args:
        filename (str): File name stem, expected as ``author_title_year``.

    Returns:
        tuple: ``(author, title, year)`` as strings, or
            ``("unknown", filename, None)`` when the name has fewer than
            three underscore-separated fields or is not a string.
    """
    try:
        fields = filename.split('_')
        author, title, year = fields[0], fields[1], fields[2]
    except (AttributeError, IndexError):
        # Too few fields, or not a string: fall back to the raw name.
        # Only these errors are caught so that KeyboardInterrupt and
        # SystemExit are no longer swallowed.
        return "unknown", filename, None
    return author, title, year

In [None]:
def create_books_dict(folder_path, output_json_path, min_words=400):
    """
    Process every file in a folder and compile the readable ones into a dictionary.

    Handling per file type:
      - .txt files are read as UTF-8 and split into pages with ``split_pages``
      - .pdf files are read with ``extract_pdf_text``; PDFs whose text is
        garbled, glued or empty are NOT compiled but collected in an OCR pile
      - sub-directories are ignored and other file types are skipped

    The author, title and year are taken from the file name via
    ``get_book_info``. The compiled dictionary is keyed by title:

    {
        "title": {
            "author": "...",
            "year": "...",
            "pages": [...]
        },
        ...
    }

    The dictionary is also saved to ``output_json_path`` as JSON.

    Args:
        folder_path (str): Folder containing the book files.
        output_json_path (str): Where to write the compiled dictionary.
        min_words (int): Minimum page length passed to ``split_pages``
            (used for .txt files only).

    Returns:
        tuple: ``(compiled_books, ocr_pile, error_pile)`` -- the dictionary
            above, the paths of PDFs set aside for OCR, and the paths of
            files whose processing raised an exception.
    """
    compiled_books = {}

    ocr_pile = []
    error_pile = []

    for filename in os.listdir(folder_path):
        file_path = os.path.join(folder_path, filename)

        print()
        print(f"Processing {filename}...")
        try:
            if os.path.isdir(file_path):
                continue

            book_info, ext = os.path.splitext(filename)
            author, title, year = get_book_info(book_info)
            ext = ext.lower()

            if ext == ".txt":
                with open(file_path, 'r', encoding='utf-8') as file:
                    text = file.read()
                pages = split_pages(text, min_words=min_words)

            elif ext == ".pdf":
                pages, empty, garbage, glued = extract_pdf_text(file_path)

                # Test garbled/glued before empty: extraction stops at the
                # first such page, so `empty` can still be set merely because
                # no earlier page held text.
                if garbage:
                    problem = "Garbled"
                elif glued:
                    problem = "Glued"
                elif empty:
                    problem = "Empty"
                else:
                    problem = None

                if problem is not None:
                    print(f"{problem} text detected in {filename}.")
                    ocr_pile.append(file_path)
                    continue

            else:
                print(f"Skipping unsupported file type: {filename}")
                continue

            # Entries are keyed by title, so two files with the same title
            # collide; keep the last one but make the overwrite visible.
            if title in compiled_books:
                print(f"Warning: duplicate title {title!r} from {filename}; overwriting earlier entry.")

            compiled_books[title] = {
                "author": author,
                "year": year,
                "pages": pages
            }

        except Exception as e:
            # Best-effort batch run: record the failure and move on.
            print(f"Error processing {filename}: {e}")
            error_pile.append(file_path)

    # Save to JSON
    with open(output_json_path, 'w', encoding='utf-8') as json_file:
        json.dump(compiled_books, json_file, indent=2, ensure_ascii=False)

    return compiled_books, ocr_pile, error_pile

In [None]:
# Run the full pipeline on the local book folder and write the compiled
# dictionary to test.json.
# NOTE(review): both paths are absolute and machine-specific.
books_dict, ocr_pile, error_pile = create_books_dict(
    folder_path="/mnt/c/Users/jeanl/OneDrive/FUSE/data/livres",
    output_json_path="/home/jeanluca/code/JeanLucaSchindler/FUSE/FUSE_Module0a/Preproc/test.json",
    min_words=400
)

In [84]:
# Sizes of the three results: compiled books, PDFs set aside for OCR,
# and files whose processing raised an error.
len(books_dict), len(ocr_pile), len(error_pile)

(90, 31, 2)

In [None]:
# Estimate how many books would land in each pile by scaling the counts by 13.
# NOTE(review): the factor 13 is not explained here -- presumably the ratio
# between the full collection and the folder processed above; confirm.
len(books_dict)*13, len(ocr_pile)*13, len(error_pile)*13

(1170, 403, 26)

In [87]:
# Hand-set per-book processing costs -- presumably in minutes, judging by the
# names and the division by 60 below; confirm.
OCR_min_per_book = 4
normal_min_per_book = 0.25

# Estimated runtime = cost per book * pile size * 13 (same scaling factor as
# the pile-size estimate), divided by 60 and rounded to whole hours.
print(f'Normal preproc will run for {round(normal_min_per_book*len(books_dict)*13/60)} hours')

print(f'OCR will run for {round(OCR_min_per_book*len(ocr_pile)*13/60)} hours')

Normal preproc will run for 5 hours
OCR will run for 27 hours
