In [33]:
import os
import json
from PyPDF2 import PdfReader
from pdf2image import convert_from_path
import pytesseract
import re
import random

In [34]:
def clean_text(string):
    cleaned_s = re.sub(r'[\x00-\x1F\x7F]', '', string)
    return cleaned_s

In [35]:
def is_garbled(text):
    """Detects garbled text output from PDFs."""
    # Check for (cid:xxx) patterns
    if re.search(r"\(cid:\d+\)", text):
        return True

    # Check for excessive non-ASCII characters
    non_ascii_ratio = sum(1 for char in text if ord(char) > 127) / max(len(text), 1)
    if non_ascii_ratio > 0.3:  # Adjust threshold as needed
        return True

    return False

In [36]:
def glued_text(text, long_word_threshold=30, glue_ratio_threshold=0.5):
    """
    Detects if a text has excessive glued words (missing spaces).

    Args:
        text (str): The extracted text.
        long_word_threshold (int): Word length above which words are considered suspicious.
        glue_ratio_threshold (float): Threshold ratio of glued words for flagging text.

    Returns:
        bool: True if text likely contains glued words, False otherwise.
    """

    words = text.split()
    total_words = len(words)

    if total_words == 0:
        return False  # Empty text is not glued

    # Count long words
    long_words = [w for w in words if len(w) >= long_word_threshold]

    # Count capitalized words glued together (e.g., "TokyoImperialUniversity")
    glued_caps_words = [w for w in words if re.search(r'[a-z][A-Z]', w)]

    # Calculate glue ratio
    glue_ratio = len(long_words) / total_words
    caps_glue_ratio = len(glued_caps_words) / total_words

    # If too many long words or glued capital words, flag as glued text
    if glue_ratio > glue_ratio_threshold or caps_glue_ratio > glue_ratio_threshold:
        return True

    return False

In [37]:
def split_pages(text, min_words=400):
    """Splits text into pages at paragraph breaks if they exceed min_words."""
    pages = []
    current_page_lines = []
    current_word_count = 0
    threshold_reached = False

    for line in text.splitlines():
        current_page_lines.append(line)
        current_word_count += len(line.split())

        if current_word_count >= min_words:
            threshold_reached = True

        if threshold_reached and not line.strip():
            page_text = " ".join(current_page_lines).strip()
            page_text = clean_text(page_text)
            if page_text:
                pages.append(page_text)
            current_page_lines = []
            current_word_count = 0
            threshold_reached = False

    if current_page_lines:
        page_text = "".join(current_page_lines).strip()
        if page_text:
            pages.append(page_text)

    return pages

def extract_pdf_text(file_path):
    """Extracts text from a PDF page-by-page."""

    text_pages = []

    with open(file_path, "rb") as file:
        reader = PdfReader(file)
        all_text_empty = True
        garbage = False
        glued = False

        for page in reader.pages:
            text = page.extract_text()
            text = clean_text(text)
            glued =  glued_text(text)
            garbage = is_garbled(text)
            if garbage or glued:
                break
            if text and text.strip():
                all_text_empty = False
                text_pages.append(text.strip())
            else:
                text_pages.append("")  # placeholder for now

    return text_pages, all_text_empty, garbage, glued

In [38]:
def get_book_info(filename):
    try:
        filename_list = filename.split('_')
        author = filename_list[0]
        title = filename_list[1]
        year = filename_list[2]

        return author, title, year
    except:
        return "unknown", filename, None

In [None]:
def create_books_dict(folder_path, output_json_path, min_words=400):
    """
    Processes a folder of files and splits them into pages, handling:
      - .txt files with paragraph splitting
      - .pdf files (detecting digital text or applying OCR for scanned/printed PDFs)
      - Skips unsupported files

    Outputs a dictionary:
    {
        "filename": {
            "author": "default_author",
            "pages": [...]
        },
        ...
    }

    Also saves this dictionary to a JSON file.
    """
    compiled_books = {}

    ocr_pile = []

    for filename in os.listdir(folder_path):
        file_path = os.path.join(folder_path, filename)

        print()
        print(f"Processing {filename}...")

        if os.path.isdir(file_path):
            continue

        book_info, ext = os.path.splitext(filename)
        author, title, year = get_book_info(book_info)

        if ext.lower() == ".txt":
            with open(file_path, 'r', encoding='utf-8') as file:
                text = file.read()
            pages = split_pages(text, min_words=min_words)

            compiled_books[title] = {
                "author": author,
                "year": year,
                "pages": pages
            }

        elif ext.lower() == ".pdf":
            pages, empty, garbage, glued = extract_pdf_text(file_path)
            if empty:
                print(f"Empty text detected in {filename}.")
                ocr_pile.append(file_path)
            elif garbage:
                print(f"Garbled text detected in {filename}.")
                ocr_pile.append(file_path)
            elif glued:
                print(f"Glued text detected in {filename}.")
                ocr_pile.append(file_path)
            else:
                compiled_books[title] = {
                    "author": author,
                    "year": year,
                    "pages": pages
                }

        else:
            print(f"Skipping unsupported file type: {filename}")

    # Save to JSON
    with open(output_json_path, 'w', encoding='utf-8') as json_file:
        json.dump(compiled_books, json_file, indent=2, ensure_ascii=False)

    return compiled_books, ocr_pile

In [40]:
books_dict, ocr_pile = create_books_dict(
    folder_path="/home/jeanluca/code/JeanLucaSchindler/FUSE/FUSE_Module0a/Preproc/data",
    output_json_path="/home/jeanluca/code/JeanLucaSchindler/FUSE/FUSE_Module0a/Preproc/test.json",
    min_words=400
)


Processing Claude Levi-Strauss_Tristes_Tropiques_1955.pdf...

Processing Pierre Bourdieu_Distinction, A Social Critique of the Judgement of Taste_1979 (1).pdf...
Garbled text detected in Pierre Bourdieu_Distinction, A Social Critique of the Judgement of Taste_1979 (1).pdf.

Processing Marianne Moore_Complete Poems_1967.pdf...

Processing Xenophon_Anabasis_370 BC.txt...

Processing Mary Wollstonecraft_A Vindication of the Rights of Women_1792.pdf...

Processing Accenture CL.pdf...
Empty text detected in Accenture CL.pdf.

Processing Henrik Ibsen_Hedda Gabler_1890.txt...

Processing Yukio Mishima_The Sailor Who Fell From Grace with the Sea_1963.pdf...
Glued text detected in Yukio Mishima_The Sailor Who Fell From Grace with the Sea_1963.pdf.


In [None]:
from pdf2image import convert_from_path
import pytesseract
import concurrent.futures

def ocr_page(image, lang="eng"):
    """Performs OCR on a single image page."""
    return pytesseract.image_to_string(image, lang=lang, config="--psm 6")  # PSM 6 is optimized for dense text

def fast_ocr_pdf_to_text_list(pdf_path, dpi=200, lang="eng", workers=4):
    """
    Efficiently reads a scanned PDF using OCR and returns a list where each element is a string representing a PDF page.

    Args:
        pdf_path (str): Path to the PDF file.
        dpi (int): Lower DPI speeds up processing but may reduce accuracy.
        lang (str): OCR language (default: "eng" for English).
        workers (int): Number of parallel threads for processing.

    Returns:
        List[str]: A list where each element is the text of a corresponding PDF page.
    """
    # Convert PDF pages to images
    images = convert_from_path(pdf_path, dpi=dpi)

    # Run OCR in parallel using ThreadPoolExecutor
    with concurrent.futures.ThreadPoolExecutor(max_workers=workers) as executor:
        text_per_page = list(executor.map(lambda img: ocr_page(img, lang), images))

    return text_per_page

res = fast_ocr_pdf_to_text_list(ocr_pile[2])
# ran for 4min 20s

MISHIMA. (6)e Sailor Who Fell from Grace with the SeaNow a disturbing and erotic film starring Sarah Miles and Kris Kristofferson. aeye vs < 7/ Seayh vy ‘ -— . aee My s a————," - 3


In [50]:
print(clean_text(res[6]))

Summerimitation damask of the couch gave the room an air ofagitation. She must have noticed a ladder on her way outand changed in a hurry.Only dazzling sky and a few fragments of cloud, hardand glossy as enamel in the light bouncing off the water,could be seen through the window.Noboru couldn’t believe he was looking at his mother’sbedroom; it might have belonged to a stranger. But therewas no doubt that a woman lived there: femininitytrembled in every corner, a faint scent lingered in theair.Then a strange idea assailed him. Did the peephole justhappen to be here, an accident? Or — after the war —when the soldiers’ families had been living together in thehouse . . . He had a sudden feeling that another body,larger than his, a blond, hairy body, had once huddled inthis dusty space in the wall. The thought soured the closeair and he was sickened. Wriggling backwards out of thechest, he ran to the next room. He would never forget thequeer sensation he had when, flinging open the door, he