In [1]:
import os
import json
from PyPDF2 import PdfReader
import fitz
import pdfplumber
from pdf2image import convert_from_path
import re

In [2]:
def clean_text(string):
    """Return ``string`` with each ASCII control character replaced by a space.

    The characters affected are 0x00-0x1F and 0x7F (this includes tabs and
    newlines). Each one is replaced individually, so runs are not collapsed.
    """
    control_chars = r'[\x00-\x1F\x7F]'
    return re.sub(control_chars, ' ', string)

In [3]:
def is_garbled(text):
    """Report whether extracted PDF text looks garbled.

    Text counts as garbled when it contains a literal ``(cid:<digits>)``
    marker, or when more than 30% of its characters have a code point
    above 127.
    """
    has_cid_marker = re.search(r"\(cid:\d+\)", text) is not None

    # Denominator is clamped to 1 so an empty string yields a ratio of 0.
    high_chars = len([ch for ch in text if ord(ch) > 127])
    mostly_non_ascii = high_chars / max(len(text), 1) > 0.3  # Adjust threshold as needed

    return has_cid_marker or mostly_non_ascii

In [4]:
def glued_text(text, long_word_threshold=9, glue_ratio_threshold=0.5):
    """
    Decide whether a text is dominated by "glued" words (words missing spaces).

    A whitespace-separated token is treated as suspicious when any of these hold:
      * its length is at least ``long_word_threshold``;
      * it contains a lowercase letter directly followed by an uppercase one;
      * it contains one of ``. , ; : ! ?`` directly followed by an ASCII letter.

    Args:
        text (str): The extracted text.
        long_word_threshold (int): Minimum token length that is considered suspicious.
        glue_ratio_threshold (float): Fraction of suspicious tokens that must be
            exceeded (strictly) for the text to be flagged.

    Returns:
        bool: True if the suspicious fraction is above the threshold, False
        otherwise (including when the text has no tokens at all).
    """

    def looks_glued(token):
        return (
            len(token) >= long_word_threshold
            or re.search(r'[a-z][A-Z]', token) is not None
            or re.search(r'[.,;:!?][A-Za-z]', token) is not None
        )

    tokens = text.split()
    if not tokens:
        # Nothing to measure, so nothing to flag.
        return False

    flagged = sum(1 for token in tokens if looks_glued(token))
    return flagged / len(tokens) > glue_ratio_threshold

In [5]:
def is_spaced_out(line: str, min_ratio=0.5) -> bool:
    """
    Tell whether ``line`` looks like letter-spaced text, e.g. 'T a b l e  o f  C o n t e n t s'.

    :param line: The text line to examine
    :param min_ratio: Minimum fraction of whitespace-separated tokens that must be
                      exactly one character long for the line to count as spaced-out.
    :return: True when the fraction is at least ``min_ratio``; False otherwise,
             including when the line has no tokens.
    """
    pieces = line.split()
    if len(pieces) == 0:
        return False

    one_char_pieces = [piece for piece in pieces if len(piece) == 1]
    return len(one_char_pieces) / len(pieces) >= min_ratio

def fix_spaced_line(line: str) -> str:
    """
    Collapse letter-spaced text such as 'T a b l e' back into 'Table'.

    Runs of two or more whitespace characters are treated as word boundaries.
    Inside each resulting segment, consecutive one-character tokens are glued
    into a single word while longer tokens are kept unchanged.

    Example:
      "T a b l e  o f  C o n t e n t s  P E N G U I N  C L A S S I C S"
      becomes "Table of Contents PENGUIN CLASSICS"
    """
    rebuilt_segments = []

    # Each chunk is the text between two wide gaps, e.g. "T a b l e" or "o f".
    for chunk in re.split(r"\s{2,}", line.strip()):
        words = []
        pending_letters = ""

        for piece in chunk.split():
            if len(piece) == 1:
                # Keep collecting single characters until a longer token shows up.
                pending_letters += piece
                continue
            if pending_letters:
                words.append(pending_letters)
                pending_letters = ""
            words.append(piece)

        # A chunk may end while letters are still pending.
        if pending_letters:
            words.append(pending_letters)

        # A chunk made only of single characters yields exactly one word here.
        rebuilt_segments.append(" ".join(words))

    return " ".join(rebuilt_segments)


In [6]:
def split_pages(text, min_words=400):
    """Split text into pages of at least ``min_words`` words, breaking at blank lines.

    Lines are accumulated until the running word count reaches ``min_words``;
    the page is then closed at the next blank (whitespace-only) line. Whatever
    is left after the last break becomes a final, possibly shorter, page.

    Every page, including the final one, is built the same way: its lines are
    joined with single spaces, passed through ``clean_text`` and stripped.
    Pages that end up empty are dropped.

    Args:
        text (str): The full text to paginate.
        min_words (int): Word count a page must reach before it may be closed.

    Returns:
        list[str]: The page texts in order.
    """
    pages = []
    current_page_lines = []
    current_word_count = 0

    def finish_page(lines):
        # Join with a space so the last word of one line does not fuse with
        # the first word of the next.
        page_text = clean_text(" ".join(lines)).strip()
        if page_text:
            pages.append(page_text)

    for line in text.splitlines():
        current_page_lines.append(line)
        current_word_count += len(line.split())

        # The count only grows within a page, so once the threshold is met it
        # stays met until the next blank line closes the page.
        if current_word_count >= min_words and not line.strip():
            finish_page(current_page_lines)
            current_page_lines = []
            current_word_count = 0

    if current_page_lines:
        finish_page(current_page_lines)

    return pages

def extract_pdf_text(file_path, max_bad_pages=5):
    """Extract text from a PDF page by page, blanking pages that look broken.

    Each non-blank page is passed through ``clean_text`` and, when
    ``is_spaced_out`` flags it, through ``fix_spaced_line``. A page flagged by
    ``glued_text`` or ``is_garbled`` is stored as an empty string. Extraction
    stops early once ``max_bad_pages`` glued pages (or that many garbled pages)
    have been seen without a good page in between; blank pages neither extend
    nor reset such a run.

    Args:
        file_path: Path passed to ``fitz.open``.
        max_bad_pages (int): Run length of glued or garbled pages that aborts
            extraction. Defaults to 5.

    Returns:
        tuple: ``(text_pages, all_text_empty, garbage, glued)`` where
            - ``text_pages`` is a list with one string per processed page
              ("" for blank or rejected pages); it is shorter than the
              document when extraction stopped early,
            - ``all_text_empty`` is True when no page contributed usable text,
            - ``garbage`` is True when the garbled-run counter is non-zero at
              the end,
            - ``glued`` is True when the glued-run counter is non-zero at the
              end.

    NOTE(review): because the run counters reset on every good page, the two
    flags describe only the last non-blank page(s) processed, not the whole
    document. Confirm this is the intended trigger for the OCR queue.
    """
    text_pages = []
    glued_count = 0
    garbage_count = 0
    all_text_empty = True

    doc = fitz.open(file_path)
    try:
        for page_num in range(len(doc)):
            text = doc[page_num].get_text("text")

            if not text.strip():
                text_pages.append("")  # Keep list index aligned with page number
                continue

            # Clean text
            text = clean_text(text)
            if is_spaced_out(text):
                text = fix_spaced_line(text)

            # Check if problematic text
            glued = glued_text(text)
            garbage = is_garbled(text)

            # Each counter tracks the current run of bad pages of its kind.
            glued_count = glued_count + 1 if glued else 0
            garbage_count = garbage_count + 1 if garbage else 0

            if glued or garbage:
                text_pages.append("")
            else:
                text_pages.append(text.strip())
                all_text_empty = False

            # Stop processing if too many bad pages
            if glued_count >= max_bad_pages or garbage_count >= max_bad_pages:
                break
    finally:
        # Release the file handle even if a page fails to parse.
        doc.close()

    return text_pages, all_text_empty, (garbage_count > 0), (glued_count > 0)


In [7]:
def get_book_info(filename):
    """Parse ``Author_Title_Year`` metadata out of a file name stem.

    The name is split on underscores and the first three fields are taken as
    author, title and year; any further fields are ignored.

    Args:
        filename: File name without extension, e.g. ``"Henrik Ibsen_Hedda Gabler_1890"``.

    Returns:
        tuple: ``(author, title, year)`` as strings, or
        ``("unknown", filename, None)`` when the name has fewer than three
        underscore-separated fields or cannot be split as a string.
    """
    try:
        author, title, year = filename.split('_')[:3]
    except (AttributeError, TypeError, ValueError):
        # ValueError: fewer than three fields; AttributeError/TypeError: not a str.
        # Deliberately narrower than a bare except so Ctrl-C still propagates.
        return "unknown", filename, None
    return author, title, year

In [8]:
def create_books_dict(folder_path, output_json_path, min_words=400):
    """
    Walk the entries of ``folder_path`` and compile every readable book into a dict.

    Handling per entry:
      - directories are skipped
      - .txt files are read as UTF-8 and paginated with ``split_pages``
      - .pdf files go through ``extract_pdf_text``; if the result is reported as
        garbled, glued or empty, the path is added to the OCR pile instead of
        being compiled
      - any other extension is reported and skipped
      - any exception while handling an entry is reported and the path is
        added to the error pile

    The compiled dictionary is keyed by file name without extension:
    {
        "<file stem>": {
            "title": ...,
            "author": ...,
            "year": ...,
            "pages": [...]
        },
        ...
    }

    It is written to ``output_json_path`` as JSON and returned together with
    the two piles as ``(compiled_books, ocr_pile, error_pile)``.
    """
    compiled_books = {}
    ocr_pile = []
    error_pile = []

    for filename in os.listdir(folder_path):
        file_path = os.path.join(folder_path, filename)

        print()
        print(f"Processing {filename}...")
        try:
            if os.path.isdir(file_path):
                continue

            book_info, ext = os.path.splitext(filename)
            author, title, year = get_book_info(book_info)
            suffix = ext.lower()

            if suffix == ".txt":
                with open(file_path, 'r', encoding='utf-8') as file:
                    pages = split_pages(file.read(), min_words=min_words)

            elif suffix == ".pdf":
                pages, empty, garbage, glued = extract_pdf_text(file_path)

                # First matching problem wins, in this priority order.
                problems = (
                    (garbage, f"Garbled text detected in {filename}."),
                    (glued, f"Glued text detected in {filename}."),
                    (empty, f"Empty text detected in {filename}."),
                )
                report = next((msg for flagged, msg in problems if flagged), None)
                if report is not None:
                    print(report)
                    ocr_pile.append(file_path)
                    continue

            else:
                print(f"Skipping unsupported file type: {filename}")
                continue

            # Reached only for a .txt file or a PDF with usable text.
            compiled_books[book_info] = {
                "title": title,
                "author": author,
                "year": year,
                "pages": pages
            }

        except Exception as e:
            print(f"Error processing {filename}: {e}")
            error_pile.append(file_path)

    # Save to JSON
    with open(output_json_path, 'w', encoding='utf-8') as json_file:
        json.dump(compiled_books, json_file, indent=2, ensure_ascii=False)

    return compiled_books, ocr_pile, error_pile

In [9]:
# Run the pipeline over the "data" folder (relative to the working directory).
# NOTE: the output path is absolute and machine-specific; adjust before running elsewhere.
books_dict, ocr_pile, error_pile = create_books_dict(
    folder_path="data",
    output_json_path="/home/jeanluca/code/JeanLucaSchindler/FUSE/FUSE_Module0a/Preproc/test.json",
    min_words=400
)

# Recorded runtime for 100 books: 12m 30s


Processing Pierre Bourdieu_Distinction, A Social Critique of the Judgement of Taste_1979 (1).pdf...
Garbled text detected in Pierre Bourdieu_Distinction, A Social Critique of the Judgement of Taste_1979 (1).pdf.

Processing Marianne Moore_Complete Poems_1967.pdf...

Processing Iris Murdoch_The Sea the Sea_1978.pdf...

Processing Xenophon_Anabasis_370 BC.txt...

Processing Mary Wollstonecraft_A Vindication of the Rights of Women_1792.pdf...

Processing Henrik Ibsen_Hedda Gabler_1890.txt...

Processing Yukio Mishima_The Sailor Who Fell From Grace with the Sea_1963.pdf...


In [10]:
# Counts, in order: compiled books, files queued for OCR, files that raised an error
len(books_dict), len(ocr_pile), len(error_pile)

(6, 1, 0)

In [22]:
# Spot-check one page of a compiled book; i is a 1-based page number, hence the i-1 index
i = 13
books_dict["Marianne Moore_Complete Poems_1967"]['pages'][i-1]

'A NOTE ON THE TEXT The text conforms as closely as is now possible to the author’s final intentions. Five of the poems written after the first printing of this volume have been included. Late authorized corrections, and earlier corrections authorized but not made, have been incorporated. Punctuation, hyphens, and line arrangements silently changed by editor, proofreader, or typesetter have been restored. Misleading editorial amplifications of the notes have been removed. Clive Driver OceanofPDF.com'

# Adjust JSON for MongoDB

In [None]:
def adjust_json_to_mongodb(dict_style_json, replacement=" "):
    """
    Convert dict-style JSON to list-style for MongoDB compatibility,
    substituting ``replacement`` for every blank page.

    Each value of the input dict becomes one element of the output list. The
    input and its nested dicts are not modified: every book is shallow-copied
    and its ``pages`` list is rebuilt.

    Parameters:
        dict_style_json (dict): Dictionary-style JSON (keys as filenames or book IDs)
        replacement: Value stored in place of a blank page. Defaults to a
            single space; pass ``None`` to emit nulls instead.

    Returns:
        list: List-style JSON suitable for MongoDB insertion
    """
    mongo_ready = []

    for book_id, book_data in dict_style_json.items():
        # Copy to avoid mutating original
        book = book_data.copy()

        # Fall back to the dict key as the title so the identifier is not lost
        if "title" not in book:
            book["title"] = book_id

        # Sanitize pages: a page is blank when it is whitespace-only or not a
        # string at all (e.g. None from an earlier pass with replacement=None).
        if "pages" in book and isinstance(book["pages"], list):
            book["pages"] = [
                p if isinstance(p, str) and p.strip() else replacement
                for p in book["pages"]
            ]

        mongo_ready.append(book)

    return mongo_ready


In [None]:
# Convert to list form; with replacement=None, blank pages are written as JSON null
book_list = adjust_json_to_mongodb(books_dict, None)

# Write the list-style JSON to test_list.json in the current working directory
with open("test_list.json", "w", encoding="utf-8") as f:
    json.dump(book_list, f, indent=2, ensure_ascii=False)