# In [3]:
import re

# Book text that is cleaned and then overwritten in place.
file_path = 'Book/Kafka_Lahire.txt'

# "[<digits>]" optionally followed by a space and "↑", plus everything up to
# and including the end of that line (or the end of the text).
BRACKET_FOOTNOTE_RE = re.compile(r"\[\d+\] ?↑?.*?(\n|$)")

# A "<digits>. text" entry that starts right after a newline (or at the very
# start of the text) and runs until the next blank line or the end of the
# text; DOTALL lets a single entry span several lines.
NUMBERED_FOOTNOTE_RE = re.compile(
    r"(\n|^)\d+\.\s+.+?(?=\n\n|\Z)",
    flags=re.DOTALL,
)

# Load the whole book into memory.
with open(file_path, 'r', encoding='utf-8') as f:
    text = f.read()

# Strip bracketed footnote lines first, then the numbered footnote entries.
text = BRACKET_FOOTNOTE_RE.sub("", text)
text = NUMBERED_FOOTNOTE_RE.sub("", text)

# Persist the cleaned text over the original file.
with open(file_path, 'w', encoding='utf-8') as f:
    f.write(text)


# In [1]:
import random
import openai
from semantic_text_splitter import TextSplitter
import json
import os
from dotenv import load_dotenv


def split_and_merge_text(text, min_size=1500, max_size=3000, separator="\n"):
    """
    Splits the text with a TextSplitter and merges neighbouring chunks.

    The splitter is asked for chunks whose length falls in the
    ``(min_size, max_size)`` range. Consecutive chunks are then packed
    together greedily as long as the packed result (including separators)
    stays within ``max_size``, and finally runs of consecutive chunks that
    are still shorter than ``min_size`` are coalesced.

    Merged pieces are joined with ``separator`` rather than concatenated
    directly, so that the last word of one chunk is not glued to the first
    word of the next one when the splitter hands back whitespace-trimmed
    chunks.

    Note that ``min_size`` is best-effort only: an undersized chunk that
    cannot be packed with a neighbour without exceeding ``max_size`` (or a
    text that is shorter than ``min_size`` altogether) is returned as is.
    Conversely, coalescing a run of undersized chunks in the final pass is
    not bounded by ``max_size``.

    Args:
        text (str): The input text to be split.
        min_size (int): The minimum size targeted for each chunk.
        max_size (int): The maximum size used when packing chunks together.
        separator (str): String inserted between chunks that get merged.
            Pass ``""`` to concatenate the chunks directly.

    Returns:
        list: The merged chunks, in their original order.
    """
    # Initialize the splitter with the chunk size range
    splitter = TextSplitter((min_size, max_size))

    # Split the text into initial chunks
    initial_chunks = splitter.chunks(text)

    # Pass 1: greedily pack consecutive chunks while the joined result
    # (separators included) still fits within max_size.
    packed_chunks = []
    parts = []
    parts_size = 0
    for chunk in initial_chunks:
        # A separator is only needed when something is already buffered.
        added = len(chunk) + (len(separator) if parts else 0)
        if parts_size + added <= max_size:
            parts.append(chunk)
            parts_size += added
        else:
            if parts:
                packed_chunks.append(separator.join(parts))
            parts = [chunk]
            parts_size = len(chunk)

    if parts:
        packed_chunks.append(separator.join(parts))

    # Pass 2: coalesce runs of consecutive chunks that are still below
    # min_size. A run is flushed as soon as a full-size chunk shows up, so
    # ordering is preserved.
    final_chunks = []
    pending = []
    for chunk in packed_chunks:
        if len(chunk) < min_size:
            pending.append(chunk)
            continue
        if pending:
            final_chunks.append(separator.join(pending))
            pending = []
        final_chunks.append(chunk)

    if pending:
        final_chunks.append(separator.join(pending))

    return final_chunks


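# Notebook output from the cell above:
#   ModuleNotFoundError: No module named 'typing_extensions'
# NOTE(review): presumably raised while importing one of the third-party
# packages; installing `typing_extensions` in the notebook environment
# should resolve it -- confirm against the full traceback.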