In [None]:

# Directory layout: everything lives below DATA_DIR.
#   INPUT_DIR     - tree that is scanned for PDFs
#   PROCESSED_DIR - tree that receives the generated JSON files
#   LOG_FILE      - text file collecting extraction errors
DATA_DIR = "data"
INPUT_DIR, PROCESSED_DIR, LOG_FILE = (
    os.path.join(DATA_DIR, leaf)
    for leaf in ("input", "processed", "error_log.txt")
)

# Process a single PDF
def process_pdf(file_path):
    """Extract the text of one PDF, logging failures instead of raising.

    Args:
        file_path: Path of the PDF that is handed to ``extract_text``.

    Returns:
        Whatever ``extract_text(file_path)`` returns, or ``None`` if the
        extraction raised an exception (the error is appended to LOG_FILE).
    """
    try:
        return extract_text(file_path)
    except Exception as e:
        # Failures are recorded and signalled with None so that a caller
        # iterating over many files can carry on with the next one.
        log_dir = os.path.dirname(LOG_FILE)
        if log_dir:
            # The log may be written before anything else created its folder.
            os.makedirs(log_dir, exist_ok=True)
        # Explicit UTF-8: with the platform default encoding, a path or error
        # message containing non-encodable characters would raise inside this
        # handler and turn a logged failure into a crash.
        with open(LOG_FILE, "a", encoding="utf-8") as log_file:
            log_file.write(f"Fehler bei {file_path}: {e}\n")
        return None

# Store the extracted text as JSON
def save_as_json(output_file, text, file_path):
    """Write the extracted text plus its source location to a JSON file.

    Args:
        output_file: Path of the JSON file to create or overwrite.
        text: Extracted text, stored under the ``"text"`` key.
        file_path: Path of the source file; stored as ``"file_path"`` and,
            reduced to its base name, as ``"file_name"``.
    """
    record = dict(
        file_name=os.path.basename(file_path),
        file_path=file_path,
        text=text,
    )
    # ensure_ascii=False keeps non-ASCII characters readable in the UTF-8 file.
    with open(output_file, "w", encoding="utf-8") as handle:
        json.dump(record, handle, indent=4, ensure_ascii=False)

# Process every PDF found in a directory tree (recursively)
def process_all_pdfs_recursive(input_dir=INPUT_DIR, output_dir=PROCESSED_DIR):
    """Extract all PDFs below ``input_dir`` into JSON files below ``output_dir``.

    The sub-directory structure of ``input_dir`` is mirrored in ``output_dir``;
    each PDF yields a JSON file with the same base name. The error log
    (LOG_FILE) is deleted at the start of every run.

    Args:
        input_dir: Root directory that is walked for PDF files.
        output_dir: Root directory that receives the JSON files; created if
            it does not exist.

    Returns:
        None. Results are written to disk via ``save_as_json``.
    """
    os.makedirs(output_dir, exist_ok=True)

    # Reset the error log so that it only reflects the current run.
    try:
        os.remove(LOG_FILE)
    except FileNotFoundError:
        pass

    # Collect first so tqdm knows the total and can show real progress.
    pdf_files = []
    for root, _, files in os.walk(input_dir):
        # Case-insensitive match so that "scan.PDF" is not silently skipped.
        # NOTE: "a.pdf" and "a.PDF" in the same folder map to the same a.json.
        pdf_files.extend(
            (root, name) for name in files if name.lower().endswith(".pdf")
        )

    # Progress display with tqdm
    for root, file in tqdm(pdf_files, desc="PDFs verarbeiten"):
        file_path = os.path.join(root, file)

        # Target folder mirrors the position of the PDF inside input_dir.
        relative_path = os.path.relpath(root, input_dir)
        target_dir = os.path.join(output_dir, relative_path)
        os.makedirs(target_dir, exist_ok=True)

        # Extract the PDF content; None means the failure was already logged.
        text = process_pdf(file_path)
        if not text:
            # Empty text is skipped as well, so no JSON is written for it.
            continue

        # Swap only the final extension; str.replace would also rewrite any
        # ".pdf" occurring earlier in the file name.
        json_name = os.path.splitext(file)[0] + ".json"
        save_as_json(os.path.join(target_dir, json_name), text, file_path)

# Entry point: when this code runs as the main module, process all PDFs using
# the default input and output directories.
if __name__ == "__main__":
    process_all_pdfs_recursive()

In [None]:
# Chunk react_docs using split_documents' default chunk size and overlap.
# NOTE(review): split_documents is defined further down in this file and
# react_docs is not defined in the visible code — confirm both exist before
# this statement is executed.
react_chunks = split_documents(react_docs)

In [None]:
def split_documents(documents, chunk_size=800, chunk_overlap=80):
    """Split documents into chunks of the given size and overlap.

    Args:
        documents: Documents passed on to the splitter's ``split_documents``.
        chunk_size: Chunk size forwarded to ``RecursiveCharacterTextSplitter``.
        chunk_overlap: Overlap forwarded to ``RecursiveCharacterTextSplitter``.

    Returns:
        The chunks produced by ``RecursiveCharacterTextSplitter.split_documents``.
    """
    splitter_options = {
        "chunk_size": chunk_size,
        "chunk_overlap": chunk_overlap,
    }
    splitter = RecursiveCharacterTextSplitter(**splitter_options)
    return splitter.split_documents(documents=documents)