In [5]:
import pdfplumber
import pandas as pd
import re

# Statement rows begin with two dd.mm dates (operation date, then value date),
# e.g. "06.05 07.05 Card ...".
_TRANSACTION_LINE_RE = re.compile(r"\d{2}\.\d{2}\s+\d{2}\.\d{2}")

# Amounts written with dot-grouped thousands and a decimal comma, e.g. "1.234,56".
_GROUPED_AMOUNT_RE = re.compile(r"[+-]?\d{1,3}(?:\.\d{3})+,\d+")

_OUTPUT_COLUMNS = ["date", "description", "amount"]


def _parse_amount(token):
    """Convert an amount token such as ``"12,50€"`` to a float, or None if unparseable."""
    cleaned = token.replace("€", "")
    if _GROUPED_AMOUNT_RE.fullmatch(cleaned):
        # Only strip dots when the token unambiguously uses them as thousands
        # separators; otherwise "1.234,56" would become the invalid "1.234.56".
        cleaned = cleaned.replace(".", "")
    cleaned = cleaned.replace(",", ".")
    try:
        return float(cleaned)
    except ValueError:
        return None


def _read_transaction_lines(pdf_path):
    """Return the stripped text lines of the PDF that start like a transaction row."""
    matched = []
    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            text = page.extract_text()
            if not text:
                # Pages without extractable text contribute nothing.
                continue
            for raw_line in text.split("\n"):
                line = raw_line.strip()
                if _TRANSACTION_LINE_RE.match(line):
                    matched.append(line)
    return matched


def extract_operations_from_pdf(pdf_path, output_csv_path, year):
    """Extract transaction rows from a bank-statement PDF and write them to a CSV.

    A line is treated as a transaction when it starts with two ``dd.mm`` dates
    and has at least five whitespace-separated tokens. The second date is
    combined with ``year`` to form the ``date`` column, the last token is
    parsed as the amount, and everything in between is the description.
    Every amount is stored negated (all rows are treated as debits for now).

    Rows whose date or amount cannot be parsed are dropped. If no row
    qualifies, a CSV containing only the header line is written.

    Args:
        pdf_path: Path of the PDF statement to read.
        output_csv_path: Path of the CSV file to write (UTF-8 with BOM).
        year: Year appended to each ``dd.mm`` date.

    Returns:
        None. The result is written to ``output_csv_path`` and a confirmation
        message is printed.
    """
    records = []
    for line in _read_transaction_lines(pdf_path):
        parts = line.split()
        if len(parts) < 5:
            continue

        amount = _parse_amount(parts[-1])
        if amount is None:
            signed_amount = None
        else:
            # Treat all as debit for now. ``or 0.0`` keeps a zero amount as a
            # valid 0.0 row instead of dropping it or emitting "-0.0".
            signed_amount = -amount or 0.0

        records.append({
            # parts[0] is the operation date; the value date (parts[1]) is used.
            "date": f"{parts[1]}.{year}",
            "description": " ".join(parts[2:-1]),
            "amount": signed_amount,
        })

    # Explicit columns keep the frame well-formed even when nothing matched;
    # without them an empty frame has no "date" column and indexing it fails.
    df = pd.DataFrame(records, columns=_OUTPUT_COLUMNS)

    # Invalid dates (e.g. "31.02") become NaT and are removed below.
    df["date"] = pd.to_datetime(df["date"], format="%d.%m.%Y", errors="coerce")

    # Drop rows with missing or invalid dates or amounts
    df = df.dropna(subset=["date", "amount"])

    # Export to CSV
    df.to_csv(output_csv_path, index=False, encoding="utf-8-sig")
    print(f"✅ CSV file exported: {output_csv_path}")


# === Manual run: convert the two monthly statement PDFs ===
if __name__ == "__main__":
    # (source PDF, destination CSV) pairs, processed in the order listed.
    statement_jobs = (
        (
            r"C:\Users\basti\Desktop\Bank_analysis\data\Mai2025.pdf",
            r"C:\Users\basti\Desktop\Bank_analysis\data\Mai2025_final.csv",
        ),
        (
            r"C:\Users\basti\Desktop\Bank_analysis\data\Juin2025.pdf",
            r"C:\Users\basti\Desktop\Bank_analysis\data\Juin2025_final.csv",
        ),
    )

    for source_pdf, target_csv in statement_jobs:
        extract_operations_from_pdf(
            pdf_path=source_pdf,
            output_csv_path=target_csv,
            year=2025,
        )


✅ CSV file exported: C:\Users\basti\Desktop\Bank_analysis\data\Mai2025_final.csv
✅ CSV file exported: C:\Users\basti\Desktop\Bank_analysis\data\Juin2025_final.csv
