In [None]:
import numpy as np
import pandas as pd
import pdfplumber as pp
import time
import re
import os

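Collect the paths of all statement files. Every file in `cc_folder` (and its subdirectories) whose name contains the first four credit card digits is treated as a credit card statement, so no other files matching that pattern may be stored there.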

In [None]:
# Root directory that is searched recursively for statement files.
cc_folder = "/home/user1/Documents/DKB/Kreditkartenabrechnungen"
# Only files whose name contains this marker are collected.
first_four_credit_card_digits = "####"  # Secret :)
# Flatten the directory walk into a single list of full file paths.
paths = [
    os.path.join(folder, name)
    for folder, _subfolders, names in os.walk(cc_folder)
    for name in names
    if first_four_credit_card_digits in name
]

Open all files and extract entries

In [None]:
# A line is kept as an entry only if it starts with two dates
# ("dd.mm.yy dd.mm.yy"). Every other line is treated as the continuation of
# the previous entry and discarded (merging is possible but complicated and
# unnecessary). Compiled once, as a raw string so the backslashes are not
# interpreted as (invalid) string escapes.
entry_pattern = re.compile(r"\d{2}\.\d{2}\.\d{2} \d{2}\.\d{2}\.\d{2}")

all_entries = []
for p in paths:
    # Table text of each page of this statement, in page order
    page_texts = []
    with pp.open(p) as pdf:
        for page in pdf.pages:
            # extract_text() may give None/"" for a page without text
            page_text = page.extract_text() or ""
            # Drop everything up to and including the first "EUR\n"
            # (presumably the end of the table header -- verify against a
            # real statement). Pages without that marker are skipped.
            after_header = page_text.split("EUR\n", 1)
            if len(after_header) < 2:
                continue
            # Drop the first line that follows the marker as well
            after_first_line = after_header[1].split("\n", 1)
            if len(after_first_line) < 2:
                continue
            body = after_first_line[1]
            # "Neuer Saldo" closes the table on the final table page;
            # all other pages are cut at "Zwischensumme".
            is_last_table_page = "Neuer Saldo" in body
            end_marker = "\nNeuer Saldo" if is_last_table_page else "\nZwischensumme"
            page_texts.append(body.split(end_marker)[0])
            if is_last_table_page:
                # Pages after the final balance are not read
                break
    # Split the collected text into lines and keep only real entries
    all_entries.extend(
        line for line in "\n".join(page_texts).split("\n")
        if entry_pattern.match(line)
    )

Write entries into DataFrame

In [None]:
def _parse_amount(raw):
    """Convert an amount string such as "1.234,56-" into a signed float.

    The last character is the sign ("+" is positive, anything else is
    treated as negative); "." is removed as thousands separator and "," is
    used as the decimal separator.
    """
    sign = 1 if raw[-1] == "+" else -1
    return float(raw[:-1].replace(".", "").replace(",", ".")) * sign


# Collect all rows first and build the DataFrame once; appending row by row
# with .loc copies the frame on every insert.
records = []
for e in all_entries:
    blocks = e.split(" ")
    # blocks[0] is the first date, blocks[1] (the second date) is ignored,
    # the last block is the amount and everything in between is the subject.
    records.append((blocks[0], " ".join(blocks[2:-1]), blocks[-1]))
table = pd.DataFrame(records, columns=["Date", "Subject", "Amount"])

# Format entries correctly
table = (
    table.assign(
        Date=pd.to_datetime(table["Date"], format="%d.%m.%y"),
        # astype keeps the column numeric even if there are no entries
        Amount=table["Amount"].apply(_parse_amount).astype("float64"),
    )
    # mergesort is stable: entries sharing a date keep their original
    # relative order, so the result is reproducible between runs.
    .sort_values(by="Date", kind="mergesort")
    .reset_index(drop=True)
)
# Display result
table

Have a quick look at the data:

In [None]:
# Print a summary of the parsed table (columns, dtypes, non-null counts).
table.info()

In [None]:
# Plot the amount over the date. The y-axis is limited to -50..50, so
# entries with a larger absolute amount run off the chart.
table.plot(x="Date", y="Amount", ylim=(-50, 50))

Store the dataframe for further analysis

In [None]:
# Write the table to a pickle file; the relative path is resolved against
# the kernel's current working directory.
table.to_pickle("cc_data_raw.pkl")