In [None]:
import numpy as np
import pandas as pd
import pdfplumber as pp
import time
import re
import os

Prepare the paths to the files. All files in `statement_folder` (and its subfolders) must be bank account statements; alternatively, adjust the `if` condition to filter the file names.

In [None]:
# Root folder that is searched recursively for statement files.
statement_folder = "/home/user1/Documents/DKB/Kontoauszuege"
# Only files whose name contains this account number are picked up.
account_nr = "########"  # Secret :)
# Walk the folder tree once and keep the full path of every matching file.
paths = [
    os.path.join(folder, name)
    for folder, _subfolders, names in os.walk(statement_folder)
    for name in names
    if account_nr in name
]

Open all files and extract entries

In [None]:
# Header text after which the table rows start, for the full-page read and
# for the cropped read respectively.
FULL_HEADER = "Gutschrift in EUR"
CROPPED_HEADER = "Belastung in EUR"
# Fraction of the page width that is kept for the cropped read.
CROP_WIDTH_FRACTION = 0.8
# An entry line starts with two day/month dates, e.g. "01.02. 03.02.".
# Raw string: "\d" and "\." are invalid escape sequences in a normal literal.
ENTRY_START = re.compile(r'\d{2}\.\d{2}\. \d{2}\.\d{2}\.')


def _table_part(page_text, header):
    """Cut the table rows out of the text of a single page.

    Parameters
    ----------
    page_text : str
        Text extracted from one (full or cropped) page.
    header : str
        Header text that directly precedes the first table row.

    Returns
    -------
    (part, more) : tuple
        ``part`` is the text of the table rows on this page, or ``None`` when
        the page has no table rows to read (header missing, or header directly
        followed by "ALTER KONTOSTAND"). ``more`` is ``True`` only when the
        table was not terminated by "ALTER KONTOSTAND" on this page, i.e. the
        following page has to be read as well.
    """
    if header not in page_text or header + "\nALTER KONTOSTAND" in page_text:
        return None, False
    part = page_text.split(header + "\n", 1)[1]
    if "ALTER KONTOSTAND" in part:
        return part.split("\nALTER KONTOSTAND")[0], False
    # No closing marker on this page: cut at the "Deutsche Kreditbank AG" line
    # and continue with the next page.
    return part.split("\nDeutsche Kreditbank AG")[0], True


all_entries = []
for p in paths:
    full_text = ""
    cropped_text = ""
    # Reset per file so that a year found in a previous PDF is never reused.
    year = None
    with pp.open(p) as pdf:
        # Store table contents of all pdf pages as text.
        # First for full pages
        for page in pdf.pages:
            # extract_text() may yield None for a page without text.
            page_text = page.extract_text() or ""
            # Find the year: the token that follows the first " / " on the
            # page. Pages without that marker keep the year found so far.
            if " / " in page_text:
                year = page_text.split(" / ", 1)[1].split(" ", 1)[0]
            part, more = _table_part(page_text, FULL_HEADER)
            if part is not None:
                full_text += "\n" + part
            if not more:
                break
        # And secondly for cropped pages, where the same page is read again,
        # but the last 20% are cropped out so that it is possible to determine
        # if values are in the last column of a table or not (depending on
        # whether they appear in both reads)
        for page in pdf.pages:
            right_edge = int(int(page.width) * CROP_WIDTH_FRACTION)
            cropped_page_text = page.crop(
                (0, 0, right_edge, page.height)).extract_text() or ""
            part, more = _table_part(cropped_page_text, CROPPED_HEADER)
            if part is not None:
                cropped_text += "\n" + part
            if not more:
                break
    # Split full text and cropped text into lines. Both texts start with
    # "\n", so index 0 is always an empty line.
    lines_full = full_text.split("\n")
    lines_cropped = cropped_text.split("\n")
    for i, line in enumerate(lines_full[1:], 1):
        if ENTRY_START.match(line) is None:
            # Continuation line: insert it before the last block (the amount)
            # of the closest preceding non-empty line.
            target = i - 1
            while target >= 0 and lines_full[target] == "":
                target -= 1
            # target < 0 means there is no preceding entry to attach to; the
            # orphan line is dropped instead of wrapping around to the end of
            # the list via a negative index.
            if target >= 0:
                blocks = lines_full[target].split(" ")
                lines_full[target] = " ".join(blocks[:-1] + [line] +
                                              [blocks[-1]])
            lines_full[i] = ""
        elif line == lines_cropped[i]:
            # The line reads the same with the last column cropped away, so
            # its amount is not in the last column: mark it as negative.
            # NOTE(review): this relies on both reads producing the same
            # number of lines in the same order - confirm for new layouts.
            blocks = line.split(" ")
            lines_full[i] = " ".join(blocks[:-1] + ["-" + blocks[-1]])
    # Remove empty lines
    lines_full = [line for line in lines_full if line != '']
    if lines_full and year is None:
        raise ValueError("No statement year found in " + str(p))
    # Add the two-digit year to both dates at the start of every line
    for i, line in enumerate(lines_full):
        blocks = line.split(" ")
        lines_full[i] = " ".join([blocks[0] + year[2:]] +
                                 [blocks[1] + year[2:]] + blocks[2:])
    all_entries.extend(lines_full)

Write entries into DataFrame

In [None]:
# Split every entry into (first date, subject, amount). The second date
# (block 1) is not used; the last block is the amount.
records = []
for e in all_entries:
    blocks = e.split(" ")
    records.append((blocks[0], " ".join(blocks[2:-1]), blocks[-1]))
# Build the DataFrame in one go instead of appending row by row, which
# re-allocates the frame for every entry.
table = pd.DataFrame(records, columns=["Date", "Subject", "Amount"])
# Format entries correctly
table["Date"] = pd.to_datetime(table["Date"], format="%d.%m.%y")
# Amounts use "." as thousands separator and "," as decimal separator.
table["Amount"] = (table["Amount"]
                   .str.replace(".", "", regex=False)
                   .str.replace(",", ".", regex=False)
                   .astype(float))
# A stable sort keeps entries of the same day in the order in which they
# appear in all_entries.
table = table.sort_values(by="Date", kind="mergesort").reset_index(drop=True)
# Display result
table

Have a quick look at the data:

In [None]:
# Print a summary of the DataFrame (columns, dtypes, non-null counts).
table.info()

In [None]:
# Plot the amount of every entry against its date.
table.plot(x="Date", y="Amount")

Store the dataframe for further analysis

In [None]:
# Write the DataFrame to a pickle file; the relative path is resolved against
# the current working directory.
table.to_pickle("statement_data_raw.pkl")