In [None]:
# Load IPython's autoreload extension so edited modules can be re-imported without restarting.
%load_ext autoreload
# Mode 1: only modules registered with %aimport are reloaded before each cell runs.
%autoreload 1

In [None]:
import copy
import functools
import itertools as it
import operator as op
import re
import sys
import warnings
from decimal import Decimal
from pathlib import Path

import numpy as np
import pandas as pd

%aimport sieve

%matplotlib inline
# Reusable context manager (`with pd_verbose:`) that temporarily widens pandas'
# display limits; arguments are (option name, value) pairs.
pd_verbose = pd.option_context(
    "display.max_columns", 50,
    "display.max_colwidth", 50,
    "display.expand_frame_repr", False,
    "display.max_rows", 10000,
)

In [None]:
# Quantization target: two decimal places.
one_cent = Decimal("0.01")


def c(value):
    """Return `value` as a Decimal quantized to cents.

    `value` is anything the `Decimal` constructor accepts. Rounding follows the
    active decimal context.
    """
    amount = Decimal(value)
    return amount.quantize(one_cent)

In [None]:
def index_to_date(df, date_col="Date", drop=True, **sort_kwargs):
    """Reindex by date, sorted by (date, original index).

    Explicitly sorting the index guarantees datetime slicing will work. The sort
    is done on a two-level (date, original index) key so that the original
    ordering of transactions within each day is preserved.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a single-level index and a `date_col` column.
    date_col : str
        Name of the column that becomes the new index.
    drop : bool
        If True (default) the original index is discarded; otherwise it is kept
        as a column named "index".
    **sort_kwargs
        Forwarded to `DataFrame.sort_index`.

    Returns
    -------
    pandas.DataFrame
        A new frame indexed by `date_col`; `df` itself is not modified.

    Warns
    -----
    UserWarning
        If the original index is not strictly increasing once rows are ordered
        by date, i.e. the input was not already in date order.
    """
    by_date = (
        # Force the index name so reset_index() always emits a column called
        # "index", even when the incoming index carries its own name.
        df.rename_axis("index")
        .reset_index()
        .set_index([date_col, "index"])
        .sort_index(**sort_kwargs)
        .reset_index(level="index")
    )
    if not (np.diff(by_date["index"]) > 0).all():
        warnings.warn(
            "transaction index sorted by date is not monotonic increasing",
            stacklevel=2,
        )
    return by_date.drop(columns="index") if drop else by_date

In [None]:
def reduce_description(df, deets, cols=None):
    """Total the `cols` columns of `df` for each description in `deets`.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a "Description" column and every column in `cols`.
    deets : iterable
        Description values to summarise; one output row is produced per entry.
    cols : list of str, optional
        Columns to sum. Defaults to ["In", "Out"].

    Returns
    -------
    pandas.DataFrame
        Columns `cols` followed by "Description", sorted by `cols` in
        descending order. Empty (with those columns) when `deets` is empty.
    """
    if cols is None:
        cols = ["In", "Out"]
    # Materialise once: `deets` is walked for the totals and again for the
    # label column, so a one-shot iterable would otherwise be exhausted.
    deets = list(deets)
    if not deets:
        # pd.concat raises ValueError when given nothing to concatenate.
        return pd.DataFrame(columns=[*cols, "Description"])
    totals = pd.concat(
        [
            df.loc[df["Description"] == d, cols].sum().to_frame().T
            for d in deets
        ],
        join="inner",
        ignore_index=True,
    )
    totals.insert(len(cols), "Description", deets)
    return totals.sort_values(by=cols, ascending=False)

In [None]:
# Load the sample export and shorten the amount column names
# ("Withdrawls" is the key as spelled in the rename; presumably the CSV header).
demodf = pd.read_csv(
    "5000_bank_rec_sample.csv", parse_dates=["Date"]
).rename(columns={"Deposits": "In", "Withdrawls": "Out"})

# Sort by (date, original row order), then turn the date back into a column.
demodf = index_to_date(demodf).reset_index()

quantcols = ["In", "Out", "Balance"]
demodf[quantcols] = (
    demodf[quantcols]
    # Strip thousands separators.
    .replace(",", "", regex=True)
    # Blank / whitespace-only cells become missing. The pattern is anchored so
    # that a value which merely contains whitespace is not discarded.
    .replace(r"^\s*$", np.nan, regex=True)
    .astype(float)
    .fillna(0)
    # Element-wise conversion to Decimal cents; Series.map is used per column
    # because DataFrame.applymap is deprecated in recent pandas.
    .apply(lambda col: col.map(c))
)
with pd_verbose:
    display(demodf)

In [None]:
# Distinct description values (set order is arbitrary), then per-description totals.
deets = [*set(demodf["Description"])]
reduce_description(demodf, deets)

In [None]:
# Build the demo sieve from the cleaned transactions.
tree = sieve.Sieve(demodf)

tree = tree.extend([("select", "index == [352, 403]")])

# Filters are (key, condition) pairs. A condition is either an expression string
# or a callable taking the frame and returning a boolean mask.
# `patt=patt` binds the current pattern at definition time (avoids late binding).
tree = tree.extend((
    ("gt", "In > 1e6 | Out > 1e6"),
    ("str_isin", 'Description == ["ATM", "Cash"]'),
    *(
        (k, lambda df, patt=patt: df["Description"].str.contains(patt, case=False))
        for k, patt in (
            ("contains_debit", "debit"),
            ("contains_misc", "misc"),
        )
    ),
))

# A tuple rather than a generator, so the patterns survive being iterated more than once.
patts = tuple(
    (k, re.compile(s, re.IGNORECASE)) for k, s in (("patt_trans", "trans"),)
)
demo_dates = [
    "2020/" + s
    for s in ("09/15", "10/12", "11/21", "08/20", "09/09", "10/30", "12/10")
]
datelims = ["2020-11-10", "2021-01-15"]

tree = tree.branch(
    (
        *(
            (k, lambda df, patt=patt: df["Description"].str.contains(patt))
            for k, patt in patts
        ),
        (
            "date_isin",
            # Truncate to midnight with .dt.normalize() ("datetime64[D]" is not a
            # resolution pandas 2.x supports) and compare against parsed
            # timestamps instead of raw strings.
            lambda df: df["Date"]
            .dt.tz_localize(None)
            .dt.normalize()
            .isin(pd.to_datetime(demo_dates)),
        ),
        ("daterange", 'Date >= "{0}" & Date <= "{1}"'.format(*datelims)),
        ("rem", None),
    ),
    "gt",
)

# NOTE(review): the key reads "lt1e5" but the predicate selects amounts *greater*
# than 1e5 — confirm which is intended. The key is left unchanged because a later
# cell looks this leaf up by name.
tree = tree.branch([("lt1e5", "In > 1e5 | Out > 1e5")], "contains_debit")

tree

In [None]:
# Per-description totals for the descriptions present in `tree.get_data(None)`
# (presumably the rows no filter claimed — confirm against sieve).
reduce_description(
    demodf,
    [*set(tree.get_data(None)["Description"])],
)

In [None]:
# Print each leaf's key, then render that leaf's data, separated by a blank line.
for leaf_key, leaf in tree.traverse_leaves():
    print(leaf_key)
    display(leaf.data)
    print()

In [None]:
# Pick from a deep copy so `tree` itself is left as built
# (presumably picking mutates the tree — confirm against sieve).
treecp = copy.deepcopy(tree)
res = sieve.Results()

# A single leaf under "gt", addressed by its path.
res.picker("gt").pick_leaf("date_isin", treecp.get_leaf("gt", "date_isin"))

# Every remaining leaf under "gt" except the one keyed "daterange".
res.picker("gt").pick_leaves(
    keyed_leaf
    for keyed_leaf in treecp.traverse_leaves("gt")
    if keyed_leaf[0] != "daterange"
)

# (label, *path): the label names the pick, the path locates the leaf in the tree.
res.picker("lt").pick_leaves(
    (label, treecp.get_leaf(*path))
    for label, *path in (
        ("isin", "str_isin"),
        ("deb_llt", "contains_debit", "lt1e5"),
        ("misc", "contains_misc"),
    )
)

res.picker("down", "we", "go").pick_leaf("rem", treecp.get_leaf(None))

treecp

In [None]:
# Show the Results object populated by the picks above.
res

In [None]:
# Merged view of the picks stored under 'gt' (merge semantics are defined in sieve, not shown here).
res.picker('gt').merged()

In [None]:
# Merged view from the 'down' level; the picks above were stored under ('down', 'we', 'go').
res.picker('down').merged()