Trying to reproduce the paper [Scalable and Weakly Supervised Bank Transaction Classification](https://arxiv.org/abs/2305.18430), following the article [No Labels? No Problem! A Better Way to Classify Bank Transaction Data](https://medium.com/@echo_neath_ashtrees/no-labels-no-problem-a-better-way-to-classify-bank-transaction-data-73380ce20734).

In [1]:
import numpy as np
import pandas as pd

# Load the raw transaction export into a DataFrame. The path is relative to
# the working directory; the recorded run of this cell failed with
# FileNotFoundError, so ../data/CSVData.csv must exist before running it.
data = pd.read_csv('../data/CSVData.csv')

FileNotFoundError: [Errno 2] No such file or directory: '../data/CSVData.csv'

In [None]:
# preview the first 10 rows of the loaded data
data.head(10)

In [None]:
# count the missing (null) values in each column
missing_value_cnt = data.isnull().sum()
missing_value_cnt

The dataset from CommBank is quite clean.

In [None]:
# only need description data to train the categorizer
# (select the single 'Description' column; the other columns are not used here)
description = data['Description']
description

### Step 1: NLP bank description text normalisation and grouping

In [None]:
# text normalisation
from nltk.corpus import stopwords

# English stop words from the NLTK corpus
stop = stopwords.words('english')

# not sure if these words are useless, may comment them in the future
useless = ['au', 'aus', 'card', 'xx', 'value', 'date']

# One combined set: membership tests are O(1) instead of a linear scan of two
# lists per token, and the Series is filtered in a single pass instead of two.
_drop_words = set(stop) | set(useless)

# NOTE: every entry is expected to be a string; a missing (NaN) description
# would fail at .split() below.
description = (
    description.str.lower()                        # convert to lower case
    .str.replace(r'\d+', '', regex=True)           # remove numbers
    .str.replace(r'[^\w\s]', '', regex=True)       # remove all punctuation except words and space
    .str.strip()                                   # remove leading/trailing white space
    # drop stop words and useless words; split()/join also collapses runs of
    # inner white space to single spaces
    .apply(lambda text: ' '.join(word for word in text.split() if word not in _drop_words))
)

description

In [None]:
# grouping
# wrap the cleaned Series in a one-column DataFrame named 'Name'
dsc_df = description.to_frame(name='Name')

# number of rows per distinct name, most frequent first
dsc_group_df = (
    dsc_df.groupby(['Name'])
    .size()
    .to_frame(name='Count')
    .sort_values(by='Count', ascending=False)
)
dsc_group_df

### Step 2: weak label generation

In [None]:
from snorkel.labeling import labeling_function 

# Label values returned by the labelling functions below:
# -1 means "no vote" (Snorkel's abstain convention), 1 means the rule matched.
ABSTAIN, MATCH = -1, 1

@labeling_function()
def lf_heur_amount(x):
    """Sample heuristic labelling function based on transaction amounts.

    Args:
        x: A data row exposing ``"amount_max"`` and ``"amount_median"``.

    Returns:
        MATCH when ``amount_max`` is at least 100 and ``amount_median`` is
        at least 10, otherwise ABSTAIN.
    """
    # NOTE(review): the grouped frame built earlier in this notebook only has
    # a 'Count' column, so these amount fields still need to be computed
    # before this function can be applied to it -- TODO confirm.
    is_large = x["amount_max"] >= 100 and x["amount_median"] >= 10
    return MATCH if is_large else ABSTAIN

import re

# Keyword patterns for lf_regex_text; each entry is treated as a regular expression.
match_regexes = ["fee", "bank", "cash", "atm"]

# re.search() accepts a single pattern, not a list, so the entries are OR-ed
# into one alternation. It is compiled once here rather than on every row;
# re-run this cell after editing match_regexes.
_match_pattern = re.compile("|".join(f"(?:{rx})" for rx in match_regexes))


@labeling_function()
def lf_regex_text(x):
    """Sample text-match labelling function.

    Args:
        x: A data row exposing the text to search under ``"clean_text"``.

    Returns:
        MATCH if any pattern in ``match_regexes`` occurs anywhere in
        ``x["clean_text"]``, otherwise ABSTAIN.
    """
    if _match_pattern.search(x["clean_text"]):
        return MATCH
    return ABSTAIN

# all labelling functions, collected in one list for the applier
lfs = [lf_heur_amount, lf_regex_text]

In [None]:
from snorkel.labeling import PandasLFApplier, LFAnalysis
from snorkel.labeling.model.label_model import LabelModel
# https://snorkel.readthedocs.io/en/v0.9.7/packages/_autosummary/labeling/snorkel.labeling.model.label_model.LabelModel.html?highlight=labelmodel

import pandas as pd

# Apply the LFs to the data
# NOTE(review): the labelling functions read "amount_max", "amount_median" and
# "clean_text", but dsc_group_df as built in the grouping cell only has a
# 'Count' column (the cleaned text is its 'Name' index). Those fields
# presumably have to be added before this apply can succeed -- TODO confirm.
applier = PandasLFApplier(lfs=lfs)
L_train = applier.apply(df=dsc_group_df)

# Fit the label model and get the training labels
label_model = LabelModel(cardinality=2, verbose=True)  # assume binary classification
# fixed seed so repeated runs of the fit are reproducible
label_model.fit(L_train=L_train, n_epochs=500, log_freq=50, seed=123)
# store the predicted label for each grouped description; ties are resolved
# with the "abstain" policy
dsc_group_df["label"] = label_model.predict(L=L_train, tie_break_policy="abstain")