In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import re

In [2]:
df = pd.read_parquet('data/data.pqt')

# Drop rows where the memo is identical to the category description.
# The trailing .copy() makes the filtered frame an independent object, so the
# column assignment performed on `df` in a later cell does not trip pandas'
# SettingWithCopyWarning.
df = df[df["memo_clean"] != df["category_description"]].copy()
df.head()

Unnamed: 0,prism_consumer_id,prism_account_id,memo_clean,amount,category_description
0,0,acc_0,Kroger,20.98,GROCERIES
1,0,acc_0,CASH APP*FREE XXXXXXXXXX CA XX/XX,200.0,GENERAL_MERCHANDISE
7,0,acc_0,Amazon.com*HXXXWXXQX Amzn.com/bill WA XX/XX,33.2,GENERAL_MERCHANDISE
9,0,acc_0,Amazon,42.79,GENERAL_MERCHANDISE
10,0,acc_0,Kroger,36.55,GROCERIES


In [3]:
# (rows, columns) of the frame after the memo == category rows were dropped.
df.shape

(1283746, 5)

In [4]:
# Upper-case every memo in place; the regex patterns used further down
# (e.g. COM, X, POS) are written in upper case only.
df["memo_clean"] = df["memo_clean"].str.upper()
df.head()

Unnamed: 0,prism_consumer_id,prism_account_id,memo_clean,amount,category_description
0,0,acc_0,KROGER,20.98,GROCERIES
1,0,acc_0,CASH APP*FREE XXXXXXXXXX CA XX/XX,200.0,GENERAL_MERCHANDISE
7,0,acc_0,AMAZON.COM*HXXXWXXQX AMZN.COM/BILL WA XX/XX,33.2,GENERAL_MERCHANDISE
9,0,acc_0,AMAZON,42.79,GENERAL_MERCHANDISE
10,0,acc_0,KROGER,36.55,GROCERIES


In [5]:
def get_text(text):
    """Return the word tokens of ``text`` joined by single spaces.

    A token is a run of word characters, or two such runs glued together by
    a single ``*`` (for example ``APP*FREE``). A token can never start on an
    ``X``: runs consisting only of ``X`` disappear entirely, and leading
    ``X`` characters of any other word are skipped. Every other non-word
    character only acts as a separator.
    """
    token_matches = re.finditer(r"(?!\W|X)(\w+\*\w+|\w+)", text)
    return " ".join(found.group(1) for found in token_matches)
# Run the first cleaning approach over every memo and preview the result.
upper_memos = df["memo_clean"].apply(get_text)
upper_memos

0                                         KROGER
1                               CASH APP*FREE CA
7          AMAZON COM*HXXXWXXQX AMZN COM BILL WA
9                                         AMAZON
10                                        KROGER
                           ...                  
5200974                                    APPLE
5200990                   HOME DEPOT CREDIT CARD
5201000                                    APPLE
5201005                                   MACY S
5201010                   HOME DEPOT CREDIT CARD
Name: memo_clean, Length: 1283746, dtype: object

In [6]:
def get_text_alt(text):
    """Alternative memo cleaner that works in two regex passes.

    Pass 1 deletes every non-word, non-whitespace character except ``*``,
    ``.`` and ``-`` (so ``MACY'S`` becomes ``MACYS``).

    Pass 2 collects the surviving tokens and joins them with single spaces:
    a word is kept unless it starts with ``COM``, starts with two or more
    ``X``, or has a run of three or more ``X`` after at least one other
    character; a ``*`` is kept as its own token only when it is directly
    followed by a word character.
    """
    kept_chars = re.findall(r"(?!(?!\*|\.|\s|-)\W).", text)
    stripped = "".join(kept_chars)
    tokens = re.findall(r"\b(?!COM|X{2,}|\w+X{3,})\w+|\*\b", stripped)
    return " ".join(tokens)
# Run the alternative cleaner over every memo and preview the result.
alt_upper_memos = df["memo_clean"].apply(get_text_alt)
alt_upper_memos

0                          KROGER
1              CASH APP * FREE CA
7                AMAZON * AMZN WA
9                          AMAZON
10                         KROGER
                    ...          
5200974                     APPLE
5200990    HOME DEPOT CREDIT CARD
5201000                     APPLE
5201005                     MACYS
5201010    HOME DEPOT CREDIT CARD
Name: memo_clean, Length: 1283746, dtype: object

In [7]:
def test(text):
    return " ".join(re.findall(".*\*", text))

# Reduce each cleaned memo to its prefix ending at the last "*", then keep
# only the memos that actually contained a star.
star_memos = (
    alt_upper_memos
    .map(test)
    .pipe(lambda prefixes: prefixes[prefixes != ""])
)
star_memos

1          CASH APP *
7            AMAZON *
24         CASH APP *
30         CASH APP *
35         CASH APP *
              ...    
5197087      AMAZON *
5197094      AMAZON *
5197143         DNH *
5197158         EIG *
5197200         DNH *
Name: memo_clean, Length: 163608, dtype: object

In [8]:
# Distinct star prefixes, and how many distinct prefixes there are.
star_memos.unique(), star_memos.nunique()

(array(['CASH APP *', 'AMAZON *', 'AMZN DIGITAL *', ...,
        'EBAY INCXQULXKLL PAYMENTS *', 'CASH APP * DANNI *', 'MVQ *'],
       dtype=object),
 5936)

In [9]:
# Dump every distinct star prefix, one per line, for manual inspection.
# The encoding is pinned to UTF-8 so the file content does not depend on the
# platform's default locale encoding. The memos/ directory must already exist.
with open('memos/alt_star_memos.txt', 'w', encoding='utf-8') as f:
    f.writelines(memo + "\n" for memo in star_memos.unique())

In [10]:
# Upper-cased but otherwise uncleaned memos, for comparison with the cleaned series.
df["memo_clean"].head()

0                                          KROGER
1               CASH APP*FREE XXXXXXXXXX CA XX/XX
7     AMAZON.COM*HXXXWXXQX AMZN.COM/BILL WA XX/XX
9                                          AMAZON
10                                         KROGER
Name: memo_clean, dtype: object

In [11]:
# For each cleaned memo, keep the text from "POS" + whitespace through the end
# of the line, then drop the memos that had no such segment.
# NOTE(review): the pattern has no leading word boundary, so a longer word that
# merely ends in "POS" also matches; confirm this is intended.
POS_memos = (
    alt_upper_memos
    .map(lambda memo: "".join(re.findall(r"POS\s.+", memo)))
    .pipe(lambda tails: tails[tails != ""])
)
POS_memos

1371                             POS PURCHASE UNITED DAIRY F
1372                             POS PURCHASE UNITED DAIRY F
1374                                 POS PURCHASE DOLLARTREE
1380                             POS PURCHASE DILLONVALE IGA
1407                             POS PURCHASE DILLONVALE IGA
                                 ...                        
5178142     POS DEBIT DDA WALMART POS DEB BENTONVILLEAR CARD
5178143     POS DEBIT DDA WALMART POS DEB BENTONVILLEAR CARD
5178145    POS RECURRING DEBIT DDA DBT CRD FSP * GRAPE CR...
5178146            POS ATM DEBIT DBT CRD APPLE CASH SEN CARD
5178152           POS RECURRING DEBIT DDA DBT CRD APPLE CARD
Name: memo_clean, Length: 78747, dtype: object

In [12]:
# Per category, count the rows of `df` whose cleaned memo contained a "POS " segment.
# .count() reports non-null values for each remaining column separately.
df.loc[POS_memos.index].groupby("category_description").count()

Unnamed: 0_level_0,prism_consumer_id,prism_account_id,memo_clean,amount
category_description,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
EDUCATION,219,219,219,219
FOOD_AND_BEVERAGES,26532,26532,26532,26532
GENERAL_MERCHANDISE,34685,34685,34685,34685
GROCERIES,14610,14610,14610,14610
OVERDRAFT,6,6,6,6
PETS,547,547,547,547
RENT,57,57,57,57
TRAVEL,2091,2091,2091,2091


In [13]:
# Build the cleaned frame on a copy so `df` keeps its upper-cased, uncleaned memos.
df_clean = df.copy()
# alt_upper_memos was produced by applying get_text_alt to df["memo_clean"],
# so it carries the same index as df and the assignment lines up row by row.
df_clean["memo_clean"] = alt_upper_memos
df_clean.head()

Unnamed: 0,prism_consumer_id,prism_account_id,memo_clean,amount,category_description
0,0,acc_0,KROGER,20.98,GROCERIES
1,0,acc_0,CASH APP * FREE CA,200.0,GENERAL_MERCHANDISE
7,0,acc_0,AMAZON * AMZN WA,33.2,GENERAL_MERCHANDISE
9,0,acc_0,AMAZON,42.79,GENERAL_MERCHANDISE
10,0,acc_0,KROGER,36.55,GROCERIES


In [14]:
# Persist the frame holding the cleaned memos to data/data_clean.pqt.
df_clean.to_parquet("data/data_clean.pqt")