# Import Important Libraries

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import re

In [2]:
# Relative path to the project's data directory. Raw parquet inputs are read
# from `{DATA_PATH}/raw` and the cleaned frame is written to `{DATA_PATH}/processed`.
DATA_PATH = '../../data'

# Import Raw Data

In [3]:
# Load the two halves of the raw outflow-transaction data.
# NOTE: the "Transacation" spelling is part of the file names this notebook
# reads; it is a runtime path, so it must not be "corrected" here.
raw_df_1 = pd.read_parquet(f'{DATA_PATH}/raw/Transacation_outflows_with_date_3k_firsthalf.pqt')
raw_df_2 = pd.read_parquet(f'{DATA_PATH}/raw/Transacation_outflows_with_date_3k_secondhalf.pqt')

# Stack both halves; the original index labels are preserved (no ignore_index).
df = pd.concat([raw_df_1, raw_df_2])
# Drop rows whose memo is identical to the value in the `category` column.
# `.copy()` detaches the filtered result from the concatenated frame so that
# later column assignments on `df` do not raise SettingWithCopyWarning.
df = df[df["memo"] != df["category"]].copy()
df.head()

Unnamed: 0,prism_consumer_id,prism_account_id,memo,amount,posted_date,category
2,0,acc_0,TST* Casa Del Rio - Exp Fairlawn OH 09/24,18.42,2022-09-26,FOOD_AND_BEVERAGES
4,0,acc_0,Buffalo Wild Wings,26.47,2022-09-12,FOOD_AND_BEVERAGES
6,0,acc_0,Oculus CA 04/16,11.73,2022-04-18,GENERAL_MERCHANDISE
7,0,acc_0,LOS GIRASOLES STOW OH 03/08,30.04,2022-03-09,FOOD_AND_BEVERAGES
8,0,acc_0,BUZZIS LAUNDRY 1 OH 03/28,4.16,2022-03-29,GENERAL_MERCHANDISE


In [4]:
# (rows, columns) of the combined frame after rows with memo == category were dropped.
df.shape

(1306452, 6)

# Memo Cleaning

In [5]:
# Upper-case every memo so the regex cleaners below only have to deal with one
# letter case. `assign` rebinds `df` to a new frame instead of writing a column
# into the boolean-filtered frame in place, which avoids SettingWithCopyWarning
# and keeps this cell safe to re-run.
df = df.assign(memo=df["memo"].str.upper())
df.head()

Unnamed: 0,prism_consumer_id,prism_account_id,memo,amount,posted_date,category
2,0,acc_0,TST* CASA DEL RIO - EXP FAIRLAWN OH 09/24,18.42,2022-09-26,FOOD_AND_BEVERAGES
4,0,acc_0,BUFFALO WILD WINGS,26.47,2022-09-12,FOOD_AND_BEVERAGES
6,0,acc_0,OCULUS CA 04/16,11.73,2022-04-18,GENERAL_MERCHANDISE
7,0,acc_0,LOS GIRASOLES STOW OH 03/08,30.04,2022-03-09,FOOD_AND_BEVERAGES
8,0,acc_0,BUZZIS LAUNDRY 1 OH 03/28,4.16,2022-03-29,GENERAL_MERCHANDISE


In [6]:
def get_text(text):
    """Extract the word tokens of a memo and join them with single spaces.

    A token may not *start* on a non-word character, a digit, or the letter
    ``X``; scanning simply moves on to the next character in those cases.
    As a result purely numeric / masked runs such as ``XXXX1234`` vanish and
    a leading ``X`` is trimmed from a word (``XBOX`` -> ``BOX``).  Two word
    runs joined by a ``*`` (``PRIME*TI4``) are kept together as one token.

    Parameters
    ----------
    text : str
        Memo string to clean.

    Returns
    -------
    str
        The extracted tokens separated by single spaces (empty string when
        nothing matches).
    """
    token_pattern = r"(?!\W|X|\d)(\w+\*\w+|\w+)"
    tokens = re.findall(token_pattern, text)
    return " ".join(tokens)
# Run get_text over every memo. The result is only displayed in this cell;
# the cleaned frame built later in the notebook uses alt_upper_memos instead.
upper_memos = df["memo"].apply(get_text)
upper_memos

2                           TST CASA DEL RIO EXP FAIRLAWN OH
4                                         BUFFALO WILD WINGS
6                                                  OCULUS CA
7                                      LOS GIRASOLES STOW OH
8                                          BUZZIS LAUNDRY OH
                                 ...                        
5195447    DEBIT CARD WITHDRAWAL PURCHASEAMAZON PRIME*TI4...
5195452    POS WITHDRAWALAZ LOT QUIKTRIP E INDIAN SCHOOL ...
5195455    POS WITHDRAWALWAL MART E MCKELLIPS RD MESA AZ ...
5195458    WITHDRAWAL SALT RIVER PROJETYPE ONLINE PMT CO ...
5195466     POS WITHDRAWALFRYS FOOD DRG S E MESA AZ CARD MCC
Name: memo, Length: 1306452, dtype: object

In [7]:
def get_text_alt(text):
    """Normalise a transaction memo into space-separated tokens.

    The cleaning runs in two passes:

    1. Delete every character that is neither a word character nor one of
       ``*``, ``.``, ``-`` or whitespace.  Characters are deleted rather than
       replaced by a space, so ``09/24`` becomes ``0924`` and
       ``.COM/BILL`` becomes ``.COMBILL``.  Newlines are deleted as well,
       because ``.`` does not match them.
    2. Collect word tokens, plus any ``*`` that is immediately followed by a
       word character.  A word token is discarded when it

       * starts with a digit,
       * starts with two or more ``X`` (masked numbers),
       * contains three or more consecutive ``X`` after at least one other
         character,
       * is exactly ``COM``, or starts with ``COM`` directly after a ``.``
         (domain suffixes such as ``AMAZON.COM``).

    Earlier the COM rule rejected *every* token beginning with ``COM``, which
    silently removed ordinary words such as ``COMCAST`` or ``COMPANY``; the
    rule is now limited to the domain-suffix cases listed above.

    Parameters
    ----------
    text : str
        Memo string to clean (expected to be upper-cased already, since the
        ``COM`` / ``X`` rules are case-sensitive).

    Returns
    -------
    str
        The surviving tokens joined by single spaces.
    """
    # Pass 1: strip punctuation other than '*', '.', '-' and whitespace.
    kept_chars = "".join(re.findall(r"(?!(?!\*|\.|\s|-)\W).", text))
    # Pass 2: tokenise, rejecting numeric, masked and ".COM" tokens.
    tokens = re.findall(
        r"\b(?!(?<=\.)COM|COM\b|X{2,}|\w+X{3,}|\d)\w+|\*\b",
        kept_chars,
    )
    return " ".join(tokens)
# Run get_text_alt over every memo; this Series is what later replaces the
# memo column in df_clean.
alt_upper_memos = df["memo"].apply(get_text_alt)
alt_upper_memos

2                           TST CASA DEL RIO EXP FAIRLAWN OH
4                                         BUFFALO WILD WINGS
6                                                  OCULUS CA
7                                      LOS GIRASOLES STOW OH
8                                          BUZZIS LAUNDRY OH
                                 ...                        
5195447    DEBIT CARD WITHDRAWAL PURCHASEAMAZON PRIME * T...
5195452    POS WITHDRAWALAZ LOT QUIKTRIP E INDIAN SCHOOL ...
5195455    POS WITHDRAWALWAL MART E MCKELLIPS RD MESA AZ ...
5195458    WITHDRAWAL SALT RIVER PROJETYPE ONLINE PMT COS...
5195466     POS WITHDRAWALFRYS FOOD DRG S E MESA AZ CARD MCC
Name: memo, Length: 1306452, dtype: object

In [8]:
# Derive the cleaned frame: a copy of `df` whose memo column holds the
# get_text_alt output. `df` itself is left untouched.
df_clean = df.assign(memo=alt_upper_memos)
df_clean.head()

Unnamed: 0,prism_consumer_id,prism_account_id,memo,amount,posted_date,category
2,0,acc_0,TST CASA DEL RIO EXP FAIRLAWN OH,18.42,2022-09-26,FOOD_AND_BEVERAGES
4,0,acc_0,BUFFALO WILD WINGS,26.47,2022-09-12,FOOD_AND_BEVERAGES
6,0,acc_0,OCULUS CA,11.73,2022-04-18,GENERAL_MERCHANDISE
7,0,acc_0,LOS GIRASOLES STOW OH,30.04,2022-03-09,FOOD_AND_BEVERAGES
8,0,acc_0,BUZZIS LAUNDRY OH,4.16,2022-03-29,GENERAL_MERCHANDISE


# Date Cleaning

In [14]:
# Convert posted_date to a DatetimeIndex once and reuse it for all three
# derived columns, instead of repeating the conversion per column.
posted_dates = pd.DatetimeIndex(df_clean['posted_date'])
df_clean['year'] = posted_dates.year
df_clean['month'] = posted_dates.month
df_clean['day'] = posted_dates.day

In [15]:
# Preview the cleaned frame with the new year / month / day columns.
df_clean.head()

Unnamed: 0,prism_consumer_id,prism_account_id,memo,amount,posted_date,category,year,month,day
2,0,acc_0,TST CASA DEL RIO EXP FAIRLAWN OH,18.42,2022-09-26,FOOD_AND_BEVERAGES,2022,9,26
4,0,acc_0,BUFFALO WILD WINGS,26.47,2022-09-12,FOOD_AND_BEVERAGES,2022,9,12
6,0,acc_0,OCULUS CA,11.73,2022-04-18,GENERAL_MERCHANDISE,2022,4,18
7,0,acc_0,LOS GIRASOLES STOW OH,30.04,2022-03-09,FOOD_AND_BEVERAGES,2022,3,9
8,0,acc_0,BUZZIS LAUNDRY OH,4.16,2022-03-29,GENERAL_MERCHANDISE,2022,3,29


In [16]:
# Write the cleaned frame (cleaned memos plus the year/month/day columns)
# to the processed-data folder as data_clean.pqt.
df_clean.to_parquet(f"{DATA_PATH}/processed/data_clean.pqt")