In [17]:
import pandas as pd
import numpy as np

import sys, pathlib, fitz

from dateutil import parser

### Data Import

In [20]:
mint_transaction_history = pd.read_csv('sourceData/mint_2019_2024.csv')

mint_transaction_history.head(10)

mint_transaction_history['Account Name'].unique()

mint_account_aliases = {
    'TD BEYOND CHECKING': 'Checking XXXX',
    'HYSA': 'American Express Savings',
    'Individual ...959': 'Charles Schwab Individual Investment Account',
    'CREDITCARD Account': 'Capital One Credit Card'
}

mint_transaction_history['Account Name'].replace(mint_account_aliases, inplace=True)

mint_transaction_history

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  mint_transaction_history['Account Name'].replace(mint_account_aliases, inplace=True)


Unnamed: 0,Date,Description,Original Description,Amount,Transaction Type,Category,Account Name,Labels,Notes
0,12/30/2023,LinkedIn,LINKEDIN PRE 975246241,42.49,debit,Career,Apple Card,,
1,12/29/2023,VERIS INSIGHTS L PAYROLL,VERIS INSIGHTS L PAYROLL,2315.42,credit,Paycheck,Checking XXXX,,
2,12/29/2023,Blue Bottle Coffee,SQ *BLUE BOTTLE COFF,11.24,debit,Coffee Shops,Checking XXXX,,
3,12/29/2023,VERIS INSIGHTS L PAYROLL,VERIS INSIGHTS L PAYROLL,408.60,credit,Paycheck,American Express Savings,,
4,12/29/2023,MBTA KENMORE,MBTA KENMORE,20.00,debit,Public Transportation,Apple Card,,
...,...,...,...,...,...,...,...,...,...
7346,8/10/2018,Starbucks,469216 STARBUCKS STORE 00833469216 STA...,3.37,debit,Coffee Shops,Checking XXXX,,
7347,8/10/2018,Sweetgreen Seaport Squ,469216 SWEETGREEN SEAPORT SQ469216 SWE...,13.32,debit,Restaurants,Checking XXXX,,
7348,8/09/2018,Starbucks,469216 STARBUCKS STORE 00875469216 STA...,3.47,debit,Coffee Shops,Checking XXXX,,
7349,8/08/2018,Starbucks,469216 STARBUCKS STORE 00875469216 STA...,2.94,debit,Coffee Shops,Checking XXXX,,


### CSV (Mint Transaction History) Handling

In [23]:
mint_transaction_history['Date'] = pd.to_datetime(mint_transaction_history['Date'], format='mixed', dayfirst=False, yearfirst=False)

Unnamed: 0,Date,Description,Original Description,Amount,Transaction Type,Category,Account Name,Labels,Notes
0,2023-12-30,LinkedIn,LINKEDIN PRE 975246241,42.49,debit,Career,Apple Card,,
1,2023-12-29,VERIS INSIGHTS L PAYROLL,VERIS INSIGHTS L PAYROLL,2315.42,credit,Paycheck,Checking XXXX,,
2,2023-12-29,Blue Bottle Coffee,SQ *BLUE BOTTLE COFF,11.24,debit,Coffee Shops,Checking XXXX,,
3,2023-12-29,VERIS INSIGHTS L PAYROLL,VERIS INSIGHTS L PAYROLL,408.60,credit,Paycheck,American Express Savings,,
4,2023-12-29,MBTA KENMORE,MBTA KENMORE,20.00,debit,Public Transportation,Apple Card,,
...,...,...,...,...,...,...,...,...,...
7346,2018-08-10,Starbucks,469216 STARBUCKS STORE 00833469216 STA...,3.37,debit,Coffee Shops,Checking XXXX,,
7347,2018-08-10,Sweetgreen Seaport Squ,469216 SWEETGREEN SEAPORT SQ469216 SWE...,13.32,debit,Restaurants,Checking XXXX,,
7348,2018-08-09,Starbucks,469216 STARBUCKS STORE 00875469216 STA...,3.47,debit,Coffee Shops,Checking XXXX,,
7349,2018-08-08,Starbucks,469216 STARBUCKS STORE 00875469216 STA...,2.94,debit,Coffee Shops,Checking XXXX,,


### PDF (Original Balance) Handling

In [7]:
file_name = 'sourceData/OriginalBalance_PDF_8_2018.pdf'

with fitz.open(file_name) as orig_pdf:

    full_text = chr(12).join([page.get_text() for page in orig_pdf])

pathlib.Path('workingFiles/extractedText.txt').write_bytes(full_text.encode())

13245

In [19]:
full_doc = fitz.open('sourceData/OriginalBalance_PDF_8_2018.pdf')
cover_page = full_doc[0]
words = cover_page.get_text("words", sort=True)

for i, word_obj in enumerate(words):

    text = word_obj[4]

    if text == 'Ending':

        if words[i + 1][4] == 'Balance':

            end_bal = words[i + 2][4]

            print(text)
            print(words[i + 1][4])
            print(words[i + 2][4])

    if text == 'Statement':

        if words[i + 1][4] == 'Period:':

            end_month = words[i + 4][4].split('-')[1]
            end_day = words[i + 5][4]
            end_year = words[i + 6][4]

            full_statement_end_str = end_month + "-" + end_day + "-" + end_year

            print(text)
            print(words[i + 1][4])
            print(f"{end_month} {end_day} {end_year}")


full_statement_date = parser.parse(full_statement_end_str)

Statement
Period:
Aug 07 2018
Ending
Balance
1,867.51


datetime.datetime(2018, 8, 7, 0, 0)