# Chapter 20 - Project: Reading and cleaning a QuickBooks general ledger

In [2]:
import numpy as np
import pandas as pd

In [3]:
# Read the workbook with pandas' default settings to inspect the raw export.
# The report title rows end up above the real header row (see the output below).
ledger_df = pd.read_excel('data/QuickBooks GL.xlsx')
ledger_df

Unnamed: 0,Carl's Design and Landscaping Services,Unnamed: 1,Unnamed: 2,Unnamed: 3,Unnamed: 4,Unnamed: 5,Unnamed: 6,Unnamed: 7,Unnamed: 8,Unnamed: 9
0,General Ledger,,,,,,,,,
1,All Dates,,,,,,,,,
2,,,,,,,,,,
3,Acct,SubAcct,Date,Transaction Type,Num,Name,Memo/Description,Debit,Credit,Balance
4,Checking,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...
443,,,12/28/2020,Bill,,Brosnahan Insurance Agency,,2000,,2916
444,Total for Miscellaneous,,,,,,,2916,,
445,Not Specified,,,,,,,,,
446,,,12/18/2020,Payment,,Amy's Bird Sanctuary,Created by QB Online to link credits to charges.,0,,0


In [4]:
# Skip the first four rows of the sheet (company name, report title, date range
# and a blank line) so that the 'Acct', 'SubAcct', 'Date', ... row becomes the header.
ledger_df = pd.read_excel('data/QuickBooks GL.xlsx', skiprows=4)
ledger_df

Unnamed: 0,Acct,SubAcct,Date,Transaction Type,Num,Name,Memo/Description,Debit,Credit,Balance
0,Checking,,,,,,,,,
1,,,08/21/2020,Deposit,,,Opening Balance,5000,,5000.0
2,,,10/02/2020,Bill Payment (Check),10,Robertson & Associates,,,300,4700.0
3,,,10/10/2020,Payment,1053,Bill's Windsurf Shop,,175,,4875.0
4,,,10/24/2020,Expense,12,Robertson & Associates,,,250,4625.0
...,...,...,...,...,...,...,...,...,...,...
439,,,12/28/2020,Bill,,Brosnahan Insurance Agency,,2000,,2916.0
440,Total for Miscellaneous,,,,,,,2916,,
441,Not Specified,,,,,,,,,
442,,,12/18/2020,Payment,,Amy's Bird Sanctuary,Created by QB Online to link credits to charges.,0,,0.0


In [5]:
# Clean-up in one chain:
#   1. remove columns that are entirely NaN,
#   2. remove rows that are entirely NaN,
#   3. give the two account columns their full names.
ledger_df = (
    ledger_df
    .dropna(axis='columns', how='all')
    .dropna(axis='rows', how='all')
    .rename(columns={'Acct': 'Account', 'SubAcct': 'SubAccount'})
)

ledger_df

Unnamed: 0,Account,SubAccount,Date,Transaction Type,Num,Name,Memo/Description,Debit,Credit,Balance
0,Checking,,,,,,,,,
1,,,08/21/2020,Deposit,,,Opening Balance,5000,,5000.0
2,,,10/02/2020,Bill Payment (Check),10,Robertson & Associates,,,300,4700.0
3,,,10/10/2020,Payment,1053,Bill's Windsurf Shop,,175,,4875.0
4,,,10/24/2020,Expense,12,Robertson & Associates,,,250,4625.0
...,...,...,...,...,...,...,...,...,...,...
439,,,12/28/2020,Bill,,Brosnahan Insurance Agency,,2000,,2916.0
440,Total for Miscellaneous,,,,,,,2916,,
441,Not Specified,,,,,,,,,
442,,,12/18/2020,Payment,,Amy's Bird Sanctuary,Created by QB Online to link credits to charges.,0,,0.0


In [6]:
# Each account name appears only on its header row; propagate it downwards so
# every posting row carries the account it belongs to.
filled_accounts = ledger_df['Account'].ffill()
ledger_df = ledger_df.assign(Account=filled_accounts)
ledger_df

Unnamed: 0,Account,SubAccount,Date,Transaction Type,Num,Name,Memo/Description,Debit,Credit,Balance
0,Checking,,,,,,,,,
1,Checking,,08/21/2020,Deposit,,,Opening Balance,5000,,5000.0
2,Checking,,10/02/2020,Bill Payment (Check),10,Robertson & Associates,,,300,4700.0
3,Checking,,10/10/2020,Payment,1053,Bill's Windsurf Shop,,175,,4875.0
4,Checking,,10/24/2020,Expense,12,Robertson & Associates,,,250,4625.0
...,...,...,...,...,...,...,...,...,...,...
439,Miscellaneous,,12/28/2020,Bill,,Brosnahan Insurance Agency,,2000,,2916.0
440,Total for Miscellaneous,,,,,,,2916,,
441,Not Specified,,,,,,,,,
442,Not Specified,,12/18/2020,Payment,,Amy's Bird Sanctuary,Created by QB Online to link credits to charges.,0,,0.0


In [7]:
# Count rows per 'Account' label; the 'Total for ...' subtotal labels still show up here.
ledger_df['Account'].value_counts()

Account
Landscaping Services                 61
Accounts Receivable (A/R)            49
Checking                             45
Accounts Payable (A/P)               26
Inventory Asset                      21
                                     ..
Total for Checking                    1
Total for Sales of Product Income     1
Total for Services                    1
Total for Cost of Goods Sold          1
Total for Not Specified               1
Name: count, Length: 66, dtype: int64

In [8]:
# Count rows per 'SubAccount' label; 'Total for ...' subtotal labels appear here as well.
ledger_df['SubAccount'].value_counts()

SubAccount
Total for Job Materials                    2
Job Materials                              2
Plants and Soil                            2
Total for Plants and Soil                  2
Sprinklers and Drip Systems                2
Total for Sprinklers and Drip Systems      2
Total for Maintenance and Repair           2
Total for Lawyer                           1
Accounting                                 1
Total for Accounting                       1
Bookkeeper                                 1
Total for Bookkeeper                       1
Lawyer                                     1
Original Cost                              1
Equipment Repairs                          1
Total for Equipment Repairs                1
Total for Decks and Patios                 1
Gas and Electric                           1
Total for Gas and Electric                 1
Telephone                                  1
Total for Legal & Professional Fees        1
Fuel                                       1

The function below first checks that name is not NaN (using pandas’s
notna helper function), then checks whether name contains the word
'Total'. It returns the combined result of these checks as a boolean
(i.e., True if both checks are True, or False if either of them
is False). The first check isn’t strictly necessary, but it prevents
the function from triggering an error if it gets a NaN value as its
input.

In [10]:
def is_subtotal(name):
    """Return True if ``name`` is a subtotal label, i.e. it contains 'Total'.

    Parameters
    ----------
    name : scalar
        One cell value from the 'Account' or 'SubAccount' column. It may be
        a string, NaN/None (empty cell) or another scalar such as a number.

    Returns
    -------
    bool
        False for missing values; otherwise whether the text of ``name``
        contains the substring 'Total'.
    """
    # The notna check short-circuits for empty cells. str() keeps the membership
    # test from raising TypeError when a cell holds a number instead of text.
    return pd.notna(name) and 'Total' in str(name)

In [11]:
# Test 1: an ordinary account name does not contain 'Total', so this should be False
is_subtotal('Accounts Receivable (A/R)')

False

In [12]:
# Test 2: a 'Total for ...' label contains 'Total', so this should be True
is_subtotal('Total for Notes Payable')

True

In [13]:
# Distinct account labels, in order of first appearance, minus the subtotal labels.
valid_accounts = [
    account
    for account in ledger_df['Account'].unique()
    if not is_subtotal(account)
]
valid_accounts

['Checking',
 'Savings',
 'Accounts Receivable (A/R)',
 'Inventory Asset',
 'Undeposited Funds',
 'Truck',
 'Accounts Payable (A/P)',
 'Mastercard',
 'Arizona Dept. of Revenue Payable',
 'Board of Equalization Payable',
 'Loan Payable',
 'Notes Payable',
 'Opening Balance Equity',
 'Design income',
 'Discounts given',
 'Landscaping Services',
 'Pest Control Services',
 'Sales of Product Income',
 'Services',
 'Cost of Goods Sold',
 'Advertising',
 'Automobile',
 'Equipment Rental',
 'Insurance',
 'Job Expenses',
 'Legal & Professional Fees',
 'Maintenance and Repair',
 'Meals and Entertainment',
 'Office Expenses',
 'Rent or Lease',
 'Utilities',
 'Miscellaneous',
 'Not Specified']

In [14]:
# Another way: the same selection written as an explicit loop.

valid_accounts = []

for name in ledger_df['Account'].unique():
    if is_subtotal(name):
        continue  # skip 'Total for ...' labels
    valid_accounts.append(name)

valid_accounts

['Checking',
 'Savings',
 'Accounts Receivable (A/R)',
 'Inventory Asset',
 'Undeposited Funds',
 'Truck',
 'Accounts Payable (A/P)',
 'Mastercard',
 'Arizona Dept. of Revenue Payable',
 'Board of Equalization Payable',
 'Loan Payable',
 'Notes Payable',
 'Opening Balance Equity',
 'Design income',
 'Discounts given',
 'Landscaping Services',
 'Pest Control Services',
 'Sales of Product Income',
 'Services',
 'Cost of Goods Sold',
 'Advertising',
 'Automobile',
 'Equipment Rental',
 'Insurance',
 'Job Expenses',
 'Legal & Professional Fees',
 'Maintenance and Repair',
 'Meals and Entertainment',
 'Office Expenses',
 'Rent or Lease',
 'Utilities',
 'Miscellaneous',
 'Not Specified']

In [15]:
# Keep only the rows whose 'Account' label is one of the non-subtotal accounts.
is_valid_account = ledger_df['Account'].isin(valid_accounts)
ledger_df = ledger_df.loc[is_valid_account]
ledger_df

Unnamed: 0,Account,SubAccount,Date,Transaction Type,Num,Name,Memo/Description,Debit,Credit,Balance
0,Checking,,,,,,,,,
1,Checking,,08/21/2020,Deposit,,,Opening Balance,5000,,5000.0
2,Checking,,10/02/2020,Bill Payment (Check),10,Robertson & Associates,,,300,4700.0
3,Checking,,10/10/2020,Payment,1053,Bill's Windsurf Shop,,175,,4875.0
4,Checking,,10/24/2020,Expense,12,Robertson & Associates,,,250,4625.0
...,...,...,...,...,...,...,...,...,...,...
437,Miscellaneous,,11/28/2020,Bill,,Hicks Hardware,,250,,250.0
438,Miscellaneous,,12/20/2020,Bill,,Tim Philip Masonry,,666,,916.0
439,Miscellaneous,,12/28/2020,Bill,,Brosnahan Insurance Agency,,2000,,2916.0
441,Not Specified,,,,,,,,,


In [16]:
def is_subtotal(name):
    """Return True if ``name`` is a subtotal label, i.e. it contains 'Total'.

    Missing values (NaN/None) give False. Non-string scalars are compared via
    their text form, so a numeric cell no longer raises TypeError.
    """
    return pd.notna(name) and 'Total' in str(name)


# Distinct labels of each column with the 'Total for ...' labels removed.
# NaN is not a subtotal, so it stays in both lists.
valid_accounts = [name for name in ledger_df['Account'].unique()
    if not is_subtotal(name)]

valid_subaccounts = [name for name in ledger_df['SubAccount'].unique()
    if not is_subtotal(name)]

# Drop the subtotal rows, first at account level and then at sub-account level.
ledger_df = ledger_df[ledger_df['Account'].isin(valid_accounts)]
ledger_df = ledger_df[ledger_df['SubAccount'].isin(valid_subaccounts)]

ledger_df

Unnamed: 0,Account,SubAccount,Date,Transaction Type,Num,Name,Memo/Description,Debit,Credit,Balance
0,Checking,,,,,,,,,
1,Checking,,08/21/2020,Deposit,,,Opening Balance,5000,,5000.0
2,Checking,,10/02/2020,Bill Payment (Check),10,Robertson & Associates,,,300,4700.0
3,Checking,,10/10/2020,Payment,1053,Bill's Windsurf Shop,,175,,4875.0
4,Checking,,10/24/2020,Expense,12,Robertson & Associates,,,250,4625.0
...,...,...,...,...,...,...,...,...,...,...
437,Miscellaneous,,11/28/2020,Bill,,Hicks Hardware,,250,,250.0
438,Miscellaneous,,12/20/2020,Bill,,Tim Philip Masonry,,666,,916.0
439,Miscellaneous,,12/28/2020,Bill,,Brosnahan Insurance Agency,,2000,,2916.0
441,Not Specified,,,,,,,,,


In [17]:
# Re-check the per-account row counts now that the subtotal rows have been filtered out.
ledger_df['Account'].value_counts()

Account
Landscaping Services                53
Accounts Receivable (A/R)           49
Checking                            45
Accounts Payable (A/P)              26
Inventory Asset                     21
Undeposited Funds                   21
Mastercard                          18
Board of Equalization Payable       17
Job Expenses                        15
Opening Balance Equity              14
Automobile                          13
Legal & Professional Fees           11
Sales of Product Income              9
Pest Control Services                9
Cost of Goods Sold                   7
Utilities                            7
Design income                        7
Discounts given                      4
Meals and Entertainment              4
Miscellaneous                        4
Arizona Dept. of Revenue Payable     4
Maintenance and Repair               4
Savings                              3
Services                             3
Truck                                3
Rent or Lease    

In [18]:
# Look at a single account that has sub-accounts.
ledger_df.loc[ledger_df['Account'].eq('Utilities')]

Unnamed: 0,Account,SubAccount,Date,Transaction Type,Num,Name,Memo/Description,Debit,Credit,Balance
426,Utilities,,,,,,,,,
427,Utilities,Gas and Electric,,,,,,,,
428,Utilities,,11/20/2020,Bill,,PG&E,,86.44,,86.44
429,Utilities,,12/19/2020,Bill,,PG&E,,114.09,,200.53
431,Utilities,Telephone,,,,,,,,
432,Utilities,,11/19/2020,Bill,,Cal Telephone,,56.5,,56.5
433,Utilities,,12/19/2020,Bill,,Cal Telephone,Monthly Phone Bill,74.36,,130.86


In [19]:
# Take the rows of two accounts as a small working example: 'Utilities' has
# sub-accounts, 'Miscellaneous' has none.
# .copy() makes example_df an independent frame. The following cells assign
# into it with .loc, and doing that on a boolean-indexed slice of ledger_df
# can raise SettingWithCopyWarning.
example_df = ledger_df[ledger_df['Account'].isin(['Utilities', 'Miscellaneous'])].copy()
example_df

Unnamed: 0,Account,SubAccount,Date,Transaction Type,Num,Name,Memo/Description,Debit,Credit,Balance
426,Utilities,,,,,,,,,
427,Utilities,Gas and Electric,,,,,,,,
428,Utilities,,11/20/2020,Bill,,PG&E,,86.44,,86.44
429,Utilities,,12/19/2020,Bill,,PG&E,,114.09,,200.53
431,Utilities,Telephone,,,,,,,,
432,Utilities,,11/19/2020,Bill,,Cal Telephone,,56.5,,56.5
433,Utilities,,12/19/2020,Bill,,Cal Telephone,Monthly Phone Bill,74.36,,130.86
436,Miscellaneous,,,,,,,,,
437,Miscellaneous,,11/28/2020,Bill,,Hicks Hardware,,250.0,,250.0
438,Miscellaneous,,12/20/2020,Bill,,Tim Philip Masonry,,666.0,,916.0


In [20]:
# Forward fill missing (NaN) values in the 'SubAccount' column with the previous non-null value.
# The fill looks at 'SubAccount' only, so the last sub-account seen is also carried
# into the rows of the next account (see the 'Miscellaneous' rows in the output).
example_df.loc[:, 'SubAccount'] = example_df['SubAccount'].ffill()
example_df

Unnamed: 0,Account,SubAccount,Date,Transaction Type,Num,Name,Memo/Description,Debit,Credit,Balance
426,Utilities,,,,,,,,,
427,Utilities,Gas and Electric,,,,,,,,
428,Utilities,Gas and Electric,11/20/2020,Bill,,PG&E,,86.44,,86.44
429,Utilities,Gas and Electric,12/19/2020,Bill,,PG&E,,114.09,,200.53
431,Utilities,Telephone,,,,,,,,
432,Utilities,Telephone,11/19/2020,Bill,,Cal Telephone,,56.5,,56.5
433,Utilities,Telephone,12/19/2020,Bill,,Cal Telephone,Monthly Phone Bill,74.36,,130.86
436,Miscellaneous,Telephone,,,,,,,,
437,Miscellaneous,Telephone,11/28/2020,Bill,,Hicks Hardware,,250.0,,250.0
438,Miscellaneous,Telephone,12/20/2020,Bill,,Tim Philip Masonry,,666.0,,916.0


In [21]:
# Backward fill: replace the remaining missing (NaN) values in the 'SubAccount'
# column with the next non-null value below them
example_df.loc[:, 'SubAccount'] = example_df['SubAccount'].bfill()
example_df

Unnamed: 0,Account,SubAccount,Date,Transaction Type,Num,Name,Memo/Description,Debit,Credit,Balance
426,Utilities,Gas and Electric,,,,,,,,
427,Utilities,Gas and Electric,,,,,,,,
428,Utilities,Gas and Electric,11/20/2020,Bill,,PG&E,,86.44,,86.44
429,Utilities,Gas and Electric,12/19/2020,Bill,,PG&E,,114.09,,200.53
431,Utilities,Telephone,,,,,,,,
432,Utilities,Telephone,11/19/2020,Bill,,Cal Telephone,,56.5,,56.5
433,Utilities,Telephone,12/19/2020,Bill,,Cal Telephone,Monthly Phone Bill,74.36,,130.86
436,Miscellaneous,Telephone,,,,,,,,
437,Miscellaneous,Telephone,11/28/2020,Bill,,Hicks Hardware,,250.0,,250.0
438,Miscellaneous,Telephone,12/20/2020,Bill,,Tim Philip Masonry,,666.0,,916.0


Notice the 'SubAccount' value in the last three rows is 'Telephone',
which isn’t a valid miscellaneous sub-account. To fill in missing
values in the 'SubAccount' column correctly, you need to call ffill on
each account separately. We can do that with a for loop and the
loc slicing operator:

In [23]:
# Undo the spill-over from the fill above: 'Telephone' is not a sub-account of
# 'Miscellaneous', so set 'SubAccount' back to NaN on those rows.
# (numpy is imported as np in the imports cell at the top of the notebook.)
spilled_rows = (
    (example_df['Account'] == 'Miscellaneous')
    & (example_df['SubAccount'] == 'Telephone')
)
example_df.loc[spilled_rows, 'SubAccount'] = np.nan

example_df

Unnamed: 0,Account,SubAccount,Date,Transaction Type,Num,Name,Memo/Description,Debit,Credit,Balance
426,Utilities,Gas and Electric,,,,,,,,
427,Utilities,Gas and Electric,,,,,,,,
428,Utilities,Gas and Electric,11/20/2020,Bill,,PG&E,,86.44,,86.44
429,Utilities,Gas and Electric,12/19/2020,Bill,,PG&E,,114.09,,200.53
431,Utilities,Telephone,,,,,,,,
432,Utilities,Telephone,11/19/2020,Bill,,Cal Telephone,,56.5,,56.5
433,Utilities,Telephone,12/19/2020,Bill,,Cal Telephone,Monthly Phone Bill,74.36,,130.86
436,Miscellaneous,,,,,,,,,
437,Miscellaneous,,11/28/2020,Bill,,Hicks Hardware,,250.0,,250.0
438,Miscellaneous,,12/20/2020,Bill,,Tim Philip Masonry,,666.0,,916.0


In [24]:
# Everything after 'Account' and 'SubAccount' holds the posting details.
# A row with no posting details at all is just a header line, so remove it.
detail_columns = example_df.columns[2:]
example_df = example_df.dropna(how='all', subset=detail_columns)

example_df

Unnamed: 0,Account,SubAccount,Date,Transaction Type,Num,Name,Memo/Description,Debit,Credit,Balance
428,Utilities,Gas and Electric,11/20/2020,Bill,,PG&E,,86.44,,86.44
429,Utilities,Gas and Electric,12/19/2020,Bill,,PG&E,,114.09,,200.53
432,Utilities,Telephone,11/19/2020,Bill,,Cal Telephone,,56.5,,56.5
433,Utilities,Telephone,12/19/2020,Bill,,Cal Telephone,Monthly Phone Bill,74.36,,130.86
437,Miscellaneous,,11/28/2020,Bill,,Hicks Hardware,,250.0,,250.0
438,Miscellaneous,,12/20/2020,Bill,,Tim Philip Masonry,,666.0,,916.0
439,Miscellaneous,,12/28/2020,Bill,,Brosnahan Insurance Agency,,2000.0,,2916.0


After all this scrubbing, what can you do with `ledger_df`? For
instance, you can easily slice it to get postings in various accounts
or sub-accounts:

In [26]:
# Display the current state of the ledger table.
ledger_df

Unnamed: 0,Account,SubAccount,Date,Transaction Type,Num,Name,Memo/Description,Debit,Credit,Balance
0,Checking,,,,,,,,,
1,Checking,,08/21/2020,Deposit,,,Opening Balance,5000,,5000.0
2,Checking,,10/02/2020,Bill Payment (Check),10,Robertson & Associates,,,300,4700.0
3,Checking,,10/10/2020,Payment,1053,Bill's Windsurf Shop,,175,,4875.0
4,Checking,,10/24/2020,Expense,12,Robertson & Associates,,,250,4625.0
...,...,...,...,...,...,...,...,...,...,...
437,Miscellaneous,,11/28/2020,Bill,,Hicks Hardware,,250,,250.0
438,Miscellaneous,,12/20/2020,Bill,,Tim Philip Masonry,,666,,916.0
439,Miscellaneous,,12/28/2020,Bill,,Brosnahan Insurance Agency,,2000,,2916.0
441,Not Specified,,,,,,,,,


In [27]:
# Select the rows of one sub-account within one account.
in_landscaping = ledger_df['Account'] == 'Landscaping Services'
in_fountains = ledger_df['SubAccount'] == 'Fountains and Garden Lighting'
ledger_df[in_landscaping & in_fountains]

Unnamed: 0,Account,SubAccount,Date,Transaction Type,Num,Name,Memo/Description,Debit,Credit,Balance
273,Landscaping Services,Fountains and Garden Lighting,,,,,,,,


Or re-sort the table by any of its columns:

In [29]:
# Sort by 'Credit' from largest to smallest and show the first ten rows.
ledger_df.sort_values('Credit', ascending=False).head(10)

Unnamed: 0,Account,SubAccount,Date,Transaction Type,Num,Name,Memo/Description,Debit,Credit,Balance
222,Notes Payable,,01/01/2021,Journal Entry,,,Opening Balance,,25000.0,25000.0
226,Opening Balance Equity,,12/19/2020,Journal Entry,,,Opening Balance,,13495.0,18495.0
225,Opening Balance Equity,,08/21/2020,Deposit,,,,,5000.0,5000.0
219,Loan Payable,,01/01/2021,Journal Entry,,,Opening Balance,,4000.0,4000.0
162,Accounts Payable (A/P),,12/28/2020,Bill,,Brosnahan Insurance Agency,Opening Balance,,2000.0,4213.62
28,Checking,,01/01/2021,Bill Payment (Check),1.0,Brosnahan Insurance Agency,,,2000.0,1852.51
293,Landscaping Services,,12/23/2020,Invoice,1004.0,Cool Cars,Sod,,1750.0,2006.89
97,Accounts Receivable (A/R),,01/04/2021,Payment,,Cool Cars,,,1675.52,4967.24
157,Accounts Payable (A/P),,12/19/2020,Bill,,Hall Properties,,,900.0,1157.03
22,Checking,,12/28/2020,Bill Payment (Check),11.0,Hall Properties,,,900.0,3058.61


Or compute summary values for any account in the table:

In [31]:
# All rows that belong to the 'Utilities' account.
is_utilities = ledger_df['Account'] == 'Utilities'
ledger_df[is_utilities]

Unnamed: 0,Account,SubAccount,Date,Transaction Type,Num,Name,Memo/Description,Debit,Credit,Balance
426,Utilities,,,,,,,,,
427,Utilities,Gas and Electric,,,,,,,,
428,Utilities,,11/20/2020,Bill,,PG&E,,86.44,,86.44
429,Utilities,,12/19/2020,Bill,,PG&E,,114.09,,200.53
431,Utilities,Telephone,,,,,,,,
432,Utilities,,11/19/2020,Bill,,Cal Telephone,,56.5,,56.5
433,Utilities,,12/19/2020,Bill,,Cal Telephone,Monthly Phone Bill,74.36,,130.86


In [32]:
# 'Debit' values of the 'Utilities' rows, selected in a single .loc step.
ledger_df.loc[ledger_df['Account'] == 'Utilities', 'Debit']

426       NaN
427       NaN
428     86.44
429    114.09
431       NaN
432      56.5
433     74.36
Name: Debit, dtype: object

In [33]:
# Total debits posted to 'Utilities' (NaN entries are ignored by sum()).
ledger_df.loc[ledger_df['Account'] == 'Utilities', 'Debit'].sum()

331.39

We can also save the scrubbed GL back to an Excel file that is much easier to work with by running:

In [35]:
# Write the table to a new workbook; index=False leaves the DataFrame index out of the file.
ledger_df.to_excel('data/Clean QuickBooks GL.xlsx', index=False)

In [36]:
# Show the first five rows of the table.
ledger_df.head()

Unnamed: 0,Account,SubAccount,Date,Transaction Type,Num,Name,Memo/Description,Debit,Credit,Balance
0,Checking,,,,,,,,,
1,Checking,,08/21/2020,Deposit,,,Opening Balance,5000.0,,5000.0
2,Checking,,10/02/2020,Bill Payment (Check),10.0,Robertson & Associates,,,300.0,4700.0
3,Checking,,10/10/2020,Payment,1053.0,Bill's Windsurf Shop,,175.0,,4875.0
4,Checking,,10/24/2020,Expense,12.0,Robertson & Associates,,,250.0,4625.0


In [37]:
def is_subtotal(name):
    """Return True if ``name`` is a subtotal label, i.e. it contains 'Total'.

    Missing values (NaN/None) give False. Non-string scalars are compared via
    their text form, so a numeric cell does not raise TypeError.
    """
    return pd.notna(name) and 'Total' in str(name)


# --- Load -------------------------------------------------------------------
# Skip the four report-title rows so the 'Acct', 'SubAcct', ... row becomes the header.
ledger_df = pd.read_excel('data/QuickBooks GL.xlsx', skiprows=4)

# --- Basic clean-up ---------------------------------------------------------
ledger_df = ledger_df.dropna(how='all', axis='columns')                             # Drop columns that are entirely NaN
ledger_df = ledger_df.dropna(how='all', axis='rows')                                # Drop rows that are entirely NaN
ledger_df = ledger_df.rename(columns={'Acct': 'Account', 'SubAcct': 'SubAccount'})  # Rename columns for clarity
ledger_df['Account'] = ledger_df['Account'].ffill()                                 # Carry each account name down to its rows

# --- Remove subtotal rows ---------------------------------------------------
# NaN is not a subtotal, so it stays in both lists and rows without a
# sub-account survive the second filter.
valid_accounts = [name for name in ledger_df['Account'].unique()
    if not is_subtotal(name)]
valid_subaccounts = [name for name in ledger_df['SubAccount'].unique()
    if not is_subtotal(name)]

ledger_df = ledger_df[ledger_df['Account'].isin(valid_accounts)]
# .copy() so the column assignment below works on an independent frame, not a slice.
ledger_df = ledger_df[ledger_df['SubAccount'].isin(valid_subaccounts)].copy()

# --- Fill sub-accounts, one account at a time -------------------------------
# Do NOT convert 'SubAccount' with astype(str) first: that turns every NaN into
# the literal string 'nan', leaving nothing for ffill() to fill.
# Grouping by 'Account' keeps a sub-account from spilling into the next account.
ledger_df['SubAccount'] = ledger_df.groupby('Account')['SubAccount'].ffill()

# Let pandas pick better dtypes for the object columns.
ledger_df = ledger_df.infer_objects()

# --- Remove header-only rows ------------------------------------------------
# Rows with nothing from the 3rd column onward carry only an account or
# sub-account name and no posting.
ledger_df = ledger_df.dropna(subset=ledger_df.columns[2:], how='all')

ledger_df.head()

Unnamed: 0,Account,SubAccount,Date,Transaction Type,Num,Name,Memo/Description,Debit,Credit,Balance
1,Checking,,08/21/2020,Deposit,,,Opening Balance,5000.0,,5000.0
2,Checking,,10/02/2020,Bill Payment (Check),10.0,Robertson & Associates,,,300.0,4700.0
3,Checking,,10/10/2020,Payment,1053.0,Bill's Windsurf Shop,,175.0,,4875.0
4,Checking,,10/24/2020,Expense,12.0,Robertson & Associates,,,250.0,4625.0
5,Checking,,11/15/2020,Check,4.0,Chin's Gas and Oil,,,54.55,4570.45
