In [7]:
pip install langchain_community

Note: you may need to restart the kernel to use updated packages.


In [8]:
from langchain_community.llms import Ollama

In [9]:
llm = Ollama(model="llama2")

In [10]:
llm.invoke("Can you add an appropriate category next to each of the following expenses. Respond with a list of categories separated by commas. For example, Spotify AB by Adyen - \
Entertainment, Beta Boulders Ams Amsterdam Nld - Sports, etc.: \
Taxi Utrecht, Ministerie van Justitie en Veiligheid, Etos AMSTERDAM NLD, Bistro Bar Amsterdam")

'\nCertainly! Here are the categories for each of the expenses you provided:\n\n* Spotify AB by Adyen - Entertainment\n* Beta Boulders Ams Amsterdam Nld - Sports\n* Taxi Utrecht - Transportation\n* Ministerie van Justitie en Veiligheid - Government\n* Etos AMSTERDAM NLD - Retail\n* Bistro Bar Amsterdam - Food and Beverage\n\nIs there anything else I can help you with?'

In [11]:
pip install pandas

Note: you may need to restart the kernel to use updated packages.


In [12]:
# Read the transactions_2022_2023.csv file 
import pandas as pd
df = pd.read_csv("transactions_2023_2024.csv")
df.head()

Unnamed: 0,Date,Name / Description,Expense/Income,Amount (EUR)
0,2023-11-04,Salary,Income,4.5
1,2023-11-05,YouTube Revenue,Income,1.2
2,2023-11-06,Anime Streaming Subscriptions,Expense,50.0
3,2023-11-07,Book Purchases,Expense,80.0
4,2023-11-08,Rent,Expense,1.2


In [13]:
# Get unique transactions in the Name / Description column
unique_transactions = df["Name / Description"].unique()
len(unique_transactions)

104

In [14]:
unique_transactions[1:10]

array(['YouTube Revenue', 'Anime Streaming Subscriptions',
       'Book Purchases', 'Rent', 'Travel', 'Gym', 'Transportation',
       'Miscellaneous', 'Gifts/Charity'], dtype=object)

In [15]:
# Get index list
#https://stackoverflow.com/questions/47518609/for-loop-range-and-interval-how-to-include-last-step
def hop(start, stop, step):
    for i in range(start, stop, step):
        yield i
    yield stop

index_list = list(hop(0, len(unique_transactions), 30))
index_list

[0, 30, 60, 90, 104]

In [16]:
def categorize_transactions(transaction_names, llm):
    response = llm.invoke("Can you add an appropriate category to the following expenses. For example: YouTube Revenue - Business, Anime movie - Anime, etc.. Categories should be less than 4 words. " + transaction_names)
    response = response.split('\n')

    print(response)

    # Put in dataframe
    categories_df = pd.DataFrame({'Transaction vs category': response})
    categories_df[['Transaction', 'Category']] = categories_df['Transaction vs category'].str.split(' - ', expand=True)
    
    return categories_df

In [17]:
# Test out the function
categorize_transactions('YouTube Revenue, Sponsorship Deals, Anime Streaming Subscriptions, Monthly Travel',
                        llm)

['Of course! Here are the expenses you provided, along with an appropriate category for each one:', '', '1. YouTube Revenue - Business', '2. Anime movie - Anime', '3. Gaming console - Tech', '4. Monthly subscription boxes - Hobbies', '5. Sponsorship Deals - Business', '6. Anime streaming subscriptions - Anime', '7. Travel (monthly) - Travel', '', 'I hope this helps! Let me know if you need any further assistance.']


Unnamed: 0,Transaction vs category,Transaction,Category
0,"Of course! Here are the expenses you provided,...","Of course! Here are the expenses you provided,...",
1,,,
2,1. YouTube Revenue - Business,1. YouTube Revenue,Business
3,2. Anime movie - Anime,2. Anime movie,Anime
4,3. Gaming console - Tech,3. Gaming console,Tech
5,4. Monthly subscription boxes - Hobbies,4. Monthly subscription boxes,Hobbies
6,5. Sponsorship Deals - Business,5. Sponsorship Deals,Business
7,6. Anime streaming subscriptions - Anime,6. Anime streaming subscriptions,Anime
8,7. Travel (monthly) - Travel,7. Travel (monthly),Travel
9,,,


In [20]:
# Intialise the categories_df_all dataframe
categories_df_all = pd.DataFrame()

# Loop through the index_list
for i in range(0, len(index_list)-1):
    transaction_names = unique_transactions[index_list[i]:index_list[i+1]]
    transaction_names = ','.join(transaction_names)

    categories_df = categorize_transactions(transaction_names, llm)
    categories_df_all = pd.concat([categories_df_all, categories_df], ignore_index=True)

['Sure! Here are the appropriate categories for each expense:', '', '1. Salary - Income', '2. YouTube Revenue - Business', '3. Anime Streaming Subscriptions - Entertainment', '4. Book Purchases - Entertainment', '5. Rent - Housing', '6. Travel - Transportation', '7. Gym - Health & Fitness', '8. Transportation - Transportation', '9. Miscellaneous - Miscellaneous', '10. Gifts/Charity - Charitable Contributions', '11. Home Office Supplies - Office', '12. Sponsorship Deals - Business', '13. Freelance Tech Projects - Business', '14. Tech Gadgets/Upgrades - Technology', '15. Food Delivery - Food & Beverage', '16. Rent/Mortgage - Housing', '17. Utilities - Utilities', '18. Anime movie - Entertainment', '19. Health & Fitness - Health & Fitness', '20. Software Subscriptions (e.g., for tech tools) - Technology', '21. YouTube Equipment (camera, microphone) - Technology', '22. Anime Merchandise - Shopping', '23. Book Club Membership - Entertainment', '24. Data Plan/Internet - Technology', '25. Con

In [21]:
categories_df_all

Unnamed: 0,Transaction vs category,Transaction,Category
0,Sure! Here are the appropriate categories for ...,Sure! Here are the appropriate categories for ...,
1,,,
2,1. Salary - Income,1. Salary,Income
3,2. YouTube Revenue - Business,2. YouTube Revenue,Business
4,3. Anime Streaming Subscriptions - Entertainment,3. Anime Streaming Subscriptions,Entertainment
...,...,...,...
107,"10. Event Tickets (Concerts, Sports) - Enterta...","10. Event Tickets (Concerts, Sports)",Entertainment
108,11. Seasonal Decorations - Home,11. Seasonal Decorations,Home
109,12. Custom Anime Art - Personal,12. Custom Anime Art,Personal
110,13. Pet Insurance - Pets,13. Pet Insurance,Pets


In [22]:
# Get unique categories in categories_df_all
unique_categories = categories_df_all["Category"].unique()
unique_categories

array([None, 'Income', 'Business', 'Entertainment', 'Housing',
       'Transportation', 'Health & Fitness', 'Miscellaneous',
       'Charitable Contributions', 'Office', 'Technology',
       'Food & Beverage', 'Utilities', 'Shopping', 'Finance',
       'Communication', 'Leisure', 'Fashion', 'Grooming',
       'Financial Planning', 'Online Income', 'Travel', 'Wellness',
       'Protection', 'Learning', 'Career Development', 'Online Marketing',
       'Tools', 'Pets', 'Household', 'Education', 'Online Business',
       'Hobbies', 'Self-Improvement', 'Health and Wellness', 'Home',
       'Adventure', 'Creativity', 'Outdoors', 'Personal', 'Health'],
      dtype=object)

In [23]:
# Drop NA values
categories_df_all = categories_df_all.dropna()

In [24]:
# Remove the numbering eg "1. " from Transaction column
categories_df_all['Transaction'] = categories_df_all['Transaction'].str.replace(r'\d+\.\s+', '')
categories_df_all

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  categories_df_all['Transaction'] = categories_df_all['Transaction'].str.replace(r'\d+\.\s+', '')


Unnamed: 0,Transaction vs category,Transaction,Category
2,1. Salary - Income,1. Salary,Income
3,2. YouTube Revenue - Business,2. YouTube Revenue,Business
4,3. Anime Streaming Subscriptions - Entertainment,3. Anime Streaming Subscriptions,Entertainment
5,4. Book Purchases - Entertainment,4. Book Purchases,Entertainment
6,5. Rent - Housing,5. Rent,Housing
...,...,...,...
107,"10. Event Tickets (Concerts, Sports) - Enterta...","10. Event Tickets (Concerts, Sports)",Entertainment
108,11. Seasonal Decorations - Home,11. Seasonal Decorations,Home
109,12. Custom Anime Art - Personal,12. Custom Anime Art,Personal
110,13. Pet Insurance - Pets,13. Pet Insurance,Pets


In [25]:
# Merge the categories_df_all with the transactions_2023_2024.csv dataframe (df)
df = pd.read_csv("transactions_2023_2024.csv")
# Reset the index of both DataFrames to ensure alignment by index
df.reset_index(drop=True, inplace=True)
categories_df_all.reset_index(drop=True, inplace=True)

# Concatenate DataFrames horizontally (axis=1)
df_combined = pd.concat([df, categories_df_all], axis=1)

# Save the combined DataFrame to a new CSV file if needed
df_combined.to_csv("combined_transactions_categories.csv", index=False)

# Display the first few rows of the combined DataFrame
print(df_combined.head())

         Date             Name / Description Expense/Income  Amount (EUR)  \
0  2023-11-04                         Salary         Income           4.5   
1  2023-11-05                YouTube Revenue         Income           1.2   
2  2023-11-06  Anime Streaming Subscriptions        Expense          50.0   
3  2023-11-07                 Book Purchases        Expense          80.0   
4  2023-11-08                           Rent        Expense           1.2   

                            Transaction vs category  \
0                                1. Salary - Income   
1                     2. YouTube Revenue - Business   
2  3. Anime Streaming Subscriptions - Entertainment   
3                 4. Book Purchases - Entertainment   
4                                 5. Rent - Housing   

                        Transaction       Category  
0                         1. Salary         Income  
1                2. YouTube Revenue       Business  
2  3. Anime Streaming Subscriptions  Entertainm