In [2]:
# NOTE: this cell creates the initial dataset for testing. Later, update the CSV and retrain the model from it.
import pandas as pd

# Seed examples kept as (description, category) pairs so every label sits
# right next to the text it belongs to.
seed_rows = [
    ("Uber ride to office", "Transport"),
    ("Pizza and coke", "Food"),
    ("Monthly rent", "Rent"),
    ("Netflix subscription", "Entertainment"),
    ("Grocery shopping at Walmart", "Food"),
    ("Electricity bill", "Bills"),
    ("Bus ticket", "Transport"),
    ("Amazon shopping", "Shopping"),
    ("Movie ticket", "Entertainment"),
    ("Water bill", "Bills"),
    ("Bought KFC chicken", "Food"),
    ("Train to work", "Transport"),
    ("Paid internet bill", "Bills"),
    ("New shoes from Nike", "Shopping"),
    ("Dinner at McDonald's", "Food"),
]

# Column-oriented view of the same rows, in the same order.
data = {
    "description": [text for text, _ in seed_rows],
    "category": [label for _, label in seed_rows],
}

# Write the dataset without the index column, then preview the first rows.
df = pd.DataFrame(data)
df.to_csv("expenses_dataset.csv", index=False)
df.head()

Unnamed: 0,description,category
0,Uber ride to office,Transport
1,Pizza and coke,Food
2,Monthly rent,Rent
3,Netflix subscription,Entertainment
4,Grocery shopping at Walmart,Food


In [3]:
# Load the CSV for training
import pandas as pd
# Load the labelled expenses from the CSV in the working directory.
df = pd.read_csv("expenses_dataset.csv")
categories = ["Food", "Transport", "Rent", "Entertainment", "Shopping", "Bills", "Other"]

# Fail fast on labels that are not in `categories` (including blank cells,
# which pandas reads as NaN). Without this check such a row would just gain
# an extra key in its `cats` dict while every known category stays 0, so the
# example would carry no usable label and nothing would flag it.
unknown_categories = sorted(set(df["category"].astype(str)) - set(categories))
if unknown_categories:
    raise ValueError(
        f"expenses_dataset.csv contains categories not in `categories`: {unknown_categories}"
    )

# One (text, annotations) pair per row. The annotations hold a one-hot
# `cats` dict: 1 for the row's category, 0 for every other known category.
train_data = [
    (description, {"cats": {cat: int(cat == label) for cat in categories}})
    for description, label in zip(df["description"], df["category"])
]
    
# print(train_data)

In [4]:
# train spacy text categorizer
import spacy
from spacy.util import minibatch 
from spacy.training import Example # this is for creating training examples


In [5]:
import random

SEED = 0
EPOCHS = 10
BATCH_SIZE = 4

# Seed the random number generators before any weights are created so that
# initialisation and the per-epoch shuffling repeat from run to run.
spacy.util.fix_random_seed(SEED)

nlp = spacy.blank("en")  # Create a new blank English pipeline

# The pipeline was created just above, so the text categorizer can be added
# directly; there is no existing "textcat" component to look up.
textcat = nlp.add_pipe("textcat")

# Register every label from the `categories` list.
for cat in categories:
    textcat.add_label(cat)

# Build the Example objects once; they are reused for every epoch instead of
# being re-tokenised and re-created inside the training loop.
train_examples = [
    Example.from_dict(nlp.make_doc(text), annotation)
    for text, annotation in train_data
]

# `nlp.initialize` replaces the deprecated `nlp.begin_training`. It is given
# the training examples so the model is initialised from real data, and it
# returns the optimizer used below.
optimizer = nlp.initialize(lambda: train_examples)

# Training loop
for epoch in range(EPOCHS):
    random.shuffle(train_examples)  # vary the batch composition each epoch
    losses = {}  # accumulates the loss reported by each component
    for batch in minibatch(train_examples, size=BATCH_SIZE):
        nlp.update(batch, sgd=optimizer, losses=losses)
    print(f"Epoch {epoch+1}/{EPOCHS}, Losses: {losses}")

Epoch 1/10, Losses: {'textcat': 0.4893414154648781}
Epoch 2/10, Losses: {'textcat': 0.4566032886505127}
Epoch 3/10, Losses: {'textcat': 0.3968615457415581}
Epoch 4/10, Losses: {'textcat': 0.3060327172279358}
Epoch 5/10, Losses: {'textcat': 0.19159800186753273}
Epoch 6/10, Losses: {'textcat': 0.09054321423172951}
Epoch 7/10, Losses: {'textcat': 0.033404706977307796}
Epoch 8/10, Losses: {'textcat': 0.010826966143213212}
Epoch 9/10, Losses: {'textcat': 0.003239445301005617}
Epoch 10/10, Losses: {'textcat': 0.0009283485342166387}


In [None]:
# Save the pipeline to disk. The path is relative, so it is resolved against
# the kernel's current working directory.
model_dir = "../../../models/expense_categorizer_v0"
nlp.to_disk(model_dir)

In [2]:
!pip list

Package                   Version
------------------------- --------------
absl-py                   2.3.1
altair                    5.5.0
annotated-types           0.7.0
anyio                     4.10.0
argon2-cffi               25.1.0
argon2-cffi-bindings      25.1.0
arrow                     1.3.0
asttokens                 3.0.0
astunparse                1.6.3
async-lru                 2.0.5
attrs                     25.3.0
babel                     2.17.0
beautifulsoup4            4.13.4
bleach                    6.2.0
blinker                   1.9.0
blis                      1.3.0
cachetools                6.1.0
catalogue                 2.0.10
certifi                   2025.8.3
cffi                      1.17.1
charset-normalizer        3.4.3
click                     8.2.1
cloudpathlib              0.21.1
colorama                  0.4.6
comm                      0.2.3
confection                0.1.5
contourpy                 1.3.3
cycler                    0.12.1
cymem           

In [1]:
from pathlib import Path
# Show the kernel's working directory; the relative paths used in this
# notebook are resolved against it.
notebook_dir = Path.cwd()
notebook_dir

WindowsPath('c:/Users/pkmpp/OneDrive/Documents/y3s1/project/streamlit-ai-chat-app-with-ollama/models/train/Expense Categorizer')