In [None]:
! pip install spacy
! pip install pandas

Prepare spaCy-format training data for two separate models

In [None]:
import random
import pandas as pd

# Read the labelled transactions; later cells read the "description",
# "type" and "category" columns of every row.
df = pd.read_csv(filepath_or_buffer="expenses_income_dataset.csv")


In [None]:
# Labels for the coarse "type" classifier (binary: Expense vs. Income)
type_labels = ["Expense", "Income"]

# Fine-grained categories that belong to the "Expense" type
expense_cats = [
    "Food",
    "Transport",
    "Rent",
    "Entertainment",
    "Shopping",
    "Bills",
]

# Fine-grained categories that belong to the "Income" type
income_cats = [
    "Salary",
    "Freelance",
    "Sale",
    "Investment",
    "Refund",
]

# Full label set for the category classifier: expense categories first,
# followed by the income categories
all_cats = [*expense_cats, *income_cats]


In [None]:
# Start with two empty, independent example lists:
#   train_data_type -> examples for the Expense/Income classifier
#   train_data_cat  -> examples for the detailed-category classifier
train_data_type, train_data_cat = [], []

In [None]:
# Convert every dataset row into a (text, {"cats": ...}) pair for each
# classifier. Each "cats" dict maps every known label to 1 when it matches
# the row's value and to 0 otherwise.
for _, row in df.iterrows():
    txt = row["description"]

    # Annotation for the Expense/Income model
    t_cats = dict((lab, int(row["type"] == lab)) for lab in type_labels)
    train_data_type.append((txt, {"cats": t_cats}))

    # Annotation for the detailed-category model
    c_cats = dict((cat, int(row["category"] == cat)) for cat in all_cats)
    train_data_cat.append((txt, {"cats": c_cats}))


In [None]:
# Shuffle both training sets with one shared permutation, so that entry i of
# each list still comes from the same dataset row afterwards.
combined = list(zip(train_data_type, train_data_cat))
random.shuffle(combined)

# Rebuild the two lists with comprehensions instead of unpacking
# zip(*combined): that unpacking raises ValueError when there are no rows,
# whereas this simply yields two empty lists.
train_data_type = [type_example for type_example, _ in combined]
train_data_cat = [cat_example for _, cat_example in combined]

In [None]:
# Sanity check: show the first two shuffled examples of each training set.
for _caption, _examples in (
    ("Example (type):", train_data_type),
    ("Example (category):", train_data_cat),
):
    print(_caption, _examples[:2])


Train two spaCy text categorizer models (type and category)

In [None]:
#Imports required for spaCy textcat training
import spacy
from spacy.util import minibatch
from spacy.training import Example

In [None]:
#Training function signature & defaults
def train_textcat_spacy(train_data, labels, model_name, n_iter=15,
                        batch_size=4, output_dir="../../../../models/"):
    """Train a blank English spaCy pipeline with one ``textcat`` pipe and save it.

    Args:
        train_data: Sequence of ``(text, annotations)`` pairs, where
            ``annotations`` is a dict passed to ``Example.from_dict``
            (in this notebook: ``{"cats": {label: 0 or 1, ...}}``).
            The sequence is copied before shuffling, so the caller's
            object is never reordered and tuples are accepted too.
        labels: Iterable of label names to register on the text categorizer.
        model_name: Name used in the progress log and as the directory name
            the trained pipeline is saved under.
        n_iter: Number of passes (epochs) over the training data.
        batch_size: Number of examples per optimizer step.
        output_dir: Directory under which ``model_name`` is written.

    Returns:
        The trained ``nlp`` pipeline object.
    """
    import os  # local import: only needed to build the save path

    nlp = spacy.blank("en")

    # Use the spaCy v3 factory defaults for the text categorizer. The earlier
    # explicit config (exclusive_classes / "simple_cnn" architecture) was
    # removed on purpose.
    textcat = nlp.add_pipe("textcat")

    for label in labels:
        textcat.add_label(label)

    # spaCy v3 initializes the pipeline with initialize(); begin_training()
    # is the deprecated older name for the same step.
    optimizer = nlp.initialize()

    # Shuffle a private copy. Shuffling `train_data` itself would reorder the
    # caller's list on every epoch and break any index alignment it has with
    # other lists built from the same rows.
    shuffled_data = list(train_data)

    for epoch in range(n_iter):
        random.shuffle(shuffled_data)   # avoid order bias each epoch
        losses = {}

        # Small, fixed-size batches; one optimizer step per batch.
        for batch in minibatch(shuffled_data, size=batch_size):
            examples = [
                Example.from_dict(nlp.make_doc(text), ann)
                for text, ann in batch
            ]
            nlp.update(examples, sgd=optimizer, losses=losses)

        # Log at the first epoch and then every 5 epochs.
        if epoch == 0 or (epoch + 1) % 5 == 0:
            print(f"{model_name} - Epoch {epoch+1}/{n_iter} - Losses: {losses}")

    # Save to disk and return the trained pipeline.
    nlp.to_disk(os.path.join(output_dir, model_name))
    return nlp
