In [56]:
# ! pip install spacy
# ! pip install pandas

Prepare spaCy-format training data for two separate models

In [57]:
import random
import pandas as pd

# Load the dataset (CSV file must have columns like: description, type, category)
df = pd.read_csv("expenses_income_dataset.csv")


In [58]:
# Define possible labels for the "type" classification (binary: Expense or Income)
type_labels = ["Expense", "Income"]
# Define possible categories for "Expense" type
expense_cats = ["Food", "Transport", "Rent", "Entertainment", "Shopping", "Bills"]
# Define possible categories for "Income" type
income_cats = ["Salary", "Freelance", "Sale", "Investment", "Refund"]

# Combine expense and income categories into one list
all_cats = expense_cats + income_cats


In [59]:
# Prepare training lists
train_data_type = []   # For classifying whether text is "Expense" or "Income"
train_data_cat = []    # For classifying into detailed categories

In [60]:
for _, row in df.iterrows():
    txt = row["description"]
    t_cats = {lab: 1 if row["type"] == lab else 0 for lab in type_labels}
    train_data_type.append((txt, {"cats": t_cats}))
    c_cats = {cat: 1 if row["category"] == cat else 0 for cat in all_cats}
    train_data_cat.append((txt, {"cats": c_cats}))


In [61]:
combined = list(zip(train_data_type, train_data_cat))
random.shuffle(combined)
train_data_type, train_data_cat = zip(*combined)
train_data_type = list(train_data_type)
train_data_cat = list(train_data_cat)

In [62]:
print("Example (type):", train_data_type[:2])
print("Example (category):", train_data_cat[:2])


Example (type): [('Rs. 3000 for a gaming subscription', {'cats': {'Expense': 1, 'Income': 0}}), ('Bus ticket for Rs. 160', {'cats': {'Expense': 1, 'Income': 0}})]
Example (category): [('Rs. 3000 for a gaming subscription', {'cats': {'Food': 0, 'Transport': 0, 'Rent': 0, 'Entertainment': 1, 'Shopping': 0, 'Bills': 0, 'Salary': 0, 'Freelance': 0, 'Sale': 0, 'Investment': 0, 'Refund': 0}}), ('Bus ticket for Rs. 160', {'cats': {'Food': 0, 'Transport': 1, 'Rent': 0, 'Entertainment': 0, 'Shopping': 0, 'Bills': 0, 'Salary': 0, 'Freelance': 0, 'Sale': 0, 'Investment': 0, 'Refund': 0}})]


Train two spaCy text categorizer models (type and category)

In [63]:
#Imports required for spaCy textcat training
import spacy
from spacy.util import minibatch
from spacy.training import Example

In [64]:
#Training function signature & defaults
def train_textcat_spacy(train_data, labels, model_name, n_iter=20):
    nlp = spacy.blank("en")

    #Using factory defaults for spaCy v3; avoid deprecated top-level config.
    textcat = nlp.add_pipe("textcat") #Revoved config={"exclusive_clasess":True, "architecture": "simple_cnn"}

    for lb in labels:
        textcat.add_label(lb)

    #Initialize optimizer 
    optimizer = nlp.begin_training()

    for epoch in range(n_iter):
        random.shuffle(train_data)   #Avoid order bias each epoch
        losses = {}
        batches = minibatch(train_data, size=4)

        #Small, fixed-size batches for stable updates
        for batch in batches:
            examples = []
            for text, ann in batch:
                doc = nlp.make_doc(text)
                examples.append(Example.from_dict(doc, ann))
            #Single optimizer step on this batch
            nlp.update(examples, sgd=optimizer, losses=losses)

        #Log at first epoch and then every 5 epochs
        if (epoch + 1) % 5 == 0 or epoch == 0:
            print(f"{model_name} - Epoch {epoch+1}/{n_iter} - Losses: {losses}")

    #Save to disk & return trained pipeline 
    nlp.to_disk("../../../models/" + model_name)
    return nlp


In [65]:
#Train type model
#Requires: train_data_type (list), type_labels (list)
nlp_type = train_textcat_spacy(
    train_data_type,
    type_labels,
    "expense_income_type",
    n_iter=12
)

expense_income_type - Epoch 1/12 - Losses: {'textcat': 3.1950512100676605}
expense_income_type - Epoch 5/12 - Losses: {'textcat': 1.8456034747704742e-07}
expense_income_type - Epoch 10/12 - Losses: {'textcat': 2.2904514066640624e-08}


In [66]:
#Train category model (all categories)
#Requires: train_data_cat (list), all_cats (list)
nlp_cat = train_textcat_spacy(
    train_data_cat,
    all_cats,
    "expense_income_category",
    n_iter=20
)

expense_income_category - Epoch 1/20 - Losses: {'textcat': 3.465250361710787}
expense_income_category - Epoch 5/20 - Losses: {'textcat': 0.00671335164552378}
expense_income_category - Epoch 10/20 - Losses: {'textcat': 3.9909270621585335e-07}
expense_income_category - Epoch 15/20 - Losses: {'textcat': 1.0014662169682831e-07}
expense_income_category - Epoch 20/20 - Losses: {'textcat': 3.870965425284423e-08}


In [67]:
# for testing

import re
# load models (if running later)
nlp_type = spacy.load("../../expense_income_type")
nlp_cat = spacy.load("../../expense_income_category")

money_re = re.compile(r"(?i)\b(?:rs\.?|inr|₹|\$|usd)\s?[0-9][0-9,]*(?:\.[0-9]+)?\b")

def extract_money(text):
    m = money_re.search(text)
    if m:
        return m.group(0)
    # optional: use spaCy NER if you load en_core_web_sm
    try:
        ner = spacy.load("en_core_web_sm")
        doc = ner(text)
        for ent in doc.ents:
            if ent.label_.upper() == "MONEY":
                return ent.text
    except Exception:
        pass
    return None

def predict_type_and_category(text):
    dt = nlp_type(text)
    predicted_type = max(dt.cats, key=dt.cats.get)
    dc = nlp_cat(text)
    predicted_cat = max(dc.cats, key=dc.cats.get)
    amount = extract_money(text)
    # return {"type": predicted_type, "category": predicted_cat, "amount": amount, "scores": {"type": dt.cats, "category": dc.cats}}
    return {"type": predicted_type, "category": predicted_cat, "amount": amount}

# Tests
print(predict_type_and_category("Rs.1000 Uber ride to office"))   # expect Income/Expense -> Transport
print(predict_type_and_category("Salary credited for August"))    # expect Income -> Salary
print(predict_type_and_category("Bought groceries at Walmart for Rs 2,500"))  # expect Expense -> Food
print(predict_type_and_category("Refund from Amazon $15"))        # expect Income -> Refund

{'type': 'Expense', 'category': 'Transport', 'amount': 'Rs.1000'}
{'type': 'Expense', 'category': 'Food', 'amount': None}
{'type': 'Expense', 'category': 'Food', 'amount': 'Rs 2,500'}
{'type': 'Income', 'category': 'Refund', 'amount': '15'}


In [68]:
print(predict_type_and_category("Bought groceries for Rs 2,500")) 

{'type': 'Expense', 'category': 'Food', 'amount': 'Rs 2,500'}


In [69]:
print(predict_type_and_category("rs 5000 on uber ride")) 

{'type': 'Expense', 'category': 'Transport', 'amount': 'rs 5000'}
