In [1]:
# ! this is for the initial dataset creation and testing. later update csv and train the model from that
import pandas as pd

data = {
    "description": [
        "Uber ride to office",
        "Pizza and coke",
        "Monthly rent",
        "Netflix subscription",
        "Grocery shopping at Walmart",
        "Electricity bill",
        "Bus ticket",
        "Amazon shopping",
        "Movie ticket",
        "Water bill",
        "Bought KFC chicken",
        "Train to work",
        "Paid internet bill",
        "New shoes from Nike",
        "Dinner at McDonald's",
        # income examples
        "Salary credited for August",
        "Freelance payment from client",
        "Sold old bike - received ₹5000",
        "Dividend received INR 2000",
        "Refund from Amazon $15"
    ],
    "category": [
        "Transport","Food","Rent","Entertainment","Food","Bills","Transport","Shopping","Entertainment","Bills",
        "Food","Transport","Bills","Shopping","Food",
        # income categories
        "Salary","Freelance","Sale","Investment","Refund"
    ],
    "type": [
        "Expense","Expense","Expense","Expense","Expense","Expense","Expense","Expense","Expense","Expense",
        "Expense","Expense","Expense","Expense","Expense",
        "Income","Income","Income","Income","Income"
    ]
}

df = pd.DataFrame(data)
df.to_csv("expenses_income_dataset.csv", index=False)
df.head()

Unnamed: 0,description,category,type
0,Uber ride to office,Transport,Expense
1,Pizza and coke,Food,Expense
2,Monthly rent,Rent,Expense
3,Netflix subscription,Entertainment,Expense
4,Grocery shopping at Walmart,Food,Expense


In [2]:
# --- Step 2: Prepare spaCy-format training data for two separate models ---
import random
df = pd.read_csv("expenses_income_dataset.csv")

type_labels = ["Expense", "Income"]
expense_cats = ["Food", "Transport", "Rent", "Entertainment", "Shopping", "Bills"]
income_cats = ["Salary", "Freelance", "Sale", "Investment", "Refund"]
all_cats = expense_cats + income_cats

# prepare training lists: (text, {"cats": {...}})
train_data_type = []
train_data_cat = []
for _, row in df.iterrows():
    txt = row["description"]
    # type annotation (single-label)
    t_cats = {lab: 1 if row["type"] == lab else 0 for lab in type_labels}
    train_data_type.append((txt, {"cats": t_cats}))
    # category annotation (single-label across all categories)
    c_cats = {cat: 1 if row["category"] == cat else 0 for cat in all_cats}
    train_data_cat.append((txt, {"cats": c_cats}))

# quick shuffle
combined = list(zip(train_data_type, train_data_cat))
random.shuffle(combined)
train_data_type, train_data_cat = zip(*combined)
train_data_type = list(train_data_type)
train_data_cat = list(train_data_cat)

print("Example (type):", train_data_type[:2])
print("Example (category):", train_data_cat[:2])


Example (type): [('Salary credited for August', {'cats': {'Expense': 0, 'Income': 1}}), ('Paid internet bill', {'cats': {'Expense': 1, 'Income': 0}})]
Example (category): [('Salary credited for August', {'cats': {'Food': 0, 'Transport': 0, 'Rent': 0, 'Entertainment': 0, 'Shopping': 0, 'Bills': 0, 'Salary': 1, 'Freelance': 0, 'Sale': 0, 'Investment': 0, 'Refund': 0}}), ('Paid internet bill', {'cats': {'Food': 0, 'Transport': 0, 'Rent': 0, 'Entertainment': 0, 'Shopping': 0, 'Bills': 1, 'Salary': 0, 'Freelance': 0, 'Sale': 0, 'Investment': 0, 'Refund': 0}})]


In [3]:
# --- Step 3: Train two spaCy text categorizer models (type and category) ---
import spacy
from spacy.util import minibatch
from spacy.training import Example

def train_textcat_spacy(train_data, labels, model_name, n_iter=15):
    nlp = spacy.blank("en")
    # Use factory defaults for spaCy v3; don't pass unsupported top-level keys
    textcat = nlp.add_pipe("textcat")  # removed config={"exclusive_classes": True, "architecture": "simple_cnn"}
    for lb in labels:
        textcat.add_label(lb)

    optimizer = nlp.begin_training()
    for epoch in range(n_iter):
        random.shuffle(train_data)
        losses = {}
        batches = minibatch(train_data, size=4)
        for batch in batches:
            examples = []
            for text, ann in batch:
                doc = nlp.make_doc(text)
                examples.append(Example.from_dict(doc, ann))
            nlp.update(examples, sgd=optimizer, losses=losses)
        if (epoch + 1) % 5 == 0 or epoch == 0:
            print(f"{model_name} - Epoch {epoch+1}/{n_iter} - Losses: {losses}")
    nlp.to_disk("../../../models/"+model_name)
    return nlp

# Train type model
nlp_type = train_textcat_spacy(train_data_type, type_labels, "expense_income_type", n_iter=12)

# Train category model (all categories)
nlp_cat = train_textcat_spacy(train_data_cat, all_cats, "expense_income_category", n_iter=20)


expense_income_type - Epoch 1/12 - Losses: {'textcat': 1.2141835391521454}
expense_income_type - Epoch 5/12 - Losses: {'textcat': 0.0017439418315916555}
expense_income_type - Epoch 10/12 - Losses: {'textcat': 1.5532534680673393e-07}
expense_income_category - Epoch 1/20 - Losses: {'textcat': 0.41283443570137024}
expense_income_category - Epoch 5/20 - Losses: {'textcat': 0.20674003288149834}
expense_income_category - Epoch 10/20 - Losses: {'textcat': 0.007174970407504588}
expense_income_category - Epoch 15/20 - Losses: {'textcat': 2.4116187461231675e-05}
expense_income_category - Epoch 20/20 - Losses: {'textcat': 1.2712395118796849e-06}


In [5]:
# --- Step 4: Load models and test combined prediction with money extraction ---
import re
# load models (if running later)
nlp_type = spacy.load("../../expense_income_type")
nlp_cat = spacy.load("../../expense_income_category")

money_re = re.compile(r"(?i)\b(?:rs\.?|inr|₹|\$|usd)\s?[0-9][0-9,]*(?:\.[0-9]+)?\b")

def extract_money(text):
    m = money_re.search(text)
    if m:
        return m.group(0)
    # optional: use spaCy NER if you load en_core_web_sm
    try:
        ner = spacy.load("en_core_web_sm")
        doc = ner(text)
        for ent in doc.ents:
            if ent.label_.upper() == "MONEY":
                return ent.text
    except Exception:
        pass
    return None

def predict_type_and_category(text):
    dt = nlp_type(text)
    predicted_type = max(dt.cats, key=dt.cats.get)
    dc = nlp_cat(text)
    predicted_cat = max(dc.cats, key=dc.cats.get)
    amount = extract_money(text)
    # return {"type": predicted_type, "category": predicted_cat, "amount": amount, "scores": {"type": dt.cats, "category": dc.cats}}
    return {"type": predicted_type, "category": predicted_cat, "amount": amount}

# Tests
print(predict_type_and_category("Rs.1000 Uber ride to office"))   # expect Income/Expense -> Transport
print(predict_type_and_category("Salary credited for August"))    # expect Income -> Salary
print(predict_type_and_category("Bought groceries at Walmart for Rs 2,500"))  # expect Expense -> Food
print(predict_type_and_category("Refund from Amazon $15"))        # expect Income -> Refund

{'type': 'Expense', 'category': 'Transport', 'amount': 'Rs.1000'}
{'type': 'Income', 'category': 'Salary', 'amount': None}
{'type': 'Expense', 'category': 'Food', 'amount': 'Rs 2,500'}
{'type': 'Income', 'category': 'Refund', 'amount': '15'}
