In [4]:
import pandas as pd
import numpy as np
import joblib
import random

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report


In [5]:
# Path to your final cleaned dataset
DATA_PATH = "dataset.csv"

# Input and label columns
TEXT_COL = "text"
LABEL_COL = "broad_category"

df = pd.read_csv(DATA_PATH)

print("Columns:", df.columns)
print(df.head())

# Clean text
df[TEXT_COL] = df[TEXT_COL].astype(str).str.lower().str.strip()
df[LABEL_COL] = df[LABEL_COL].astype(str).str.strip()


Columns: Index(['date', 'text', 'amount', 'broad_category'], dtype='object')
                        date        text  amount broad_category
0  2022-07-06 05:57:10 +0000  Restuarant    5.50           Food
1  2022-07-06 05:57:27 +0000      Market    2.00          Other
2  2022-07-06 05:58:12 +0000       Coffe   30.10           Food
3  2022-07-06 05:58:25 +0000      Market   17.33          Other
4  2022-07-06 05:59:00 +0000  Restuarant    5.50           Food


In [6]:
templates = {
    "transport": [
        "uber ride to office",
        "ola cab to airport",
        "rapido bike taxi ride",
        "delhi metro card recharge",
        "local train ticket",
        "uber auto ride",
        "fastag toll recharge"
    ],
    "food": [
        "zomato food order",
        "swiggy lunch",
        "dominos pizza order",
        "mcdonalds meal",
        "starbucks coffee",
        "ccd cafe coffee day"
    ],
    "shopping": [
        "amazon shopping order",
        "flipkart electronics purchase",
        "myntra clothing order",
        "ajio fashion shopping",
        "h&m apparel store",
        "zara clothing mall"
    ],
    "utilities": [
        "airtel postpaid bill",
        "jio fiber internet bill",
        "bses electricity bill payment",
        "water bill payment",
        "png gas bill"
    ],
    "fuel": [
        "hp petrol pump",
        "indian oil fuel station",
        "bpcl petrol filling"
    ],
    "entertainment": [
        "netflix subscription",
        "spotify premium payment",
        "bookmyshow movie ticket",
        "youtube premium subscription"
    ],
    "medical": [
        "apollo pharmacy",
        "medplus medical store",
        "digital health lab test billing"
    ],
    "rent": [
        "monthly house rent",
        "pg rent payment",
        "flat rent via upi"
    ],
    "other": [
        "atm cash withdrawal",
        "bank charge fee",
        "miscellaneous expense"
    ]
}

TARGET_SYNTH = 5000

synthetic_rows = []
for _ in range(TARGET_SYNTH):
    cat = random.choice(list(templates.keys()))
    text = random.choice(templates[cat])
    amt = round(random.uniform(50, 5000), 2)
    synthetic_rows.append([text, amt, cat])

df_synth = pd.DataFrame(synthetic_rows, columns=[TEXT_COL, "amount", LABEL_COL])
print("Synthetic dataset:", df_synth.shape)


Synthetic dataset: (5000, 3)


In [7]:
df_full = pd.concat([df, df_synth], ignore_index=True)
df_full = df_full.sample(frac=1, random_state=42).reset_index(drop=True)

print("Final training dataset size:", df_full.shape)


Final training dataset size: (9597, 4)


In [8]:
X_train, X_test, y_train, y_test = train_test_split(
    df_full[TEXT_COL],
    df_full[LABEL_COL],
    test_size=0.2,
    random_state=42,
    stratify=df_full[LABEL_COL]
)


In [9]:
vectorizer = TfidfVectorizer(
    ngram_range=(1, 2),
    min_df=2
)

X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

clf = LogisticRegression(max_iter=1000)
clf.fit(X_train_vec, y_train)

# Evaluation
y_pred = clf.predict(X_test_vec)
print(classification_report(y_test, y_pred))


               precision    recall  f1-score   support

         Food       1.00      1.00      1.00       357
        Other       1.00      1.00      1.00       405
     Shopping       1.00      1.00      1.00        10
       Sports       1.00      1.00      1.00         6
    Transport       1.00      1.00      1.00       106
    Utilities       1.00      1.00      1.00        35
entertainment       1.00      1.00      1.00       110
         food       1.00      1.00      1.00       113
         fuel       1.00      1.00      1.00       115
      medical       1.00      1.00      1.00       116
        other       1.00      1.00      1.00       115
         rent       1.00      1.00      1.00       106
     shopping       1.00      1.00      1.00       108
    transport       1.00      1.00      1.00       113
    utilities       1.00      1.00      1.00       105

     accuracy                           1.00      1920
    macro avg       1.00      1.00      1.00      1920
 weighte

In [10]:
joblib.dump(
    {
        "vectorizer": vectorizer,
        "model": clf,
        "text_col": TEXT_COL
    },
    "budget_category_model.joblib"
)

print("Model saved as 'budget_category_model.joblib'")


Model saved as 'budget_category_model.joblib'
