## Importing Libraries and Generating the Synthetic Transaction Dataset

In [None]:
import pandas as pd
import random

# Seed list of (description, category) pairs used to synthesise a labelled
# transaction dataset. Add more entries here to cover more transaction types.
transaction_data = [
    {"description": "Supermarket", "category": "Groceries"},
    {"description": "Fast Food", "category": "Dining Out"},
    {"description": "Water Bill", "category": "Utilities"},
    {"description": "Cafe", "category": "Dining Out"},
    {"description": "Petrol Station", "category": "Transportation"},
    {"description": "Bookstore", "category": "Shopping"},
    {"description": "Pharmacy", "category": "Health & Wellness"},
    {"description": "Movie Tickets", "category": "Entertainment"},
    {"description": "Flight Tickets", "category": "Travel"},
    {"description": "Textbook Purchase", "category": "Education"},
    {"description": "Bank Fee", "category": "Financial Services"},
    {"description": "Gardening Supplies", "category": "Home & Garden"},
    {"description": "Charity Donation", "category": "Donations & Charity"},
    {"description": "Manicure", "category": "Personal Care"},
    {"description": "Haircut", "category": "Personal Care"},
    {"description": "Drugs", "category": "Health & Wellness"},
    {"description": "Golf Equipment", "category": "Hobbies & Leisure"},
    {"description": "Car Repair", "category": "Automotive"},
    {"description": "Home Insurance", "category": "Insurance"},
    {"description": "Property Tax", "category": "Taxes"},
    {"description": "Diapers", "category": "Baby & Childcare"},
    {"description": "Birthday Gift", "category": "Gifts & Special Occasions"},
    {"description": "Jewelry Purchase", "category": "Jewelry & Accessories"},
    {"description": "Furniture", "category": "Furniture"},
    # Uses the plural "Subscriptions" so it shares a label with the
    # "Magazine Subscription" entry; a singular spelling here would create a
    # separate, near-duplicate class for the classifier.
    {"description": "Internet Bill", "category": "Subscriptions"},
    {"description": "Plumbing Service", "category": "Home Services"},
    {"description": "Magazine Subscription", "category": "Subscriptions"},
    {"description": "Beach Vacation", "category": "Vacation & Travel"},
    {"description": "Spa Treatment", "category": "Beauty & Grooming"},
    {"description": "Gym Membership", "category": "Sports & Fitness"},
    {"description": "Kitchen Renovation", "category": "DIY & Renovations"},
    {"description": "Office Supplies", "category": "Office & Work Supplies"},
]

# Number of rows the generated dataset must contain.
TARGET_ROWS = 5000

# Cycle through the seed list until exactly TARGET_ROWS rows exist.
# Multiplying the list by TARGET_ROWS // len(transaction_data) would silently
# drop the remainder whenever the seed count does not divide TARGET_ROWS.
extended_transaction_data = [
    transaction_data[i % len(transaction_data)] for i in range(TARGET_ROWS)
]

# Build the DataFrame (columns: "description", "category").
df = pd.DataFrame(extended_transaction_data)

# Persist the generated dataset next to the notebook, without the index column.
df.to_csv("transaction_data_extended.csv", index=False)


In [None]:
# Capitalise the two column headers. An explicit mapping renames only exact
# matches; substring replacement on the column index would also rewrite any
# other column whose name merely contains "description" or "category".
df = df.rename(columns={"description": "Description", "category": "Category"})

In [None]:
# Preview the first rows to confirm the column names and contents.
df.head()

Unnamed: 0,Description,Category
0,Supermarket,Groceries
1,Fast Food,Dining Out
2,Water Bill,Utilities
3,Cafe,Dining Out
4,Petrol Station,Transportation


In [None]:
# Note: `size` is the total number of cells (rows x columns), not the row count.
df.size

9984

In [None]:
# Keep an independent snapshot of the labelled data (deep copy is the pandas
# default; stated explicitly here).
Anoda = df.copy(deep=True)

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, confusion_matrix


In [None]:


# Hold out 20% of the rows for evaluation; the fixed seed makes the split
# reproducible.
# NOTE(review): the dataset is a small set of seed descriptions repeated many
# times, so the test descriptions almost certainly also occur in the training
# set. Scores from this split do not measure generalisation to unseen text.
X_train, X_test, y_train, y_test = train_test_split(
    df['Description'],
    df['Category'],
    test_size=0.2,
    random_state=42,
)

In [None]:
# Learn the vocabulary and IDF weights from the training descriptions only,
# then project both splits into that same TF-IDF feature space.
vectorizer = TfidfVectorizer().fit(X_train)
X_train_tfidf = vectorizer.transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)


In [None]:
# Fit a Multinomial Naive Bayes classifier on the TF-IDF training features.
clf = MultinomialNB().fit(
    X=X_train_tfidf,
    y=y_train,
)


In [None]:
# Predict a category for every held-out description.
y_pred = clf.predict(X=X_test_tfidf)

In [None]:
# Per-class precision / recall / F1 on the held-out split, printed as text.
report_text = classification_report(y_true=y_test, y_pred=y_pred)
print(report_text)

                           precision    recall  f1-score   support

               Automotive       1.00      1.00      1.00        35
         Baby & Childcare       1.00      1.00      1.00        26
        Beauty & Grooming       1.00      1.00      1.00        28
        DIY & Renovations       1.00      1.00      1.00        28
               Dining Out       1.00      1.00      1.00        63
      Donations & Charity       1.00      1.00      1.00        35
                Education       1.00      1.00      1.00        44
            Entertainment       1.00      1.00      1.00        28
       Financial Services       1.00      1.00      1.00        31
                Furniture       1.00      1.00      1.00        35
Gifts & Special Occasions       1.00      1.00      1.00        29
                Groceries       1.00      1.00      1.00        28
        Health & Wellness       1.00      1.00      1.00        75
        Hobbies & Leisure       1.00      1.00      1.00     

In [None]:
import joblib

# The classifier only accepts vectors produced by the fitted TF-IDF vectorizer,
# so persist the vectorizer as well; the classifier file alone cannot be used
# to categorise raw text in a fresh session.
joblib.dump(vectorizer, 'Expense_Categorization_vectorizer.pkl')

# Saved last so the cell still displays the model file name as its output.
joblib.dump(clf, 'Expense_Categorization_model.pkl')

['Expense_Categorization_model.pkl']

In [None]:
# Try the trained model on a free-text description.
new_expense = "Hair and Nails"
new_expense_tfidf = vectorizer.transform([new_expense])

# transform() ignores words that are not in the fitted vocabulary. If no word
# is recognised the vector has no non-zero entries and the classifier has no
# textual evidence to work with, so flag the result rather than presenting it
# as a confident prediction.
if new_expense_tfidf.nnz == 0:
    print(f"Warning: no known words in '{new_expense}'; the prediction below is only a fallback guess.")

predicted_category = clf.predict(new_expense_tfidf)
print(f"Predicted category for '{new_expense}': {predicted_category[0]}")

Predicted category for 'Hair and Nails': Personal Care
