In [151]:
# Numerical and Data Manipulation
import numpy as np  
import pandas as pd  

# Data Visualization
import matplotlib.pyplot as plt  
import seaborn as sns  

# Machine Learning Libraries
from sklearn.model_selection import train_test_split  
from sklearn.preprocessing import StandardScaler, MinMaxScaler  
from sklearn.metrics import accuracy_score, mean_squared_error, classification_report  

In [152]:
data = pd.read_excel('expense_category_data.xlsx')

In [153]:
data

Unnamed: 0,category,expenses
0,Utilities,"Electricity, water, gas, internet, cable TV, p..."
1,Food & Groceries,"Fresh produce, meat, dairy products, bread, eg..."
2,Transportation,"Car payment, car insurance, gas/fuel, public t..."
3,Health & Wellness,"Health insurance premiums, prescription medica..."
4,Entertainment,"Streaming services, movie tickets, concert tic..."


In [154]:
data.set_index('category', inplace=True)

In [155]:
df = pd.DataFrame(columns=['expense', 'category'])

for category, row in data.iterrows():
    expenses = row['expenses'].split(',')  # Directly access the column
    for expense in expenses:
        df.loc[len(df)] = [expense.lower(), category]
    


In [156]:
df.sample(5)

Unnamed: 0,expense,category
128,parking permits,Transportation
141,dash cam,Transportation
7,trash collection,Utilities
151,vehicle towing,Transportation
83,nuts,Food & Groceries


In [157]:
df['category'].value_counts()

category
Food & Groceries     54
Entertainment        53
Health & Wellness    51
Utilities            50
Transportation       50
Name: count, dtype: int64

In [158]:
from sklearn.feature_extraction.text import CountVectorizer

count_vectorizer = CountVectorizer()
X = count_vectorizer.fit_transform(df['expense'])
y = df['category']

In [159]:
from sklearn.naive_bayes import MultinomialNB

model = MultinomialNB()
model.fit(X, y)
model.score(X, y)

0.9651162790697675

In [160]:
from sklearn.metrics import classification_report

y_pred = model.predict(X)
report = classification_report(y, y_pred)
print(report)

                   precision    recall  f1-score   support

    Entertainment       0.98      0.98      0.98        53
 Food & Groceries       0.98      0.98      0.98        54
Health & Wellness       0.98      0.96      0.97        51
   Transportation       0.98      0.92      0.95        50
        Utilities       0.91      0.98      0.94        50

         accuracy                           0.97       258
        macro avg       0.97      0.96      0.96       258
     weighted avg       0.97      0.97      0.97       258



In [161]:
from joblib import dump, load

dump(model, 'model.joblib')
dump(count_vectorizer, "count_vectorizer.joblib")

['count_vectorizer.joblib']

In [162]:
x = count_vectorizer.transform(['movie'])
category = model.predict(x)
print(category[0])

Entertainment
