In [44]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns
import os


import plotly.express as px

%matplotlib inline

In [45]:
df = pd.read_csv('myExpenses1.csv')
df.head()

Unnamed: 0,Date,Item,Amount,Category,Time,day
0,1/3/2023,chai,7,alone,7:00,Wednesday
1,1/3/2023,chai,20,friend,10:00,Wednesday
2,1/3/2023,juice,15,friend,13:00,Wednesday
3,1/3/2023,rikshow,12,alone,14:00,Wednesday
4,1/3/2023,coffee,12,alone,15:00,Wednesday


In [46]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 145 entries, 0 to 144
Data columns (total 6 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Date      145 non-null    object
 1   Item      145 non-null    object
 2   Amount    145 non-null    int64 
 3   Category  144 non-null    object
 4   Time      145 non-null    object
 5   day       145 non-null    object
dtypes: int64(1), object(5)
memory usage: 6.9+ KB


In [47]:
df.describe()

Unnamed: 0,Amount
count,145.0
mean,31.786207
std,56.389541
min,5.0
25%,12.0
50%,17.0
75%,30.0
max,500.0


In [48]:
df.isnull().sum()

Date        0
Item        0
Amount      0
Category    1
Time        0
day         0
dtype: int64

In [49]:
df['Category'].fillna('alone', inplace=True)

In [50]:
# Drop rows where 'Time' is null
df = df.dropna(subset=['Time'])


In [51]:
df['Date'] = pd.to_datetime(df['Date'], format='%d/%m/%Y', errors='coerce')


In [52]:
df['Hour'] = df['Date'].dt.hour
df['Amount'] = df['Amount']*10

In [53]:
def time_of_day(hour):
    if 5 <= hour < 12:
        return 'Morning'
    elif 12 <= hour < 17:
        return 'Afternoon'
    elif 17 <= hour < 21:
        return 'Evening'
    else:
        return 'Night'

# Apply the function to create the 'Time_of_Day' column
df['Time_of_Day'] = df['Hour'].apply(time_of_day)

In [54]:
print(df.columns)

Index(['Date', 'Item', 'Amount', 'Category', 'Time', 'day', 'Hour',
       'Time_of_Day'],
      dtype='object')


In [55]:
df = df.drop(columns=['Hour'])

In [56]:
df['Amount_Category'] = pd.cut(df['Amount'], bins=[0, 50, 100, float('inf')], labels=["Low", "Medium", "High"])


In [57]:
df.head()

Unnamed: 0,Date,Item,Amount,Category,Time,day,Time_of_Day,Amount_Category
0,2023-03-01,chai,70,alone,7:00,Wednesday,Night,Medium
1,2023-03-01,chai,200,friend,10:00,Wednesday,Night,High
2,2023-03-01,juice,150,friend,13:00,Wednesday,Night,High
3,2023-03-01,rikshow,120,alone,14:00,Wednesday,Night,High
4,2023-03-01,coffee,120,alone,15:00,Wednesday,Night,High


In [58]:
print(df.columns)

Index(['Date', 'Item', 'Amount', 'Category', 'Time', 'day', 'Time_of_Day',
       'Amount_Category'],
      dtype='object')


In [59]:
df = pd.get_dummies(df, columns=['Category'])


In [60]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
df['Time_of_Day'] = le.fit_transform(df['Time_of_Day'])


In [61]:
# Function to classify expenses
def classify_expense(row):
    if row['day'] in ['Monday', 'Wednesday', 'Friday'] or row['Time_of_Day'] == 'Night':
        return 'Less Essential'
    else:
        return 'Essential'

# Apply the classification logic
df['Class'] = df.apply(classify_expense, axis=1)



In [62]:
df.head()

Unnamed: 0,Date,Item,Amount,Time,day,Time_of_Day,Amount_Category,Category_alone,Category_friend,Class
0,2023-03-01,chai,70,7:00,Wednesday,0,Medium,1,0,Less Essential
1,2023-03-01,chai,200,10:00,Wednesday,0,High,0,1,Less Essential
2,2023-03-01,juice,150,13:00,Wednesday,0,High,0,1,Less Essential
3,2023-03-01,rikshow,120,14:00,Wednesday,0,High,1,0,Less Essential
4,2023-03-01,coffee,120,15:00,Wednesday,0,High,1,0,Less Essential


In [63]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.metrics import classification_report, accuracy_score

# Define features (X) and target (y)
X = df[['day', 'Time_of_Day', 'Amount','Category_alone','Category_friend']]  # Features
y = df['Class']  # Target

# Split into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Preprocess categorical columns using OneHotEncoder
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(handle_unknown='ignore'), ['day', 'Time_of_Day','Category_friend','Category_alone']),
        ('num', 'passthrough', ['Amount'])
    ]
)

X_train = preprocessor.fit_transform(X_train)
X_test = preprocessor.transform(X_test)

# Train a Random Forest Classifier
classifier = RandomForestClassifier(random_state=42)
classifier.fit(X_train, y_train)

# Make predictions
y_pred = classifier.predict(X_test)

# Evaluate the model
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))


Accuracy: 1.0

Classification Report:
                 precision    recall  f1-score   support

     Essential       1.00      1.00      1.00        14
Less Essential       1.00      1.00      1.00        15

      accuracy                           1.00        29
     macro avg       1.00      1.00      1.00        29
  weighted avg       1.00      1.00      1.00        29



In [41]:
from sklearn.model_selection import train_test_split

X = df.drop(['Amount_Category', 'Amount'], axis=1)  # Features (exclude the target and Amount columns)
y = df['Amount_Category']  # Target variable

# Splitting into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [20]:
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer

# Identify categorical and numerical columns
categorical_columns = X.select_dtypes(include=['object']).columns
numerical_columns = X.select_dtypes(include=['int64', 'float64']).columns

# Create preprocessors
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_columns),  # Scale numerical features
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_columns)  # One-hot encode categorical features
    ]
)
X_train = preprocessor.fit_transform(X_train)
X_test = preprocessor.transform(X_test)


In [21]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score

# Initialize and train the classifier
classifier = RandomForestClassifier(random_state=42)
classifier.fit(X_train, y_train)

# Predict on test data
y_pred = classifier.predict(X_test)

# Evaluate the model
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))


Accuracy: 0.9655172413793104

Classification Report:
               precision    recall  f1-score   support

         Low       0.97      1.00      0.98        28
      Medium       0.00      0.00      0.00         1

    accuracy                           0.97        29
   macro avg       0.48      0.50      0.49        29
weighted avg       0.93      0.97      0.95        29



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [82]:
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.preprocessing import LabelEncoder
print(df['Time'].unique())

df = df[pd.to_datetime(df['Time'], errors='coerce').notna()]

df['Hour'] = pd.to_datetime(df['Time']).dt.hour

le = LabelEncoder()
df['Time_of_Day'] = le.fit_transform(df['Time_of_Day'])



['7:00' '10:00' '13:00' '14:00' '15:00' '17:00' '21:30' '12:00' '18:00'
 '20:00' '10:32' '22:30' '12:30' '15:30' '17:30' '9:30' '22:40' '17:20'
 '9:00' '19:00' '21:00' '1.083333333' '23:30' '10:30' '19:30' '11:00'
 '14:10' '18:30' '14:30' '16:00']


In [83]:
df = pd.get_dummies(df, columns=['Category'], drop_first=True)

features = ['Amount', 'Hour'] + [col for col in df.columns if 'Category_' in col]


In [84]:
X = df[features]  # Features
y = df['Time_of_Day']  # Target variable
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [85]:
from sklearn.ensemble import RandomForestClassifier
model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)



In [86]:
y_pred = model.predict(X_test)


In [87]:
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))


[[29]]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        29

    accuracy                           1.00        29
   macro avg       1.00      1.00      1.00        29
weighted avg       1.00      1.00      1.00        29



In [88]:
df.head()

Unnamed: 0,Date,Item,Amount,Time,day,Time_of_Day,Hour,Category_friend
0,2023-03-01,chai,7,7:00,Wednesday,0,7,0
1,2023-03-01,chai,20,10:00,Wednesday,0,10,1
2,2023-03-01,juice,15,13:00,Wednesday,0,13,1
3,2023-03-01,rikshow,12,14:00,Wednesday,0,14,0
4,2023-03-01,coffee,12,15:00,Wednesday,0,15,0
