Personal Finance: Expense Data Cleaner

In [None]:
# Import required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import IsolationForest

In [None]:
# Location of the sample expense data (raw CSV hosted on GitHub)
url = "https://raw.githubusercontent.com/NickCutrone/Expense_Data_Cleanup/refs/heads/main/sample_data.csv"

# Read the remote CSV straight into a DataFrame
df = pd.read_csv(filepath_or_buffer=url)

# Preview the top five rows to sanity-check the load
print(df.head(5))

In [None]:
# Preview the top five rows of the loaded data
print(df.iloc[:5])

In [None]:
# Standardize merchant names
#
# Canonical spelling of every merchant this notebook knows about.
canonical_merchants = [
    'Water Bill',
    'Spotify',
    'Netflix',
    'Amazon',
    'Apple Store',
    'Electricity Bill',
    'Lyft',
    'Insurance',
    'Gym Membership',
    'Uber',
    'Phone Bill',
    "McDonald's",
    'Walmart',
    'Restaurant',
    'Rent Payment',
    'Shell Gas',
    'Starbucks',
    'Best Buy',
    'BP Gas',
    'Target',
]

# Same name -> canonical-name dict as before, kept under its original name.
merchant_mapping = {name: name for name in canonical_merchants}

# An exact-match replace() with a mapping whose keys equal their values leaves
# every cell unchanged, so matching is done on a normalised key instead:
# internal whitespace collapsed, ends stripped, case folded. This lets variants
# such as ' amazon ' or 'NETFLIX' resolve to the canonical spelling.
merchant_lookup = {
    ' '.join(name.split()).casefold(): name for name in merchant_mapping
}

merchant_keys = (
    df['Merchant']
    .astype('string')  # lets the .str accessor work even with missing/non-text cells
    .str.replace(r'\s+', ' ', regex=True)
    .str.strip()
    .str.casefold()
)

# Values that match no known merchant (including missing values) keep their
# original content, exactly as replace() would have left them.
df['Merchant'] = merchant_keys.map(merchant_lookup).fillna(df['Merchant'])

In [None]:
# Data quality flags
# 'Duplicate': True for every repeat of a (Date, Merchant, Amount) combination
# after its first occurrence.
duplicate_keys = ['Date', 'Merchant', 'Amount']
df['Duplicate'] = df.duplicated(subset=duplicate_keys, keep='first')

# 'Missing Values': True when at least one cell in the row is null.
df['Missing Values'] = df.isna().any(axis=1)

In [None]:
# Categorize expenses
# Merchants are listed under the spending category they belong to; the flat
# merchant -> category lookup used for mapping is derived from this grouping.
merchants_by_category = {
    'Shopping': ('Amazon', 'Apple Store', 'Target'),
    'Transport': ('Uber', 'Lyft'),
    'Food & Drink': ('Starbucks', "McDonald's", 'Restaurant'),
    'Entertainment': ('Netflix', 'Spotify'),
    'Electronics': ('Best Buy',),
    'Gas': ('Shell Gas', 'BP Gas'),
    'Groceries': ('Walmart',),
    'Housing': ('Rent Payment',),
    'Health & Fitness': ('Gym Membership',),
    'Bills': ('Insurance', 'Phone Bill'),
    'Utilities': ('Electricity Bill', 'Water Bill'),
}
category_mapping = {
    merchant: category
    for category, merchants in merchants_by_category.items()
    for merchant in merchants
}
# Look up each row's category from its merchant name; merchants that are not
# keys of category_mapping end up with a missing (NaN) category.
df['Category'] = df['Merchant'].map(category_mapping)

In [None]:
# Recompute the category column from the merchant -> category lookup
df['Category'] = df['Merchant'].map(category_mapping)

# Collect every row that lacks a merchant or a category
needs_review = df[['Merchant', 'Category']].isna().any(axis=1)
unmapped_data = df.loc[needs_review]

# Show the rows that could not be mapped
print(unmapped_data)

In [None]:
# Detect anomalous expenses within each (Merchant, Category) group using an
# Isolation Forest fitted on the Amount column.
#
# Output columns:
#   Anomalous     - True where fit_predict labelled the row -1, otherwise False
#   Anomaly_Score - the raw fit_predict label for evaluated rows, NaN for rows
#                   that were not evaluated
#
# Both columns are (re)initialised up front so they always exist in the frame
# and in the saved CSV, even when no group is large enough to be evaluated, and
# so re-running this cell never leaves labels from an earlier run behind.
df['Anomalous'] = False  # Default all values to False
df['Anomaly_Score'] = float('nan')  # Default: not evaluated

# Rows without an Amount carry nothing to evaluate, so they are left at the
# defaults and do not count toward the group-size threshold.
# NOTE(review): whether IsolationForest itself accepts NaN input depends on the
# installed scikit-learn version - confirm before removing this filter.
rows_with_amount = df[df['Amount'].notna()]

# groupby leaves out rows whose Merchant or Category is missing, so unmapped
# rows are never evaluated and keep the defaults as well.
for (merchant, category), group in rows_with_amount.groupby(['Merchant', 'Category']):
    if len(group) > 5:  # Ensure enough samples for Isolation Forest
        # Fixed random_state keeps the labels reproducible across runs.
        iso_forest = IsolationForest(contamination=0.05, random_state=42)
        anomaly_scores = iso_forest.fit_predict(group[['Amount']])
        df.loc[group.index, 'Anomaly_Score'] = anomaly_scores
        df.loc[group.index, 'Anomalous'] = anomaly_scores == -1

In [None]:
# Show only the rows flagged as anomalous
flagged_rows = df.loc[df['Anomalous']]
print(flagged_rows)

In [None]:
# Write the processed frame to disk, leaving out the index column
output_path = 'cleaned_expenses.csv'
df.to_csv(path_or_buf=output_path, index=False)

In [None]:
# Preview the top five rows of the fully processed DataFrame
print(df.head(n=5))