# Week 2 – Expense Tracking: Data Collection & Preprocessing in Python

# Step 1: Upload CSV Files

In [8]:
from google.colab import files

# Upload 'expenses.csv' and 'categories.csv'.
# The load step reads the files by exactly these names (relative paths), so
# the uploaded files must keep these names.
uploaded = files.upload()


# Step 2: Import Required Libraries

In [9]:
import pandas as pd
import numpy as np

# Step 3: Load the CSV Files

In [10]:
# Read the two uploaded tables into DataFrames: one row per expense, and the
# category lookup table that is joined onto it later.
expenses, categories = (
    pd.read_csv(csv_name) for csv_name in ('expenses.csv', 'categories.csv')
)

# Step 4: Explore and Clean the Data

In [11]:
# Clean the raw `expenses` and `categories` frames and give their columns the
# names the later cells rely on (UserID, CategoryID, Amount, ExpenseDate,
# Description, Category).
#
# NOTE: this cell rebinds `expenses` / `categories` with renamed columns, so
# re-run the load cell before re-running this one; otherwise the lowercase
# column lookups below raise KeyError.

# Show original column names
print("Expenses Columns:", expenses.columns.tolist())
print("Categories Columns:", categories.columns.tolist())

# Normalize column names (trim whitespace, lowercase) so the lookups below do
# not depend on how the CSV header was typed.
expenses.columns = expenses.columns.str.strip().str.lower()
categories.columns = categories.columns.str.strip().str.lower()

# Convert date column; values that cannot be parsed become NaT
# (errors='coerce') and are removed by the dropna that follows.
expenses['expense_date'] = pd.to_datetime(expenses['expense_date'], errors='coerce')

# Drop incomplete rows. Every column is required except the free-text
# description: discarding an expense only because its description is blank
# would silently remove real spend from every total computed later.
required_columns = expenses.columns.difference(['description']).tolist()
expenses = expenses.dropna(subset=required_columns)
categories = categories.dropna()

# Rename columns for clarity
expenses = expenses.rename(columns={
    'user_id': 'UserID',
    'category_id': 'CategoryID',
    'amount': 'Amount',
    'expense_date': 'ExpenseDate',
    'description': 'Description'
})

categories = categories.rename(columns={
    'category_id': 'CategoryID',
    'name': 'Category'
})

Expenses Columns: ['expense_id', 'user_id', 'category_id', 'amount', 'expense_date', 'description']
Categories Columns: ['category_id', 'name']


# Step 5: Merge & Analyze with NumPy


In [12]:

# Merge to add category names
df = pd.merge(expenses, categories, on='CategoryID', how='left')

# Extract month
df['Month'] = df['ExpenseDate'].dt.to_period('M')

# Convert amount to float
df['Amount'] = df['Amount'].replace('[\$,]', '', regex=True).astype(float)

# Monthly totals and averages
monthly_total = df.groupby('Month')['Amount'].sum()
monthly_avg = df.groupby('Month')['Amount'].mean()

print(" Monthly Totals:")
print(monthly_total)

print("\nMonthly Averages:")
print(monthly_avg)

# Category-wise breakdown
category_breakdown = df.groupby(['Month', 'Category'])['Amount'].sum().unstack().fillna(0)

print("\n Category-wise Monthly Breakdown:")
print(category_breakdown)

 Monthly Totals:
Month
2025-07    4000.0
Freq: M, Name: Amount, dtype: float64

Monthly Averages:
Month
2025-07    800.0
Freq: M, Name: Amount, dtype: float64

 Category-wise Monthly Breakdown:
Category  Entertainment  Groceries  Transport  Utilities
Month                                                   
2025-07           500.0     2100.0      300.0     1100.0


# Step 6: Export Cleaned & Summary Files

In [13]:
# Write both result tables to disk first, then hand each file to the browser.
cleaned_path = "cleaned_expenses.csv"
report_path = "monthly_expense_report.csv"

# index=False: the row index of the merged frame is not written out.
df.to_csv(cleaned_path, index=False)
# The breakdown is indexed by Month, so its index is kept as the first column.
category_breakdown.to_csv(report_path)

# Download files, in the same order they were written.
for export_path in (cleaned_path, report_path):
    files.download(export_path)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

# Step 7: Display Cleaned Output

In [14]:
# Round-trip check: load the two exported CSV files back from disk ...
df_cleaned = pd.read_csv("cleaned_expenses.csv")
df_report = pd.read_csv("monthly_expense_report.csv")

# ... and preview the first five rows of each under its own heading.
for heading, frame in (
    ("\nCleaned Expenses Data:", df_cleaned),
    ("\nMonthly Category-wise Expense Report:", df_report),
):
    print(heading)
    display(frame.head(5))


Cleaned Expenses Data:


Unnamed: 0,expense_id,UserID,CategoryID,Amount,ExpenseDate,Description,Category,Month
0,1,1,1,1200.0,2025-07-01,Monthly grocery shopping,Groceries,2025-07
1,2,1,2,300.0,2025-07-02,Bus pass,Transport,2025-07
2,3,1,3,500.0,2025-07-10,Movie night,Entertainment,2025-07
3,4,2,1,900.0,2025-07-03,Grocery,Groceries,2025-07
4,5,2,4,1100.0,2025-07-05,Electricity bill,Utilities,2025-07



Monthly Category-wise Expense Report:


Unnamed: 0,Month,Entertainment,Groceries,Transport,Utilities
0,2025-07,500.0,2100.0,300.0,1100.0


## Generate Per-User Monthly Expense Breakdown by Category

In [17]:
# Total spend per (user, month), pivoted so that each category becomes a
# column; user/month/category combinations with no expenses are shown as 0.
user_monthly_report = (
    df.groupby(['UserID', 'Month', 'Category'])['Amount']
    .sum()
    .unstack('Category')
    .fillna(0)
)

# Save the report to CSV and offer it as a browser download.
report_file = 'per_user_monthly_report.csv'
user_monthly_report.to_csv(report_file)
files.download(report_file)

# Preview: at most the first 10 (user, month) rows.
print("\nPer-User Monthly Category-wise Expense Report:")
display(user_monthly_report.head(10))


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>


Per-User Monthly Category-wise Expense Report:


Unnamed: 0_level_0,Category,Entertainment,Groceries,Transport,Utilities
UserID,Month,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,2025-07,500.0,1200.0,300.0,0.0
2,2025-07,0.0,900.0,0.0,1100.0
