# Financial Data EDA and Preprocessing
**From Hasif's Workspace**

This notebook performs exploratory data analysis and preprocessing on financial transaction data for the AI-Powered Personal Finance Advisor.

In [None]:
import random
import sys
import warnings
from datetime import datetime, timedelta
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
# Silence library warnings so cell output stays readable.
# NOTE(review): this blanket filter suppresses every warning category,
# including deprecation notices; consider narrowing it once the notebook is stable.
warnings.filterwarnings('ignore')

# Set style
# The seaborn style sheets are named 'seaborn-v0_8' from matplotlib 3.6 onward
# and plain 'seaborn' in older releases. Use whichever one is installed rather
# than raising OSError on an unknown style name; if neither exists, keep the
# matplotlib default.
_seaborn_style = next(
    (name for name in ('seaborn-v0_8', 'seaborn') if name in plt.style.available),
    None,
)
if _seaborn_style is not None:
    plt.style.use(_seaborn_style)
sns.set_palette("husl")

print("Libraries imported successfully!")

## 1. Data Loading and Initial Exploration

In [None]:
# Generate synthetic data for demonstration
# Make the project's scripts directory importable. The membership check keeps
# this cell idempotent: re-running it no longer appends a duplicate entry.
SCRIPTS_DIR = '../scripts'
if SCRIPTS_DIR not in sys.path:
    sys.path.append(SCRIPTS_DIR)
from generate_synthetic_data import generate_transactions

# Seed the global RNGs so a Restart & Run All yields the same sample.
# NOTE(review): assumes generate_transactions draws from `random` and/or
# `numpy.random` global state - confirm against the script; if it uses its own
# generator (or another library), seed that instead.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)

# Generate sample data
df = generate_transactions(1000)
print(f"Generated {len(df)} transactions")
df.head()

In [None]:
# Structural overview of the dataset: dimensions, column names, dtypes and
# per-column null counts, emitted as a single text report.
overview_lines = [
    "Dataset Info:",
    f"Shape: {df.shape}",
    f"Columns: {list(df.columns)}",
    f"Data types:\n{df.dtypes}",
    f"\nMissing values:\n{df.isna().sum()}",
]
print("\n".join(overview_lines))

## 2. Data Preprocessing

In [None]:
# Parse the raw date column into pandas datetimes (overwrites it in place)
df['date'] = pd.to_datetime(df['date'])

# Derive calendar features from the parsed dates through the .dt accessor
date_parts = df['date'].dt
df['day_of_week'] = date_parts.day_name()
df['month'] = date_parts.month_name()
df['quarter'] = date_parts.to_period('Q').astype(str)
# pandas numbers weekdays Monday=0 ... Sunday=6, so 5 and 6 are the weekend
df['is_weekend'] = date_parts.dayofweek.isin([5, 6])

print("Time-based features added successfully!")
df.head()

## 3. Exploratory Data Analysis

In [None]:
# Descriptive statistics as produced by DataFrame.describe()
print("Summary Statistics:")
summary_stats = df.describe()
summary_stats

In [None]:
# Two side-by-side panels showing how transactions split across categories
fig, (ax_share, ax_total) = plt.subplots(1, 2, figsize=(12, 6))

# Left panel: share of the transaction count held by each category
category_counts = df['category'].value_counts()
ax_share.pie(category_counts.values, labels=category_counts.index, autopct='%1.1f%%')
ax_share.set_title('Transaction Distribution by Category')

# Right panel: summed amount per category, largest first
category_amounts = df.groupby('category')['amount'].sum().sort_values(ascending=False)
bar_positions = range(len(category_amounts))
ax_total.bar(bar_positions, category_amounts.values)
ax_total.set_xticks(bar_positions)
ax_total.set_xticklabels(category_amounts.index, rotation=45)
ax_total.set_title('Total Amount by Category')
ax_total.set_ylabel('Amount ($)')

fig.tight_layout()
plt.show()

In [None]:
# Spending patterns over time: a 2x2 grid of daily, weekly and monthly debit
# totals plus the overall amount distribution.
plt.figure(figsize=(15, 10))

# The three spending panels look at debits only; filter once and reuse.
debits = df[df['transaction_type'] == 'Debit']

# Daily spending trend
plt.subplot(2, 2, 1)
daily_spending = debits.groupby('date')['amount'].sum()
plt.plot(daily_spending.index, daily_spending.values)
plt.title('Daily Spending Trend')
plt.xlabel('Date')
plt.ylabel('Amount ($)')
plt.xticks(rotation=45)

# Weekly pattern
plt.subplot(2, 2, 2)
weekly_pattern = debits.groupby('day_of_week')['amount'].sum()
# groupby sorts the day names alphabetically; reindex into weekday order
day_order = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
weekly_pattern = weekly_pattern.reindex(day_order)
plt.bar(weekly_pattern.index, weekly_pattern.values)
plt.title('Weekly Spending Pattern')
plt.xlabel('Day of Week')
plt.ylabel('Total Amount ($)')
plt.xticks(rotation=45)

# Monthly pattern
plt.subplot(2, 2, 3)
monthly_pattern = debits.groupby('month')['amount'].sum()
# groupby sorts the month names alphabetically (April, August, December, ...),
# which scrambles the time axis. Reorder into calendar order, keeping only the
# months that actually occur so no empty slots are added to the chart.
month_order = [
    'January', 'February', 'March', 'April', 'May', 'June',
    'July', 'August', 'September', 'October', 'November', 'December',
]
months_present = [name for name in month_order if name in monthly_pattern.index]
monthly_pattern = monthly_pattern.reindex(months_present)
plt.bar(monthly_pattern.index, monthly_pattern.values)
plt.title('Monthly Spending Pattern')
plt.xlabel('Month')
plt.ylabel('Total Amount ($)')
plt.xticks(rotation=45)

# Amount distribution (all transactions, not only debits)
plt.subplot(2, 2, 4)
plt.hist(df['amount'], bins=50, alpha=0.7)
plt.title('Transaction Amount Distribution')
plt.xlabel('Amount ($)')
plt.ylabel('Frequency')

plt.tight_layout()
plt.show()

## 4. Feature Engineering

In [None]:
# Build derived features on a date-ordered copy of the transactions
df_sorted = df.sort_values('date')
amount = df_sorted['amount']

# Rolling statistics over a window of 7 rows (a single row is enough to emit a value)
window_7 = amount.rolling(window=7, min_periods=1)
df_sorted['rolling_mean_7'] = window_7.mean()
df_sorted['rolling_std_7'] = window_7.std()

# Lagged features: the amount 1 and 7 rows earlier in date order
df_sorted['amount_lag_1'] = amount.shift(1)
df_sorted['amount_lag_7'] = amount.shift(7)

# Z-score for anomaly detection, standardised against the whole column
amount_mean = amount.mean()
amount_std = amount.std()
df_sorted['amount_zscore'] = (amount - amount_mean) / amount_std

new_features = ['rolling_mean_7', 'rolling_std_7', 'amount_lag_1', 'amount_lag_7', 'amount_zscore']
print("Feature engineering completed!")
print(f"New features: {new_features}")

## 5. Data Quality Assessment

In [None]:
# Flag outliers with the IQR rule: any amount further than 1.5 * IQR
# below the first quartile or above the third quartile
amounts = df['amount']
Q1, Q3 = amounts.quantile(0.25), amounts.quantile(0.75)
IQR = Q3 - Q1
lower_bound, upper_bound = Q1 - 1.5 * IQR, Q3 + 1.5 * IQR

below_range = amounts < lower_bound
above_range = amounts > upper_bound
outliers = df[below_range | above_range]

outlier_count = len(outliers)
outlier_share = outlier_count / len(df) * 100
print(f"Number of outliers: {outlier_count} ({outlier_share:.2f}%)")

# Show the first few flagged rows, if there are any
if outlier_count > 0:
    preview_columns = ['date', 'description', 'amount', 'category']
    print("\nOutlier transactions:")
    print(outliers[preview_columns].head(10))

In [None]:
# Pairwise correlations between the numeric columns of the engineered frame
numeric_frame = df_sorted.select_dtypes(include=[np.number])
numeric_cols = numeric_frame.columns
correlation_matrix = numeric_frame.corr()

# Annotated heatmap with the colour scale centred on zero correlation
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', center=0, ax=ax)
ax.set_title('Feature Correlation Matrix')
fig.tight_layout()
plt.show()

## 6. Save Processed Data

In [None]:
# Save the processed dataset
output_path = '../data/processed_transactions.csv'
# to_csv does not create missing directories, so make sure the target folder
# exists first; on a fresh checkout without ../data the write would fail.
Path(output_path).parent.mkdir(parents=True, exist_ok=True)
df_sorted.to_csv(output_path, index=False)
print(f"Processed data saved to {output_path}")

# Summary of the processed dataset
print(f"\nFinal dataset shape: {df_sorted.shape}")
print(f"Date range: {df_sorted['date'].min()} to {df_sorted['date'].max()}")
print(f"Total amount: ${df_sorted['amount'].sum():,.2f}")
print(f"Categories: {df_sorted['category'].nunique()}")