# 02 — EDA & Feature Engineering

Goals:
- Understand data distribution, correlations, fraud patterns
- Build rolling windows, aggregates, velocity features, risk heuristics
- Export ML-ready feature matrix


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
import sys
# Put the parent directory on the import path so the `src` package
# imported below can be resolved from this notebook's location.
sys.path.append('../')

from src.data.preprocess import load_raw_data, clean_data
from src.data.feature_engineering import FeatureEngineer

# Input directory: the raw transactions CSV is read from here.
# Both paths are relative, so they resolve against the working directory.
RAW_DIR = Path("../data/raw")
# Output directory: the engineered feature matrix is written here.
PROCESSED_DIR = Path("../data/processed")


## 1. Load Data


In [None]:
# Read the raw transactions file, report how many rows came back,
# and preview the first few rows as the cell output.
raw_csv_path = RAW_DIR / "transactions_raw.csv"
df = load_raw_data(raw_csv_path)
print(f"Loaded {len(df):,} transactions")
df.head()


## 2. Exploratory Analysis


In [None]:
# Class balance.
# NOTE(review): assumes `fraud` is a 0/1 flag (as the ==0 / ==1 filters
# suggest), so its mean is the fraud rate and its sum the fraud count.
print(f"Fraud rate: {df['fraud'].mean():.2%}")
print(f"Fraudulent: {df['fraud'].sum():,}, Legitimate: {(df['fraud']==0).sum():,}")

# Amount distributions: two side-by-side views of `amount` split by class.
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# Left panel: overlaid histograms. density=True normalises each class
# separately so the shapes can be compared despite differing class sizes.
for flag, label in ((0, 'Legitimate'), (1, 'Fraud')):
    axes[0].hist(
        df.loc[df['fraud'] == flag, 'amount'],
        bins=50,
        alpha=0.7,
        label=label,
        density=True,
    )
axes[0].set_xlabel('Amount')
axes[0].set_ylabel('Density')
axes[0].set_title('Amount Distribution')
axes[0].legend()

# Right panel: per-class box plot of the same column, so the second
# axis of the 1x2 grid is actually used instead of rendering blank.
sns.boxplot(data=df, x='fraud', y='amount', ax=axes[1])
axes[1].set_xlabel('Fraud')
axes[1].set_ylabel('Amount')
axes[1].set_title('Amount by Class')

plt.tight_layout()
plt.show()


## 3. Feature Engineering


In [None]:
# Clean and engineer features
# Clean the raw frame, then derive features from the cleaned result.
# clean_data comes from src.data.preprocess and FeatureEngineer from
# src.data.feature_engineering; their transformations are not shown here.
df_clean = clean_data(df)
engineer = FeatureEngineer()
df_features = engineer.engineer_features(df_clean)

# Quick sanity check on what was produced.
# NOTE(review): presumably `feature_names` is populated by
# engineer_features() — confirm against FeatureEngineer.
print(f"Features created: {len(engineer.feature_names)}")
print(f"Shape: {df_features.shape}")


## 4. Export Feature Matrix


In [None]:
# Write the engineered features to CSV without the row index,
# creating the output directory (and any parents) if it is missing.
features_path = PROCESSED_DIR / "features.csv"
PROCESSED_DIR.mkdir(parents=True, exist_ok=True)
df_features.to_csv(features_path, index=False)
print(f"Saved feature matrix to {features_path}")
