# 01 — Exploratory Data Analysis

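This notebook explores the Credit Card Fraud Detection dataset. It reads the full dataset from `data/raw/creditcard.csv` when that file exists and otherwise falls back to the sample in `data/sample/sample_transactions.csv`.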

**Contents:**
1. Load & inspect the dataset
2. Class imbalance visualization
3. Feature distributions
4. Correlation analysis

In [None]:
import sys
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# Allow imports from the project root (the parent of the current working
# directory). The membership check keeps this cell idempotent: re-running it
# does not stack duplicate entries on sys.path.
project_root = str(Path.cwd().parent)
if project_root not in sys.path:
    sys.path.insert(0, project_root)

# Display options: show up to 35 columns and render floats with 4 decimals.
pd.set_option("display.max_columns", 35)
pd.set_option("display.float_format", "{:.4f}".format)

## 1. Load & Inspect the Dataset

In [None]:
# Prefer the full raw dataset; read the sample file only when the full one
# is not present on disk.
DATA_DIR = Path.cwd().parent / "data"
full_path = DATA_DIR / "raw" / "creditcard.csv"
sample_path = DATA_DIR / "sample" / "sample_transactions.csv"

has_full_dataset = full_path.exists()
df = pd.read_csv(full_path if has_full_dataset else sample_path)

# Report which of the two files was actually read.
source_note = (
    f"Loaded FULL dataset: {full_path}"
    if has_full_dataset
    else f"Full dataset not found — using SAMPLE: {sample_path}"
)
print(source_note)

print(f"Shape: {df.shape}")
df.head()

In [None]:
# Print a concise summary of the frame: column dtypes and non-null counts.
df.info()

In [None]:
# Descriptive statistics per column; left as the cell's last expression so
# the notebook renders the resulting table.
df.describe()

In [None]:
# Count NaNs per column and report only the columns that have any.
missing = df.isna().sum()
columns_with_gaps = missing[missing > 0]

print("Missing values per column:")
if columns_with_gaps.empty:
    print("None")
else:
    print(columns_with_gaps)

## 2. Class Imbalance Visualization

In [None]:
# Count transactions per class. The counts are reindexed onto both expected
# class values so they always line up positionally with `labels` below; a
# plain value_counts() would return a single entry if the loaded data holds
# only one class, and that entry could then be drawn under the wrong label.
class_counts = df["Class"].value_counts().reindex([0, 1], fill_value=0)
labels = ["Legitimate (0)", "Fraud (1)"]
class_colors = ["#2ecc71", "#e74c3c"]  # same order as `labels`

# Left panel: absolute counts; right panel: share of each class (donut).
fig = make_subplots(
    rows=1, cols=2,
    subplot_titles=("Transaction Counts by Class", "Class Proportion"),
    specs=[[{"type": "bar"}, {"type": "pie"}]],
)

fig.add_trace(
    go.Bar(x=labels, y=class_counts.values, marker_color=class_colors),
    row=1, col=1,
)
fig.add_trace(
    go.Pie(labels=labels, values=class_counts.values,
           marker_colors=class_colors, hole=0.4),
    row=1, col=2,
)

fig.update_layout(title_text="Class Distribution", showlegend=False, height=400)
fig.show()

# Class is coded 0/1, so its mean is the fraction of fraudulent rows.
print(f"Fraud rate: {df['Class'].mean():.4%}")
print(f"Fraud count: {int(class_counts.get(1, 0))} / {len(df)}")

## 3. Feature Distributions

In [None]:
# Amount distribution, split by class, with a box plot in the margin.
class_palette = {0: "#2ecc71", 1: "#e74c3c"}
amount_axis_labels = {"Amount": "Transaction Amount ($)", "count": "Frequency"}

fig = px.histogram(
    df,
    x="Amount",
    color="Class",
    nbins=100,
    marginal="box",
    title="Transaction Amount Distribution by Class",
    labels=amount_axis_labels,
    color_discrete_map=class_palette,
).update_layout(height=450)
fig.show()

In [None]:
# Time distribution, split by class.
class_palette = {0: "#2ecc71", 1: "#e74c3c"}
time_axis_labels = {
    "Time": "Time (seconds from first transaction)",
    "count": "Frequency",
}

fig = px.histogram(
    df,
    x="Time",
    color="Class",
    nbins=100,
    title="Transaction Time Distribution by Class",
    labels=time_axis_labels,
    color_discrete_map=class_palette,
).update_layout(height=400)
fig.show()

In [None]:
# PCA feature distributions (V1–V28) — compare fraud vs. legitimate
v_features = [f"V{i}" for i in range(1, 29)]

# Grid layout: fixed column count, row count derived from the feature list.
n_grid_cols = 4
n_grid_rows = -(-len(v_features) // n_grid_cols)  # ceiling division

fig = make_subplots(rows=n_grid_rows, cols=n_grid_cols, subplot_titles=v_features)

# (class value, colour, legend name) for each overlaid histogram.
class_styles = [(0, "#2ecc71", "Legit"), (1, "#e74c3c", "Fraud")]

# Split the frame once per class rather than re-filtering it for every
# feature inside the loop.
features_by_class = {
    cls: df.loc[df["Class"] == cls, v_features] for cls, _, _ in class_styles
}

for idx, feat in enumerate(v_features):
    grid_row, grid_col = divmod(idx, n_grid_cols)
    for cls, color, name in class_styles:
        fig.add_trace(
            go.Histogram(
                x=features_by_class[cls][feat],
                name=name,
                marker_color=color,
                opacity=0.6,
                # One legend entry per class (drawn for the first subplot only);
                # legendgroup ties every subplot's trace to that entry so that
                # toggling it hides/shows the class in all panels at once.
                legendgroup=name,
                showlegend=(idx == 0),
            ),
            row=grid_row + 1, col=grid_col + 1,
        )

fig.update_layout(height=1400, title_text="PCA Feature Distributions (V1–V28)",
                  barmode="overlay")
fig.show()

## 4. Correlation Analysis

In [None]:
# Correlation with target variable.
# Only the column-vs-Class correlations are needed, so correlate each numeric
# column with Class directly instead of building the full pairwise matrix.
# Restricting to numeric dtypes first keeps this working if the frame ever
# carries a non-numeric column (DataFrame.corr() raises on those in
# pandas >= 2.0).
numeric_columns = df.select_dtypes(include="number")
correlations = (
    numeric_columns.drop(columns="Class")
    .corrwith(numeric_columns["Class"])
    .sort_values()
)

# Horizontal bars, sorted ascending; red for negative, green otherwise.
fig = go.Figure(go.Bar(
    x=correlations.values,
    y=correlations.index,
    orientation="h",
    marker_color=["#e74c3c" if v < 0 else "#2ecc71" for v in correlations.values],
))
fig.update_layout(
    title="Feature Correlation with Fraud (Class)",
    xaxis_title="Pearson Correlation",
    height=700,
)
fig.show()

In [None]:
# Full correlation heatmap over the V-features plus Amount, Time and Class.
heatmap_columns = [*v_features, "Amount", "Time", "Class"]
corr_matrix = df[heatmap_columns].corr()

# Colour scale is pinned to [-1, 1].
heatmap_trace = go.Heatmap(
    x=corr_matrix.columns,
    y=corr_matrix.index,
    z=corr_matrix.values,
    zmin=-1,
    zmax=1,
    colorscale="RdBu_r",
)

fig = go.Figure(data=[heatmap_trace])
fig.update_layout(title="Feature Correlation Heatmap", width=800, height=800)
fig.show()

In [None]:
# Top correlated features with Class.
# Filter by sign before taking the top five so a feature can never be listed
# under the wrong heading (e.g. when fewer than five correlations are
# negative). The comparisons also exclude NaN correlations, which
# sort_values() would otherwise place at the tail. nlargest/nsmallest return
# the strongest correlation first and do not depend on `correlations` being
# pre-sorted.
top_pos = correlations[correlations > 0].nlargest(5)
top_neg = correlations[correlations < 0].nsmallest(5)

print("Top positively correlated features with fraud:")
for feat, corr in top_pos.items():
    print(f"  {feat}: {corr:+.4f}")

print("\nTop negatively correlated features with fraud:")
for feat, corr in top_neg.items():
    print(f"  {feat}: {corr:+.4f}")