# Data Overview â€” Fraud Detection Project 

This notebook gives a **project content** overview of both fraud datasets used in the pipeline.

## Objectives
1. Load e-commerce and credit card fraud data using shared project constants (`src`)
2. Report shape, columns, and class distribution for both datasets
3. Save overview plots (class balance) for presentation

In [3]:
import sys
!{sys.executable} -m pip install pandas matplotlib
import pandas as pd
import matplotlib.pyplot as plt
from pathlib import Path

import sys
sys.path.insert(0, str(Path().resolve().parent))
from src import (
    ECOMMERCE_RAW_FILE,
    CREDITCARD_RAW_FILE,
    ECOMMERCE_TARGET_COLUMN,
    CREDITCARD_TARGET_COLUMN,
)

project_root = Path().resolve()
if project_root.name == 'notebooks':
    project_root = project_root.parent
else:
    current = Path().resolve()
    while current != current.parent:
        if (current / 'data').exists():
            project_root = current
            break
        current = current.parent

DATA_DIR = project_root / 'data' / 'raw'
OUTPUT_DIR = project_root / 'outputs' / 'eda' / 'overview'
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

def save_plot(fig, filename, dpi=300, bbox_inches='tight'):
    filepath = OUTPUT_DIR / filename
    fig.savefig(filepath, dpi=dpi, bbox_inches=bbox_inches)
    rel = filepath.relative_to(project_root)
    print(f"Plot saved to: {rel}")

print(f"Data directory: {DATA_DIR.relative_to(project_root)}")
print(f"Output directory: {OUTPUT_DIR.relative_to(project_root)}")

Defaulting to user installation because normal site-packages is not writeable


ValueError: numpy.dtype size changed, may indicate binary incompatibility. Expected 96 from C header, got 88 from PyObject

## 1. E-commerce fraud data (Fraud_Data.csv)

In [None]:
ecom_path = DATA_DIR / ECOMMERCE_RAW_FILE
ecom = pd.read_csv(ecom_path) if ecom_path.exists() else pd.DataFrame()
print(f"Shape: {ecom.shape}")
print(f"Columns: {ecom.columns.tolist()}")
if ECOMMERCE_TARGET_COLUMN in ecom.columns:
    print(ecom[ECOMMERCE_TARGET_COLUMN].value_counts())
ecom.head()

NameError: name 'DATA_DIR' is not defined

## 2. Credit card fraud data (creditcard.csv)

In [None]:
cc_path = DATA_DIR / CREDITCARD_RAW_FILE
cc = pd.read_csv(cc_path) if cc_path.exists() else pd.DataFrame()
print(f"Shape: {cc.shape}")
print(f"Columns: {cc.columns.tolist()}")
if CREDITCARD_TARGET_COLUMN in cc.columns:
    print(cc[CREDITCARD_TARGET_COLUMN].value_counts())
cc.head()

## 3. Class balance overview (both datasets)

In [None]:
fig, axes = plt.subplots(1, 2, figsize=(10, 4))

if ECOMMERCE_TARGET_COLUMN in ecom.columns:
    ecom[ECOMMERCE_TARGET_COLUMN].value_counts().sort_index().plot(kind='bar', ax=axes[0], color=['#2ecc71', '#e74c3c'])
    axes[0].set_title('E-commerce: Class distribution')
    axes[0].set_xlabel('Class')
    axes[0].set_ylabel('Count')

if CREDITCARD_TARGET_COLUMN in cc.columns:
    cc[CREDITCARD_TARGET_COLUMN].value_counts().sort_index().plot(kind='bar', ax=axes[1], color=['#2ecc71', '#e74c3c'])
    axes[1].set_title('Credit card: Class distribution')
    axes[1].set_xlabel('Class')
    axes[1].set_ylabel('Count')

plt.tight_layout()
save_plot(fig, 'overview_class_balance_both_datasets.png')
plt.show()

NameError: name 'plt' is not defined