# Data Exploration - ML Pipeline Platform

Quick exploration of transaction data for the fraud detection model.

**Purpose**: Validate data quality and identify key features for the ML pipeline.

In [None]:
# Essential imports
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

# Notebook-wide presentation settings: show at most 10 DataFrame columns
# when printing, and draw every figure with the seaborn dark-grid theme.
pd.options.display.max_columns = 10
plt.style.use('seaborn-v0_8-darkgrid')
print("Environment ready")

## 1. Load Data

In [None]:
# Prefer the bundled demo CSV; when it is absent, synthesise a reproducible
# stand-in dataset so the remaining cells can still run.
data_path = Path('../sample_data/demo/datasets/fraud_detection.csv')

if not data_path.exists():
    # A fixed seed makes the synthetic sample identical on every run.
    np.random.seed(42)
    n_samples = 1000

    # Columns are drawn one after another in this exact order; changing the
    # order would change the values produced by the seeded random stream.
    categories = ['electronics', 'grocery', 'gas', 'restaurant', 'online']
    columns = {}
    columns['amount'] = np.random.lognormal(4, 2, n_samples)
    columns['merchant_category'] = np.random.choice(categories, n_samples)
    columns['hour_of_day'] = np.random.randint(0, 24, n_samples)
    columns['is_weekend'] = np.random.choice([0, 1], n_samples)
    columns['risk_score'] = np.random.beta(2, 5, n_samples)
    columns['days_since_last'] = np.random.exponential(5, n_samples)
    columns['num_transactions_today'] = np.random.poisson(3, n_samples)
    # 5% fraud rate
    columns['label'] = np.random.choice([0, 1], n_samples, p=[0.95, 0.05])
    df = pd.DataFrame(columns)

    # Shift the fraud rows so the two classes are distinguishable:
    # double their risk score and scale their amount by 1.5.
    fraud_idx = df['label'].eq(1)
    df.loc[fraud_idx, 'risk_score'] = df.loc[fraud_idx, 'risk_score'] * 2
    df.loc[fraud_idx, 'amount'] = df.loc[fraud_idx, 'amount'] * 1.5

    print(f"Generated {len(df)} sample transactions")
else:
    df = pd.read_csv(data_path)
    print(f"Loaded {len(df)} transactions from {data_path}")

# Last expression of the cell: the notebook renders the first rows.
df.head()

## 2. Quick Data Quality Check

In [None]:
# Headline data-quality figures: table size, total missing cells, fully
# duplicated rows, share of fraud labels, and how many columns use each dtype.
total_missing = df.isnull().sum().sum()
duplicate_rows = df.duplicated().sum()
fraud_pct = df['label'].mean()*100
dtype_counts = df.dtypes.value_counts()

print("DATA QUALITY SUMMARY")
print("="*50)
print(f"Shape: {df.shape}")
print(f"Missing values: {total_missing}")
print(f"Duplicates: {duplicate_rows}")
print(f"\nFraud rate: {fraud_pct:.1f}%")
print("\nData types:")
print(dtype_counts)

# Descriptive statistics for the numeric columns, rounded for readability;
# left as the final expression so the notebook renders the table.
print("\nNumerical features summary:")
df.describe().round(2)

## 3. Feature Analysis

In [None]:
# Compare fraud and normal transactions on a 2x2 grid: overlaid per-class
# histograms on the top row, fraud-rate charts on the bottom row.
fig, axes = plt.subplots(2, 2, figsize=(12, 8))

# Split by class once; the subsets feed both the histograms and the printed
# summary, so the frame is not re-filtered for every use.
normal_tx = df[df['label'] == 0]
fraud_tx = df[df['label'] == 1]

# Top row: one histogram panel per continuous feature, both classes overlaid.
histogram_specs = [
    (axes[0, 0], 'amount', 'Transaction Amount', 'Amount Distribution'),
    (axes[0, 1], 'risk_score', 'Risk Score', 'Risk Score Distribution'),
]
for ax, column, xlabel, title in histogram_specs:
    for subset, class_name in ((normal_tx, 'Normal'), (fraud_tx, 'Fraud')):
        ax.hist(subset[column], alpha=0.6, label=class_name, bins=20)
    ax.set_xlabel(xlabel)
    ax.set_ylabel('Frequency')
    ax.legend()
    ax.set_title(title)

# Bottom left: mean of the 0/1 label per merchant category, i.e. fraud rate.
fraud_by_merchant = df.groupby('merchant_category')['label'].mean()
axes[1, 0].bar(fraud_by_merchant.index, fraud_by_merchant.values)
axes[1, 0].set_xlabel('Merchant Category')
axes[1, 0].set_ylabel('Fraud Rate')
axes[1, 0].set_title('Fraud Rate by Merchant')
axes[1, 0].tick_params(axis='x', rotation=45)

# Bottom right: fraud rate for each hour of the day.
hourly_fraud = df.groupby('hour_of_day')['label'].mean()
axes[1, 1].plot(hourly_fraud.index, hourly_fraud.values, 'o-')
axes[1, 1].set_xlabel('Hour of Day')
axes[1, 1].set_ylabel('Fraud Rate')
axes[1, 1].set_title('Fraud Rate by Hour')
axes[1, 1].grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

# Numeric summary of the class differences shown in the histograms above.
amount_ratio = fraud_tx['amount'].mean() / normal_tx['amount'].mean()
fraud_risk = fraud_tx['risk_score'].mean()
normal_risk = normal_tx['risk_score'].mean()

print("Key observations:")
print(f"• Fraud transactions have {amount_ratio:.1f}x higher amounts")
print(f"• Risk score for fraud: {fraud_risk:.3f} vs normal: {normal_risk:.3f}")

## 4. Feature Correlation

In [None]:
# Imported at the top of the cell so a missing seaborn install fails before
# any output or empty figure is produced.
import seaborn as sns

# Restrict to numeric columns; correlation is undefined for string columns.
numerical_cols = df.select_dtypes(include=[np.number]).columns

# Compute the pairwise correlation matrix once and reuse it for both the
# ranked listing and the heatmap.
corr_matrix = df[numerical_cols].corr()

# Correlation of every numeric feature with the fraud label, strongest
# positive first. Later cells read `fraud_correlations`.
fraud_correlations = corr_matrix['label'].sort_values(ascending=False)

print("Feature Correlations with Fraud Label:")
print("="*40)
for feature, corr in fraud_correlations.items():
    # Skip the label's trivial correlation with itself.
    if feature != 'label':
        print(f"{feature:25s}: {corr:+.3f}")

# Visual correlation matrix
plt.figure(figsize=(8, 6))
sns.heatmap(corr_matrix, annot=True, fmt='.2f', cmap='coolwarm', center=0)
plt.title('Feature Correlation Matrix')
plt.tight_layout()
plt.show()

## 5. Feature Engineering Ideas

In [None]:
# Derive candidate features from the raw columns.
# log1p compresses the range of the transaction amounts.
df['amount_log'] = np.log1p(df['amount'])
df['risk_amount_interaction'] = df['risk_score'] * df['amount']

# Flag rows whose risk score lies above the 75th percentile of the column.
risk_cutoff = df['risk_score'].quantile(0.75)
df['high_risk'] = (df['risk_score'] > risk_cutoff).astype(int)

# Flag hours 0-5 and 23. A vectorised mask replaces the per-row lambda and
# produces the same 0/1 values.
hour = df['hour_of_day']
df['unusual_hour'] = ((hour < 6) | (hour > 22)).astype(int)

# Rank the engineered features by their correlation with the fraud label.
new_features = ['amount_log', 'risk_amount_interaction', 'high_risk', 'unusual_hour']
new_correlations = df[new_features + ['label']].corr()['label'].sort_values(ascending=False)

print("Engineered Feature Correlations:")
print("="*40)
for feature, corr in new_correlations.items():
    # Skip the label's trivial correlation with itself.
    if feature != 'label':
        print(f"{feature:25s}: {corr:+.3f}")

## 6. Model Readiness Summary

In [None]:
# Final summary cell: run a pass/fail checklist over the dataset, print basic
# statistics, list the features most correlated with the label, and state
# whether the data is ready for training.
print("\n" + "="*60)
print("MODEL READINESS ASSESSMENT")
print("="*60)

fraud_rate = df['label'].mean()

# Data readiness checklist
checklist = {
    "Sufficient data": len(df) >= 1000,
    "No missing values": df.isnull().sum().sum() == 0,
    "Balanced classes": 0.01 < fraud_rate < 0.5,
    # numeric_only=True is required: the frame contains a string column
    # (merchant_category) and pandas >= 2.0 raises TypeError otherwise.
    "Feature variation": df.std(numeric_only=True).min() > 0,
    "Numerical features": len(df.select_dtypes(include=[np.number]).columns) >= 3,
}

print("\nData Quality Checklist:")
for check, passed in checklist.items():
    status = "✅" if passed else "❌"
    print(f"  {status} {check}")

print("\n📊 Dataset Statistics:")
print(f"  • Records: {len(df):,}")
# Every column except the label counts as a feature.
print(f"  • Features: {len(df.columns)-1}")
print(f"  • Fraud rate: {fraud_rate*100:.1f}%")
print(f"  • Memory usage: {df.memory_usage(deep=True).sum()/1024**2:.2f} MB")

print("\n🎯 Recommended Features for Model:")
# Rank by absolute correlation with the label, excluding the label itself,
# and keep the five strongest. `fraud_correlations` comes from the
# correlation cell earlier in the notebook.
feature_strength = fraud_correlations.drop('label', errors='ignore').abs()
top_features = feature_strength.nlargest(5).index.tolist()
for i, feature in enumerate(top_features, 1):
    print(f"  {i}. {feature} (correlation: {fraud_correlations[feature]:+.3f})")

# Only announce readiness when every check actually passed.
if all(checklist.values()):
    print("\n✅ Data is ready for model training!")
    print("\nNext step: Run model_training_analysis.ipynb")
else:
    failed_checks = [check for check, passed in checklist.items() if not passed]
    print(f"\n❌ Data is NOT ready for model training - failed checks: {', '.join(failed_checks)}")

In [None]:
# Export the DataFrame, including the engineered columns, for the model
# training notebook.
output_path = Path('../sample_data/demo/datasets')
# Create the directory tree if needed; an existing directory is not an error.
output_path.mkdir(parents=True, exist_ok=True)

# Build the destination once so the file written and the path reported
# cannot drift apart.
processed_file = output_path / 'fraud_detection_processed.csv'

# index=False keeps the DataFrame index out of the CSV.
df.to_csv(processed_file, index=False)
print(f"\n💾 Saved processed data to {processed_file}")
print(f"Features saved: {', '.join(df.columns)}")