# IEEE-CIS Fraud Detection: Exploratory Data Analysis

**Objective:** Comprehensive analysis of the IEEE-CIS fraud detection dataset to understand data characteristics, class imbalance, and feature distributions.

**Dataset Overview:**
- **Source:** IEEE Computational Intelligence Society
- **Problem Type:** Binary classification (fraud detection)
- **Target Variable:** `isFraud` (0: legitimate, 1: fraudulent)
- **Expected Challenge:** Highly imbalanced dataset (~3.5% fraud rate)

**Analysis Goals:**
1. Data quality assessment and missing value patterns
2. Target variable distribution and class imbalance quantification
3. Feature type categorization and statistical summaries
4. Temporal patterns and transaction characteristics
5. Initial insights for feature engineering strategy

In [1]:
# Import essential libraries for data analysis
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
from pathlib import Path
import gc

# Notebook-wide settings
# NOTE: this silences every warning category for the rest of the session.
warnings.filterwarnings('ignore')

# Plot appearance
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")

# Display up to 100 columns and 100 rows when rendering DataFrames
for display_option in ('display.max_columns', 'display.max_rows'):
    pd.set_option(display_option, 100)

# Seed NumPy's global RNG so random draws are repeatable across runs
np.random.seed(42)

# Report the library versions this run is using
print("Environment setup complete")
print(f"Pandas version: {pd.__version__}")
print(f"NumPy version: {np.__version__}")

ModuleNotFoundError: No module named 'matplotlib'

## 1. Data Loading and Initial Assessment

In [None]:
# Directory holding the raw CSV files (relative to the notebook location)
DATA_PATH = Path('../data/01_raw')
BYTES_PER_MB = 1024**2

# Read the two training tables fully into memory
print("Loading training transaction data...")
train_transaction = pd.read_csv(DATA_PATH / 'train_transaction.csv')

print("Loading training identity data...")
train_identity = pd.read_csv(DATA_PATH / 'train_identity.csv')

# Row/column counts of each table
print("\n=== Dataset Shapes ===")
print(f"Transaction data: {train_transaction.shape}")
print(f"Identity data: {train_identity.shape}")

# deep=True also counts the Python string objects held by object columns.
# Only scalar sizes are kept here (no extra references to the frames), so a
# later `del` of the frames can still release their memory.
transaction_mb = train_transaction.memory_usage(deep=True).sum() / BYTES_PER_MB
identity_mb = train_identity.memory_usage(deep=True).sum() / BYTES_PER_MB

print("\n=== Memory Usage ===")
print(f"Transaction data: {transaction_mb:.2f} MB")
print(f"Identity data: {identity_mb:.2f} MB")

In [None]:
# Merge transaction and identity data.
# A left join keeps every transaction row; transactions with no matching
# identity record get NaN in the identity columns.
print("Merging transaction and identity datasets...")
train_data = train_transaction.merge(train_identity, on='TransactionID', how='left')

print(f"Merged dataset shape: {train_data.shape}")
print(f"Memory usage after merge: {train_data.memory_usage(deep=True).sum() / 1024**2:.2f} MB")

# The source frames are no longer needed once merged; drop them and ask the
# garbage collector to reclaim the memory.
del train_transaction, train_identity
gc.collect()

print("\n=== Basic Dataset Information ===")
# DataFrame.info() writes its report to stdout itself and returns None, so it
# must not be wrapped in print() (that would append a stray "None" line).
train_data.info(memory_usage='deep')

## 2. Target Variable Analysis

In [None]:
# Class balance of the target: absolute counts and percentage shares
print("=== Target Variable Distribution ===")
fraud_counts = train_data['isFraud'].value_counts()
fraud_percentages = train_data['isFraud'].value_counts(normalize=True) * 100

print(f"Legitimate transactions (0): {fraud_counts[0]:,} ({fraud_percentages[0]:.2f}%)")
print(f"Fraudulent transactions (1): {fraud_counts[1]:,} ({fraud_percentages[1]:.2f}%)")
print(f"Class imbalance ratio: {fraud_counts[0] / fraud_counts[1]:.1f}:1")

# Two side-by-side bar charts: counts on the left, percentages on the right
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 5))
class_colors = ['#3498db', '#e74c3c']

# One entry per panel: axis, data, title, y-axis label, bar-label formatter
panels = (
    (ax1, fraud_counts, 'Transaction Count by Class', 'Count',
     lambda bar_height: f'{int(bar_height):,}'),
    (ax2, fraud_percentages, 'Transaction Percentage by Class', 'Percentage (%)',
     lambda bar_height: f'{bar_height:.2f}%'),
)

for panel_ax, series, panel_title, y_label, format_label in panels:
    series.plot(kind='bar', ax=panel_ax, color=class_colors)
    panel_ax.set_title(panel_title, fontsize=14, fontweight='bold')
    panel_ax.set_xlabel('Class (0: Legitimate, 1: Fraudulent)')
    panel_ax.set_ylabel(y_label)
    panel_ax.tick_params(axis='x', rotation=0)

    # Write each bar's value just above its top edge
    for bar in panel_ax.patches:
        bar_height = bar.get_height()
        panel_ax.text(bar.get_x() + bar.get_width()/2., bar_height + bar_height*0.01,
                      format_label(bar_height),
                      ha='center', va='bottom', fontweight='bold')

plt.tight_layout()
plt.show()

## 3. Feature Type Analysis

In [None]:
# Categorize features by type and tabulate per-column null statistics
print("=== Feature Type Analysis ===")

# Count nulls once and reuse the result for both null columns below
null_counts = train_data.isnull().sum()

# One row per column: dtype plus null / non-null counts
dtypes_df = pd.DataFrame({
    'Feature': train_data.columns,
    'Type': train_data.dtypes.values,
    'Non_Null_Count': train_data.count().values,
    'Null_Count': null_counts.values,
    'Null_Percentage': (null_counts / len(train_data) * 100).values
})

# Classify with the pandas dtype predicates instead of Series.isin() against
# dtype-name strings: isin() matches by hash, so dtype objects may not match
# the strings 'int64'/'float64', and numeric columns of any other width would
# be skipped regardless. Booleans are excluded to keep the integer/float intent.
is_numeric = dtypes_df['Type'].map(
    lambda dtype: pd.api.types.is_numeric_dtype(dtype)
    and not pd.api.types.is_bool_dtype(dtype)
).astype(bool)
is_categorical = dtypes_df['Type'].map(pd.api.types.is_object_dtype).astype(bool)

# The target and the row identifier are numeric but are not model features
non_feature_columns = ['isFraud', 'TransactionID']
is_feature = ~dtypes_df['Feature'].isin(non_feature_columns)

numerical_features = dtypes_df.loc[is_numeric & is_feature, 'Feature'].tolist()
categorical_features = dtypes_df.loc[is_categorical, 'Feature'].tolist()

print(f"Total features: {len(train_data.columns)}")
print(f"Numerical features: {len(numerical_features)}")
print(f"Categorical features: {len(categorical_features)}")
print("Target variable: isFraud")
print("Identifier: TransactionID")

# Display features with high missing values (more than half the rows null)
high_missing = dtypes_df[dtypes_df['Null_Percentage'] > 50].sort_values('Null_Percentage', ascending=False)
print(f"\nFeatures with >50% missing values: {len(high_missing)}")
if len(high_missing) > 0:
    print(high_missing[['Feature', 'Null_Percentage']].head(10))

In [None]:
# Visualize missing value patterns: horizontal bar chart of the columns with
# the highest share of null values.

# Percentage of null rows per column, largest first
missing_percentages = (train_data.isnull().sum() / len(train_data) * 100).sort_values(ascending=False)
features_with_missing = missing_percentages[missing_percentages > 0]

if features_with_missing.empty:
    print("No missing values found in the dataset")
else:
    # Plot top 30 features with missing values
    top_missing = features_with_missing.head(30)
    bar_positions = range(len(top_missing))

    # Create the figure only when there is something to draw, so the
    # "no missing values" path does not leave an empty figure open.
    plt.figure(figsize=(15, 8))
    plt.barh(bar_positions, top_missing.values, color='#e74c3c')
    plt.yticks(bar_positions, top_missing.index)
    plt.xlabel('Missing Percentage (%)')
    plt.title('Top 30 Features with Missing Values', fontsize=14, fontweight='bold')
    # Largest value at the top of the chart
    plt.gca().invert_yaxis()

    # Add percentage labels just to the right of each bar
    for position, percentage in enumerate(top_missing.values):
        plt.text(percentage + 1, position, f'{percentage:.1f}%', va='center', fontweight='bold')

    plt.tight_layout()
    plt.show()

    print(f"\nSummary: {len(features_with_missing)} features have missing values")
    print(f"Range: {features_with_missing.min():.2f}% to {features_with_missing.max():.2f}%")

## 4. Transaction Amount Analysis

In [None]:
# Analyze TransactionAmt distribution overall, per class, and per amount range
print("=== Transaction Amount Analysis ===")

# Basic statistics
amount_stats = train_data['TransactionAmt'].describe()
print("Transaction Amount Statistics:")
print(amount_stats)

# Fraud vs legitimate amounts.
# .loc[mask, column] pulls only the one column; train_data[mask][column]
# would first copy every column of the filtered rows.
fraud_amounts = train_data.loc[train_data['isFraud'] == 1, 'TransactionAmt']
legit_amounts = train_data.loc[train_data['isFraud'] == 0, 'TransactionAmt']

print(f"\nFraudulent transactions - Mean: ${fraud_amounts.mean():.2f}, Median: ${fraud_amounts.median():.2f}")
print(f"Legitimate transactions - Mean: ${legit_amounts.mean():.2f}, Median: ${legit_amounts.median():.2f}")

# Visualization: 2x2 grid of panels
fig, ((ax1, ax2), (ax3, ax4)) = plt.subplots(2, 2, figsize=(15, 10))

# Overall distribution (log scale); log1p keeps an amount of 0 finite
np.log1p(train_data['TransactionAmt']).hist(bins=50, ax=ax1, alpha=0.7, color='#3498db')
ax1.set_title('Transaction Amount Distribution (Log Scale)', fontweight='bold')
ax1.set_xlabel('Log(Transaction Amount + 1)')
ax1.set_ylabel('Frequency')

# Box plot by fraud status
train_data.boxplot(column='TransactionAmt', by='isFraud', ax=ax2)
# boxplot(by=...) adds a "Boxplot grouped by ..." super-title to the whole
# figure; clear it so it does not sit above the unrelated panels.
fig.suptitle('')
ax2.set_title('Transaction Amount by Fraud Status', fontweight='bold')
ax2.set_xlabel('Fraud Status (0: Legitimate, 1: Fraudulent)')
ax2.set_ylabel('Transaction Amount')
ax2.set_yscale('log')

# Fraud vs legitimate distributions, overlaid on the same axes
np.log1p(fraud_amounts).hist(bins=50, ax=ax3, alpha=0.7, color='#e74c3c', label='Fraudulent')
np.log1p(legit_amounts).hist(bins=50, ax=ax3, alpha=0.7, color='#3498db', label='Legitimate')
ax3.set_title('Amount Distribution by Class (Log Scale)', fontweight='bold')
ax3.set_xlabel('Log(Transaction Amount + 1)')
ax3.set_ylabel('Frequency')
ax3.legend()

# Amount ranges analysis.
# include_lowest=True puts an amount of exactly 0 into the first bucket;
# without it, such rows get no bucket and drop out of the grouping.
amount_ranges = pd.cut(train_data['TransactionAmt'],
                       bins=[0, 50, 100, 500, 1000, 5000, float('inf')],
                       labels=['$0-50', '$50-100', '$100-500', '$500-1K', '$1K-5K', '$5K+'],
                       include_lowest=True)
# observed=False is spelled out so empty buckets stay in the table no matter
# which default the installed pandas version uses for categorical groupers.
fraud_by_range = train_data.groupby(amount_ranges, observed=False)['isFraud'].agg(['count', 'sum', 'mean'])
fraud_by_range.columns = ['Total_Transactions', 'Fraud_Count', 'Fraud_Rate']
fraud_by_range['Fraud_Rate'].plot(kind='bar', ax=ax4, color='#e74c3c')
ax4.set_title('Fraud Rate by Transaction Amount Range', fontweight='bold')
ax4.set_xlabel('Transaction Amount Range')
ax4.set_ylabel('Fraud Rate')
ax4.tick_params(axis='x', rotation=45)

plt.tight_layout()
plt.show()

print("\nFraud Rate by Amount Range:")
print(fraud_by_range)

## 5. Categorical Features Analysis

In [None]:
# Profile each categorical column, then chart the fraud rate per category for
# up to four columns with a manageable number of distinct values.
print("=== Categorical Features Analysis ===")

if not categorical_features:
    print("No categorical features found in the dataset")
else:
    print(f"Found {len(categorical_features)} categorical features:")

    # Per-column summary: distinct values, share of nulls, most frequent value
    total_rows = len(train_data)
    cat_analysis = []
    for feature in categorical_features:
        column = train_data[feature]
        unique_count = column.nunique()
        cat_analysis.append({
            'Feature': feature,
            'Unique_Values': unique_count,
            'Missing_Percentage': column.isnull().sum() / total_rows * 100,
            # mode() is empty when the column has no non-null values
            'Most_Common': column.mode().iloc[0] if unique_count > 0 else 'N/A',
        })

    cat_df = pd.DataFrame(cat_analysis)
    print(cat_df)

    # Keep columns with 2..20 distinct values and under 80% missing
    has_usable_cardinality = cat_df['Unique_Values'].between(2, 20)
    is_mostly_present = cat_df['Missing_Percentage'] < 80
    analyzable_cats = cat_df[has_usable_cardinality & is_mostly_present]

    if not analyzable_cats.empty:
        print(f"\nAnalyzing {len(analyzable_cats)} categorical features with reasonable cardinality...")

        # The grid is fixed at 2x2, so at most four features are charted
        n_plots = min(4, len(analyzable_cats))
        fig, axes = plt.subplots(2, 2, figsize=(15, 10))
        axes = axes.ravel()

        for panel, feature in zip(axes, analyzable_cats['Feature'].head(n_plots)):
            fraud_by_cat = train_data.groupby(feature)['isFraud'].agg(['count', 'mean']).reset_index()
            # Drop categories with fewer than 100 rows
            fraud_by_cat = fraud_by_cat[fraud_by_cat['count'] >= 100]
            if fraud_by_cat.empty:
                continue

            fraud_by_cat.plot(x=feature, y='mean', kind='bar', ax=panel, color='#e74c3c')
            panel.set_title(f'Fraud Rate by {feature}', fontweight='bold')
            panel.set_ylabel('Fraud Rate')
            panel.tick_params(axis='x', rotation=45)

        # Hide grid cells that were not assigned a feature
        for spare_panel in axes[n_plots:]:
            spare_panel.set_visible(False)

        plt.tight_layout()
        plt.show()

## 6. Feature Correlation Analysis

In [None]:
# Correlation analysis with target variable
print("=== Feature Correlation with Target ===")

# Candidate columns: every numeric column except the row identifier
numeric_cols = train_data.select_dtypes(include=[np.number]).columns.tolist()
if 'TransactionID' in numeric_cols:
    numeric_cols.remove('TransactionID')

# Correlate each feature with the target directly. Building the full
# feature-by-feature matrix with .corr() just to read its 'isFraud' column
# does work proportional to the square of the column count.
feature_cols = [col for col in numeric_cols if col != 'isFraud']
correlations = (
    train_data[feature_cols]
    .corrwith(train_data['isFraud'])
    .abs()
    .dropna()  # undefined correlations (e.g. constant columns) cannot be ranked
    .sort_values(ascending=False)
)

print("Top 20 features most correlated with fraud:")
top_corr = correlations.head(20)
print(top_corr)

# Visualize top correlations
plt.figure(figsize=(12, 8))
top_corr.plot(kind='barh', color='#3498db')
plt.title('Top 20 Features Correlated with Fraud', fontsize=14, fontweight='bold')
plt.xlabel('Absolute Correlation with isFraud')
# Strongest correlation at the top of the chart
plt.gca().invert_yaxis()

# Add correlation values as text just past the end of each bar
for position, corr_value in enumerate(top_corr.values):
    plt.text(corr_value + 0.001, position, f'{corr_value:.3f}', va='center', fontweight='bold')

plt.tight_layout()
plt.show()

# Feature correlation heatmap (top correlated features).
# Restricted to the target plus ten features, so this matrix is cheap.
if len(top_corr) >= 10:
    top_features = ['isFraud'] + top_corr.head(10).index.tolist()
    corr_matrix = train_data[top_features].corr()

    plt.figure(figsize=(12, 10))
    # Mask the upper triangle (diagonal included) so each pair appears once
    mask = np.triu(np.ones_like(corr_matrix, dtype=bool))
    sns.heatmap(corr_matrix, mask=mask, annot=True, cmap='RdYlBu_r', center=0,
                square=True, linewidths=0.5, cbar_kws={"shrink": .8})
    plt.title('Correlation Matrix: Top Features + Target', fontsize=14, fontweight='bold')
    plt.tight_layout()
    plt.show()

## 7. Data Quality Summary and Next Steps

In [None]:
# Comprehensive data quality summary: recap of the findings above plus a list
# of recommended follow-up work.
print("=== COMPREHENSIVE DATA QUALITY SUMMARY ===")
print(f"Dataset Shape: {train_data.shape}")
print(f"Memory Usage: {train_data.memory_usage(deep=True).sum() / 1024**2:.2f} MB")
print()

print("CLASS IMBALANCE:")
fraud_rate = train_data['isFraud'].mean() * 100
print(f"- Fraud Rate: {fraud_rate:.2f}%")
print(f"- Imbalance Ratio: {(100-fraud_rate)/fraud_rate:.1f}:1")
print("- Classification Challenge: HIGH (severely imbalanced)")
print()

print("FEATURE COMPOSITION:")
# Neither the target nor the row identifier is a feature, so both are excluded
# from the total.
feature_columns = train_data.columns.difference(['isFraud', 'TransactionID'])
print(f"- Total Features: {len(feature_columns)}")
print(f"- Numerical Features: {len(numerical_features)}")
print(f"- Categorical Features: {len(categorical_features)}")
print()

print("MISSING VALUES:")
# Distinct local names are used here so that `features_with_missing` from the
# missing-value cell is not overwritten with a plain count.
summary_null_counts = train_data.isnull().sum()
n_features_with_missing = int((summary_null_counts > 0).sum())
max_missing = summary_null_counts.max() / len(train_data) * 100
if max_missing > 50:
    quality_challenge = 'HIGH'
elif max_missing > 20:
    quality_challenge = 'MODERATE'
else:
    quality_challenge = 'LOW'
print(f"- Features with Missing Values: {n_features_with_missing}")
print(f"- Maximum Missing Percentage: {max_missing:.1f}%")
print(f"- Data Quality Challenge: {quality_challenge}")
print()

print("TOP PREDICTIVE FEATURES:")
if len(correlations) > 0:
    for rank, (feature, corr) in enumerate(correlations.head(5).items(), start=1):
        print(f"- {rank}. {feature}: {corr:.3f}")
else:
    print("- No significant correlations found")
print()

# Recommendations, grouped by heading; printed with a blank line between groups
next_steps = {
    "1. FEATURE ENGINEERING:": [
        "Create temporal features from TransactionDT",
        "Engineer interaction features between high-correlation variables",
        "Develop aggregation features (velocity, frequency patterns)",
        "Handle missing values strategically (imputation vs. indicator variables)",
    ],
    "2. PREPROCESSING PIPELINE:": [
        "Implement robust scaling for numerical features",
        "Encode categorical variables (target encoding for high cardinality)",
        "Feature selection based on importance and correlation",
    ],
    "3. IMBALANCED LEARNING STRATEGY:": [
        "Implement stratified sampling for train/validation split",
        "Apply SMOTE or ADASYN for synthetic minority oversampling",
        "Use cost-sensitive learning with class weights",
        "Focus on Precision-Recall metrics over accuracy",
    ],
    "4. MODEL DEVELOPMENT:": [
        "Start with Logistic Regression baseline",
        "Progress to ensemble methods (XGBoost, LightGBM)",
        "Implement proper cross-validation strategy",
        "Target AUC-ROC ≥ 0.87 as specified",
    ],
}

print("=== RECOMMENDED NEXT STEPS ===")
print("\n\n".join(
    "\n".join([heading, *(f"   - {step}" for step in steps)])
    for heading, steps in next_steps.items()
))

# Collect feature lists for next notebooks.
# NOTE: this dictionary lives only in the notebook session; nothing in this
# cell writes it to disk.
feature_info = {
    'numerical_features': numerical_features,
    'categorical_features': categorical_features,
    'top_correlated_features': correlations.head(20).index.tolist() if len(correlations) > 0 else [],
    'high_missing_features': dtypes_df[dtypes_df['Null_Percentage'] > 50]['Feature'].tolist()
}

print("\n=== ANALYSIS COMPLETE ===")
# The message states what actually happened: the lists are held in memory,
# not saved to a file.
print("Feature information collected in `feature_info` (in memory only, not written to disk)")
print("Ready to proceed with feature engineering and preprocessing")