# IEEE-CIS Fraud Detection - EDA: Fraud Patterns

**Objective**: Identify key fraud signals in the IEEE-CIS dataset to guide feature engineering.

**Focus Areas**:
- Fraud rate and class distribution
- Top features correlated with fraud
- Missing data patterns
- Transaction amount distribution
- Temporal fraud patterns

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
import warnings
# Install a global "ignore" filter so no warnings are shown in cell output.
# NOTE(review): this also hides pandas/seaborn deprecation warnings — consider
# narrowing the filter to specific categories.
warnings.filterwarnings('ignore')

# Set style: seaborn "whitegrid" theme and a 10x6 default figure size
sns.set_style('whitegrid')
plt.rcParams['figure.figsize'] = (10, 6)

## 1. Data Loading and Merging

In [None]:
# Read the two raw training tables from the local data directory
DATA_PATH = Path('../data/raw/')
transaction_csv = DATA_PATH / 'train_transaction.csv'
identity_csv = DATA_PATH / 'train_identity.csv'

# Transaction table
print("Loading transaction data...")
train_transaction = pd.read_csv(transaction_csv)
print(f"Transaction shape: {train_transaction.shape}")

# Identity table
print("\nLoading identity data...")
train_identity = pd.read_csv(identity_csv)
print(f"Identity shape: {train_identity.shape}")

In [None]:
# Left-join the identity attributes onto the transaction table by
# TransactionID, keeping every transaction row.
# validate='one_to_one' makes pandas raise MergeError if either table holds
# duplicate TransactionIDs; without it such duplicates would silently multiply
# rows and distort every count and fraud rate computed afterwards.
df = train_transaction.merge(
    train_identity, on='TransactionID', how='left', validate='one_to_one'
)
print(f"Merged dataset shape: {df.shape}")
print(f"Total features: {df.shape[1]}")

## 2. Fraud Rate Calculation

In [None]:
# Headline class-balance numbers for the merged frame
total_count = len(df)
fraud_count = df['isFraud'].sum()
legit_count = total_count - fraud_count
fraud_rate = fraud_count / total_count * 100  # percentage of all rows
legit_rate = 100 - fraud_rate
banner = "=" * 50

print(banner)
print("FRAUD STATISTICS")
print(banner)
print(f"Total transactions: {total_count:,}")
print(f"Fraudulent: {fraud_count:,} ({fraud_rate:.2f}%)")
print(f"Legitimate: {legit_count:,} ({legit_rate:.2f}%)")
# int() truncates, so the ratio is reported as whole legitimate rows per fraud row
print(f"\nClass imbalance ratio: 1:{int(legit_count / fraud_count)}")

## 3. Class Distribution Visualization

In [None]:
fig, axes = plt.subplots(1, 2, figsize=(12, 5))

# Bar chart of absolute class counts.
# value_counts() sorts by frequency, not by label, so reindex to [0, 1] to
# guarantee the bars line up with the hard-coded tick labels and colours.
class_counts = df['isFraud'].value_counts().reindex([0, 1], fill_value=0)
colors = ['#2ecc71', '#e74c3c']  # green = legitimate, red = fraud
axes[0].bar(['Legitimate (0)', 'Fraud (1)'], class_counts.values, color=colors)
axes[0].set_ylabel('Count')
axes[0].set_title('Transaction Class Distribution')
for i, v in enumerate(class_counts.values):
    # Fixed 5,000-count offset lifts the label just above the top of the bar
    axes[0].text(i, v + 5000, f'{v:,}', ha='center', fontweight='bold')

# Pie chart of class shares; the fraud wedge is pulled out (explode) so the
# small minority slice stays visible
axes[1].pie([total_count - fraud_count, fraud_count],
            labels=['Legitimate', 'Fraud'],
            autopct='%1.2f%%', colors=colors, explode=(0, 0.1),
            shadow=True, startangle=90)
axes[1].set_title('Fraud Rate Distribution')

plt.tight_layout()
# Create the output directory on first use so savefig cannot fail with
# FileNotFoundError on a fresh checkout
figure_path = Path('../data/processed/class_distribution.png')
figure_path.parent.mkdir(parents=True, exist_ok=True)
plt.savefig(figure_path, dpi=150, bbox_inches='tight')
plt.show()

## 4. Top 10 Features by Absolute Correlation with isFraud

In [None]:
# Rank numeric predictors by the magnitude of their correlation with isFraud.
# The target itself and the row identifier are excluded from the candidates.
numeric_cols = list(df.select_dtypes(include=[np.number]).columns)
for excluded in ('isFraud', 'TransactionID'):
    numeric_cols.remove(excluded)

# Sign is discarded: only the strength of the linear relationship is ranked
abs_corr = df[numeric_cols].corrwith(df['isFraud']).abs()
correlations = abs_corr.sort_values(ascending=False)

# Keep the ten strongest and print them as a ranked table
top_10_features = correlations.iloc[:10]
print("TOP 10 FEATURES BY CORRELATION WITH FRAUD")
print("="*50)
ranked_pairs = zip(top_10_features.index, top_10_features.values)
for rank, (name, strength) in enumerate(ranked_pairs, start=1):
    print(f"{rank:2}. {name:<20} | Correlation: {strength:.4f}")

In [None]:
# Horizontal bar chart of the ten strongest correlations
fig_corr, ax_corr = plt.subplots(figsize=(10, 6))

# Reverse both the data and the colour ramp so the first-ranked feature is
# drawn last
ranked = top_10_features.iloc[::-1]
palette = plt.cm.RdYlGn_r(np.linspace(0.2, 0.8, 10))[::-1]
bars = ax_corr.barh(ranked.index, ranked.values, color=palette)

ax_corr.set_xlabel('Absolute Correlation with isFraud')
ax_corr.set_title('Top 10 Features Correlated with Fraud')

# Annotate each bar with its value, slightly to the right of the bar end
for bar, val in zip(bars, ranked.values):
    label_y = bar.get_y() + bar.get_height() / 2
    ax_corr.text(val + 0.002, label_y, f'{val:.3f}', va='center')

fig_corr.tight_layout()
plt.savefig('../data/processed/top_correlations.png', dpi=150, bbox_inches='tight')
plt.show()

## 5. Missing Data Analysis

In [None]:
# Percentage of missing values per column, highest first
null_counts = df.isnull().sum()
missing_pct = (null_counts / len(df) * 100).sort_values(ascending=False)
high_missing = missing_pct[missing_pct > 50]

# Counts at each reporting threshold
feature_total = df.shape[1]
over_50 = len(high_missing)
over_75 = int((missing_pct > 75).sum())
over_90 = int((missing_pct > 90).sum())

print(f"FEATURES WITH >50% MISSING DATA: {over_50}")
print("="*50)
print(f"\nTotal features: {feature_total}")
print(f"Features >50% missing: {over_50} ({over_50/feature_total*100:.1f}%)")
print(f"Features >75% missing: {over_75}")
print(f"Features >90% missing: {over_90}")

In [None]:
# Two-panel view of missingness: overall distribution and the worst columns
miss_fig, (hist_ax, worst_ax) = plt.subplots(1, 2, figsize=(12, 5))

# Left panel: number of features in each missing-percentage bucket
hist_ax.hist(missing_pct, bins=20, color='steelblue', edgecolor='black')
hist_ax.axvline(x=50, color='red', linestyle='--', label='50% threshold')
hist_ax.set_xlabel('Missing Percentage')
hist_ax.set_ylabel('Number of Features')
hist_ax.set_title('Distribution of Missing Values Across Features')
hist_ax.legend()

# Right panel: the 15 columns with the highest missing percentage,
# reversed before plotting so the highest value is drawn last
top_missing = missing_pct.head(15)
reversed_missing = top_missing.iloc[::-1]
worst_ax.barh(reversed_missing.index, reversed_missing.values, color='coral')
worst_ax.set_xlabel('Missing %')
worst_ax.set_title('Top 15 Features by Missing Data')
worst_ax.axvline(x=50, color='red', linestyle='--')

miss_fig.tight_layout()
plt.savefig('../data/processed/missing_data.png', dpi=150, bbox_inches='tight')
plt.show()

## 6. Transaction Amount Distribution by Fraud Status

In [None]:
# Descriptive statistics of TransactionAmt, reported separately per class
print("TRANSACTION AMOUNT STATISTICS")
print("="*50)
amounts_by_class = df.groupby('isFraud')['TransactionAmt']
for class_value, amounts in amounts_by_class:
    if class_value == 1:
        status = 'Fraud'
    else:
        status = 'Legitimate'
    mean_amt, median_amt = amounts.mean(), amounts.median()
    std_amt, max_amt = amounts.std(), amounts.max()
    print(f"\n{status} Transactions:")
    print(f"  Mean:   ${mean_amt:,.2f}")
    print(f"  Median: ${median_amt:,.2f}")
    print(f"  Std:    ${std_amt:,.2f}")
    print(f"  Max:    ${max_amt:,.2f}")
In [None]:
fig, axes = plt.subplots(1, 2, figsize=(14, 5))
hist_ax, box_ax = axes

# Left panel: overlaid density histograms of log1p(amount), one per class.
# The log transform keeps the long right tail from flattening the plot.
class_styles = {0: ('#2ecc71', 'Legitimate'), 1: ('#e74c3c', 'Fraud')}
for fraud_val, (color, label) in class_styles.items():
    amounts = df.loc[df['isFraud'] == fraud_val, 'TransactionAmt']
    hist_ax.hist(np.log1p(amounts), bins=50, alpha=0.6, color=color, label=label, density=True)

hist_ax.set_xlabel('Log(Transaction Amount + 1)')
hist_ax.set_ylabel('Density')
hist_ax.set_title('Transaction Amount Distribution (Log Scale)')
hist_ax.legend()

# Right panel: box plot on a fixed-seed random sample of at most 50,000 rows
sample_size = min(50000, len(df))
df_sample = df.sample(n=sample_size, random_state=42)
sns.boxplot(data=df_sample, x='isFraud', y='TransactionAmt', ax=box_ax, palette=['#2ecc71', '#e74c3c'])
box_ax.set_xticklabels(['Legitimate', 'Fraud'])
box_ax.set_ylabel('Transaction Amount ($)')
box_ax.set_title('Transaction Amount by Fraud Status')
box_ax.set_ylim(0, 1000)  # restrict the view to amounts between 0 and 1,000

plt.tight_layout()
plt.savefig('../data/processed/amount_distribution.png', dpi=150, bbox_inches='tight')
plt.show()

## 7. Temporal Patterns: Fraud by Hour and Day

In [None]:
# TransactionDT is treated as an offset in seconds from a reference time.
# Anchor it to an assumed start date to derive calendar features.
START_DATE = '2017-12-01'  # Common assumption for IEEE-CIS dataset
elapsed = pd.to_timedelta(df['TransactionDT'], unit='s')
df['datetime'] = pd.to_datetime(START_DATE) + elapsed

# Calendar components taken from the reconstructed timestamp
stamp = df['datetime'].dt
df['hour'] = stamp.hour
df['dayofweek'] = stamp.dayofweek  # pandas convention: 0 = Monday ... 6 = Sunday
df['day'] = stamp.day

In [None]:
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# Fraud rate (%) by hour of day, with the overall rate as a reference line
hourly_fraud = df.groupby('hour')['isFraud'].mean() * 100
axes[0].bar(hourly_fraud.index, hourly_fraud.values, color='steelblue', edgecolor='black')
axes[0].axhline(y=fraud_rate, color='red', linestyle='--', label=f'Overall Rate ({fraud_rate:.2f}%)')
axes[0].set_xlabel('Hour of Day')
axes[0].set_ylabel('Fraud Rate (%)')
axes[0].set_title('Fraud Rate by Hour of Day')
axes[0].set_xticks(range(0, 24, 2))
axes[0].legend()

# Fraud rate (%) by day of week
daily_fraud = df.groupby('dayofweek')['isFraud'].mean() * 100
days = ['Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun']
# Derive the tick labels from the group keys instead of assuming all seven
# weekdays are present: pairing the fixed 7-item list with daily_fraud.values
# raises a shape mismatch as soon as one weekday has no rows.
day_labels = [days[i] for i in daily_fraud.index]
axes[1].bar(day_labels, daily_fraud.values, color='coral', edgecolor='black')
axes[1].axhline(y=fraud_rate, color='red', linestyle='--', label=f'Overall Rate ({fraud_rate:.2f}%)')
axes[1].set_xlabel('Day of Week')
axes[1].set_ylabel('Fraud Rate (%)')
axes[1].set_title('Fraud Rate by Day of Week')
axes[1].legend()

plt.tight_layout()
plt.savefig('../data/processed/temporal_patterns.png', dpi=150, bbox_inches='tight')
plt.show()

In [None]:
# Rank hours and weekdays by fraud rate and report the three extremes of each
peak_hours = hourly_fraud.nlargest(3).index.tolist()
quiet_hours = hourly_fraud.nsmallest(3).index.tolist()
# Weekday group keys are integers; translate them through the `days` labels
peak_days = [days[i] for i in daily_fraud.nlargest(3).index.tolist()]
quiet_days = [days[i] for i in daily_fraud.nsmallest(3).index.tolist()]

print("TEMPORAL FRAUD PATTERNS")
print("="*50)
print(f"\nHighest fraud hours: {peak_hours}")
print(f"Lowest fraud hours: {quiet_hours}")
print(f"\nHighest fraud days: {peak_days}")
print(f"Lowest fraud days: {quiet_days}")

## 8. Summary Statistics Table

In [None]:
# Create summary statistics
# The temporal analysis adds helper columns to df. They are not dataset
# features, so leave them out of the count; otherwise 'Total Features' is
# larger than the feature count printed right after the merge.
derived_cols = ('datetime', 'hour', 'dayofweek', 'day')
dataset_feature_count = len([col for col in df.columns if col not in derived_cols])

summary = pd.DataFrame({
    'Metric': [
        'Total Transactions', 'Fraud Transactions', 'Fraud Rate (%)',
        'Class Imbalance Ratio', 'Total Features', 'Features >50% Missing',
        'Avg Fraud Amount ($)', 'Avg Legitimate Amount ($)',
        'Peak Fraud Hour', 'Peak Fraud Day'
    ],
    'Value': [
        f"{total_count:,}", f"{fraud_count:,}", f"{fraud_rate:.2f}",
        f"1:{int((total_count-fraud_count)/fraud_count)}", f"{dataset_feature_count}", f"{len(high_missing)}",
        f"{df[df['isFraud']==1]['TransactionAmt'].mean():.2f}",
        f"{df[df['isFraud']==0]['TransactionAmt'].mean():.2f}",
        # idxmax() returns the group key (hour / weekday number) with the highest rate
        f"{hourly_fraud.idxmax()}:00", f"{days[daily_fraud.idxmax()]}"
    ]
})
print(summary.to_string(index=False))

## 9. Key Findings & Top Features for Feature Engineering

### Key Findings:
1. **Severe class imbalance** - Will need SMOTE or class weights
2. **Many high-missing features** - Consider dropping features >75% missing or using imputation
3. **Amount patterns differ** - Fraudulent transactions show different amount distributions
4. **Temporal signals exist** - Hour and day of week have fraud rate variations

### Top 10 Features for Feature Engineering:

In [None]:
# Report the selected features alongside their missing-data share
top_features_list = top_10_features.index.tolist()
print("TOP 10 FEATURES FOR FEATURE ENGINEERING")
print("="*50)
for rank, (feat, corr) in enumerate(top_10_features.items(), start=1):
    # Fall back to 0% when a feature has no entry in the missing-data table
    missing = missing_pct.get(feat, 0)
    print(f"{rank:2}. {feat:<20} | Corr: {corr:.4f} | Missing: {missing:.1f}%")

# Persist the ranking as a two-column CSV for later reference
export_frame = pd.DataFrame(
    {'feature': top_features_list, 'correlation': top_10_features.values}
)
export_frame.to_csv('../data/processed/top_features.csv', index=False)
print("\n[Saved to data/processed/top_features.csv]")

---
**Next Steps**: Use these top features as a starting point for feature engineering in Phase 2.