# Credit Card Fraud Detection - Exploratory Data Analysis

This notebook presents a comprehensive exploratory data analysis (EDA) of the Credit Card Fraud Detection dataset from Kaggle.

## Dataset Information
- **Total Transactions**: 284,807
- **Fraud Cases**: 492 (0.17%)
- **Features**: 30 (Time, Amount, V1-V28)
- **Target**: Class (0 = Non-Fraud, 1 = Fraud)


In [None]:
# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
# Suppress every warning category so library notices do not clutter cell output
warnings.filterwarnings(action='ignore')

# Plot defaults: seaborn grid theme and a wide default figure size
sns.set_style(style='whitegrid')
plt.rcParams.update({'figure.figsize': (12, 6)})

print("Libraries imported successfully!")


## 1. Load and Inspect Data


In [None]:
# Read the transactions CSV into the notebook-wide DataFrame `df`.
# Note: adjust the relative path below if your copy of the dataset lives elsewhere.
df = pd.read_csv(filepath_or_buffer='../data/creditcard.csv')

# Report the (rows, columns) shape, then preview the first five rows
print(f"Dataset shape: {df.shape}", f"\nFirst few rows:", sep="\n")
df.head(n=5)


In [None]:
# Basic information
print("Dataset Info:")
# DataFrame.info() writes its report straight to stdout and returns None, so it
# is called directly; wrapping it in print() would append a stray "None" line.
df.info()
print("\n" + "="*60)
print("\nDataset Statistics:")
# Last expression of the cell: rendered as a rich table by the notebook
df.describe()


In [None]:
# Check for missing values
missing_per_column = df.isnull().sum()
missing_total = int(missing_per_column.sum())
print("Missing values:")
print(missing_total)
# Report what the data actually shows instead of unconditionally claiming the
# dataset is clean.
if missing_total == 0:
    print("\nNo missing values found!")
else:
    print(f"\n{missing_total} missing values found in these columns:")
    print(missing_per_column[missing_per_column > 0])

# Check class distribution
print("\n" + "="*60)
print("Class Distribution:")
# `class_counts` is reused by the class-distribution plotting cell below
class_counts = df['Class'].value_counts()
print(class_counts)

# .get() keeps the cell runnable when a class is absent (e.g. on a filtered
# sample), and the row-count guard avoids dividing by zero on an empty frame.
total_rows = len(df)
fraud_share = class_counts.get(1, 0) / total_rows * 100 if total_rows else 0.0
non_fraud_share = class_counts.get(0, 0) / total_rows * 100 if total_rows else 0.0
print(f"\nFraud percentage: {fraud_share:.2f}%")
print(f"Non-Fraud percentage: {non_fraud_share:.2f}%")


## 2. Class Distribution Visualization


In [None]:
# Visualize class distribution (uses `class_counts` from the previous cell)
# Pin the class order to (0, 1) so tick labels, wedge labels and colors always
# line up with the right class rather than relying on implicit ordering.
class_order = [0, 1]
class_names = ['Non-Fraud', 'Fraud']
class_colors = ['#3498db', '#e74c3c']
ordered_counts = class_counts.reindex(class_order, fill_value=0)

fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# Count plot
sns.countplot(data=df, x='Class', order=class_order, ax=axes[0], palette=class_colors)
axes[0].set_title('Class Distribution (Count)', fontsize=14, fontweight='bold')
axes[0].set_xlabel('Class (0=Non-Fraud, 1=Fraud)', fontsize=12)
axes[0].set_ylabel('Count', fontsize=12)
# Fix the tick positions before relabelling them
axes[0].set_xticks([0, 1])
axes[0].set_xticklabels(class_names)

# Pie chart, labelled with class names instead of the raw 0/1 codes
ordered_counts.plot(kind='pie', ax=axes[1], labels=class_names, autopct='%1.2f%%',
                    colors=class_colors, startangle=90)
axes[1].set_title('Class Distribution (Percentage)', fontsize=14, fontweight='bold')
axes[1].set_ylabel('')

plt.tight_layout()
plt.savefig('../models/class_distribution.png', dpi=300, bbox_inches='tight')
plt.show()

# Derive the percentage from the data so the message cannot drift from what the
# charts above show.
fraud_pct = ordered_counts[1] / len(df) * 100
print(f"⚠️ HIGHLY IMBALANCED DATASET: Only {fraud_pct:.2f}% fraud cases!")


## 3. Amount Distribution Analysis


In [None]:
# Summary statistics (count, mean, std, quartiles, extremes) of Amount per class
print("Amount Statistics by Class:")
print(df['Amount'].groupby(df['Class']).describe())


In [None]:
# Boxplot: Amount vs Class
plt.figure(figsize=(10, 6))
sns.boxplot(data=df, x='Class', y='Amount', palette=['#3498db', '#e74c3c'])
plt.title('Transaction Amount Distribution by Class', fontsize=14, fontweight='bold')
plt.xlabel('Class (0=Non-Fraud, 1=Fraud)', fontsize=12)
plt.ylabel('Amount', fontsize=12)
# pyplot has no xticklabels() function (calling it raises AttributeError);
# plt.xticks() sets the tick positions and their labels in one call.
plt.xticks([0, 1], ['Non-Fraud', 'Fraud'])
# Log scale for better visualization. NOTE: a log axis cannot draw values <= 0,
# so any zero-amount transactions are not visible in this figure.
plt.yscale('log')
plt.tight_layout()
plt.savefig('../models/amount_boxplot.png', dpi=300, bbox_inches='tight')
plt.show()


In [None]:
# Distribution of Amount
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

non_fraud_amount = df.loc[df['Class'] == 0, 'Amount']
fraud_amount = df.loc[df['Class'] == 1, 'Amount']

# Both classes share one set of bin edges per panel. Letting each class pick its
# own 50 bins gives different bin widths, so overlaid bar heights would not be
# comparable.
linear_bins = np.linspace(df['Amount'].min(), df['Amount'].max(), 51)

# Histogram (linear amount axis, log frequency axis)
non_fraud_amount.hist(bins=linear_bins, ax=axes[0], alpha=0.7, label='Non-Fraud', color='#3498db')
fraud_amount.hist(bins=linear_bins, ax=axes[0], alpha=0.7, label='Fraud', color='#e74c3c')
axes[0].set_title('Amount Distribution', fontsize=14, fontweight='bold')
axes[0].set_xlabel('Amount', fontsize=12)
axes[0].set_ylabel('Frequency', fontsize=12)
axes[0].legend()
axes[0].set_yscale('log')

# Log scale histogram
# Equal-width linear bins on a log axis squeeze almost the whole range into the
# first one or two bars. Log-spaced edges give every bar the same visual width.
# Amounts <= 0 cannot be placed on a log axis and are excluded from this panel.
positive_amount = df.loc[df['Amount'] > 0, 'Amount']
log_bins = np.logspace(np.log10(positive_amount.min()), np.log10(positive_amount.max()), 51)
non_fraud_amount[non_fraud_amount > 0].hist(bins=log_bins, ax=axes[1], alpha=0.7,
                                            label='Non-Fraud', color='#3498db')
fraud_amount[fraud_amount > 0].hist(bins=log_bins, ax=axes[1], alpha=0.7,
                                    label='Fraud', color='#e74c3c')
axes[1].set_title('Amount Distribution (Log Scale)', fontsize=14, fontweight='bold')
axes[1].set_xlabel('Amount (Log)', fontsize=12)
axes[1].set_ylabel('Frequency', fontsize=12)
axes[1].set_xscale('log')
axes[1].legend()

plt.tight_layout()
plt.savefig('../models/amount_distribution.png', dpi=300, bbox_inches='tight')
plt.show()


## 4. Time-based Analysis


In [None]:
# Convert Time to an hour-of-day bucket (assuming Time is seconds elapsed from the
# first transaction). The value is floored to a whole hour: a fractional hour
# would make groupby('Hour') create one group per distinct timestamp instead of
# 24 hourly buckets.
df['Hour'] = ((df['Time'] // 3600) % 24).astype(int)

# Transactions per hour for each class; reindex so hours with no transactions
# appear as 0 rather than being silently skipped.
all_hours = list(range(24))
fraud_by_hour = df[df['Class'] == 1].groupby('Hour').size().reindex(all_hours, fill_value=0)
normal_by_hour = df[df['Class'] == 0].groupby('Hour').size().reindex(all_hours, fill_value=0)

# Fraud is a tiny fraction of all transactions, so on a shared y-axis its hourly
# counts would be flattened against zero. Each class gets its own y-axis.
fig, ax_normal = plt.subplots(figsize=(14, 6))
ax_fraud = ax_normal.twinx()

normal_line, = ax_normal.plot(normal_by_hour.index, normal_by_hour.values, marker='o',
                              label='Non-Fraud', color='#3498db', linewidth=2, alpha=0.5)
fraud_line, = ax_fraud.plot(fraud_by_hour.index, fraud_by_hour.values, marker='o',
                            label='Fraud', color='#e74c3c', linewidth=2)

ax_normal.set_title('Transaction Frequency by Hour of Day', fontsize=14, fontweight='bold')
ax_normal.set_xlabel('Hour of Day', fontsize=12)
ax_normal.set_ylabel('Number of Non-Fraud Transactions', fontsize=12)
ax_fraud.set_ylabel('Number of Fraud Transactions', fontsize=12)
ax_normal.set_xticks(all_hours)
ax_normal.grid(alpha=0.3)
ax_fraud.grid(False)  # avoid a second, misaligned grid from the twin axis
# Single combined legend, attached to the top-most axes so lines do not cover it
ax_fraud.legend(handles=[fraud_line, normal_line])

plt.tight_layout()
plt.savefig('../models/time_analysis.png', dpi=300, bbox_inches='tight')
plt.show()


## 5. Correlation Analysis


In [None]:
# Pearson correlation of every column with the target, strongest positive first
correlations = df.corr().loc[:, 'Class'].sort_values(ascending=False)

print("Top 10 Features Correlated with Fraud:")
print(correlations.iloc[:11])  # 11 rows because Class itself is included
print("\nBottom 10 Features Correlated with Fraud:")
print(correlations.iloc[-10:])


In [None]:
# Horizontal bar chart of each feature's correlation with Class
# (the target's self-correlation is dropped; bars are sorted ascending)
corr_with_class = df.corr()['Class'].drop(labels='Class').sort_values()

# Red for positive correlations, blue for everything else
bar_colors = []
for coefficient in corr_with_class.values:
    bar_colors.append('#e74c3c' if coefficient > 0 else '#3498db')

plt.figure(figsize=(10, 12))
corr_with_class.plot(kind='barh', color=bar_colors)
plt.axvline(x=0, color='black', linestyle='--', linewidth=1)  # zero reference line
plt.title('Feature Correlation with Fraud (Class)', fontsize=14, fontweight='bold')
plt.xlabel('Correlation Coefficient', fontsize=12)
plt.ylabel('Features', fontsize=12)
plt.tight_layout()
plt.savefig('../models/correlation_with_class.png', dpi=300, bbox_inches='tight')
plt.show()


In [None]:
# Correlation heatmap restricted to a hand-picked subset of columns so the
# annotated cells stay readable
selected_components = (1, 2, 3, 4, 7, 9, 10, 11, 12, 14, 16, 17, 18, 19)
important_features = [f'V{number}' for number in selected_components] + ['Amount', 'Class']

corr_matrix = df.loc[:, important_features].corr()

plt.figure(figsize=(12, 10))
sns.heatmap(
    corr_matrix,
    annot=True,
    fmt='.2f',
    cmap='coolwarm',
    center=0,
    square=True,
    linewidths=1,
    cbar_kws=dict(shrink=0.8),
)
plt.title('Correlation Matrix (Key Features)', fontsize=14, fontweight='bold')
plt.tight_layout()
plt.savefig('../models/correlation_matrix.png', dpi=300, bbox_inches='tight')
plt.show()


## 6. Feature Distribution Analysis


In [None]:
# Per-class density histograms for a selection of features, one panel per feature
top_features = ['V14', 'V12', 'V10', 'V16', 'V3', 'V7', 'V11', 'V4']

fig, axes = plt.subplots(2, 4, figsize=(16, 8))
axes = axes.ravel()  # flatten the 2x4 grid so panels pair up with features

# (legend label, Class value, color), drawn in this order on every panel
class_styles = (('Non-Fraud', 0, '#3498db'), ('Fraud', 1, '#e74c3c'))

for panel, feature in zip(axes, top_features):
    for legend_label, class_value, color in class_styles:
        df.loc[df['Class'] == class_value, feature].hist(
            bins=50, ax=panel, alpha=0.7, label=legend_label, color=color, density=True
        )
    panel.set_title(f'{feature} Distribution', fontsize=10, fontweight='bold')
    panel.set_xlabel(feature, fontsize=9)
    panel.set_ylabel('Density', fontsize=9)
    panel.legend(fontsize=8)

plt.tight_layout()
plt.savefig('../models/feature_distributions.png', dpi=300, bbox_inches='tight')
plt.show()


## 7. Key Insights Summary

### Findings:
1. **Highly Imbalanced Dataset**: Only 0.17% of transactions are fraudulent
2. **Amount Distribution**: Fraud transactions tend to have different amount patterns
3. **Time Patterns**: Fraud may occur more frequently at certain hours
4. **Feature Correlations**: V14, V12, V10, V16 show strong negative correlation with fraud
5. **PCA Features**: V1-V28 are PCA-transformed features (privacy protection)

### Recommendations:
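- Use SMOTE or other oversampling techniques, applied to the training split only so that synthetic samples never leak into validation or test data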
- Apply class weights in models
- Focus on features with high correlation to fraud
- Consider ensemble methods (XGBoost, Random Forest)
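- Target AUC-ROC > 0.97 and Precision > 92% for fraud class; also track recall and precision-recall AUC, which are more informative than AUC-ROC when the positive class is this rare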
