# 💳 Credit Card Fraud Detection

**Project**: Binary Classification - Imbalanced Dataset  
**Level**: Intermediate  
**Dataset**: Credit Card Fraud Dataset  

## 📋 Project Overview

This project detects fraudulent credit card transactions using machine learning. We'll learn:

- Handling severely imbalanced datasets
- Advanced sampling techniques (SMOTE, ADASYN)
- Evaluation metrics for imbalanced data
- Cost-sensitive learning
- Anomaly detection techniques

Let's build a fraud detection system! 🔍

## 1. Import Libraries

In [None]:
# Data manipulation and analysis
import pandas as pd
import numpy as np

# Data visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Machine learning
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.preprocessing import StandardScaler, RobustScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, IsolationForest
from sklearn.svm import OneClassSVM
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    roc_auc_score, roc_curve, precision_recall_curve,
    confusion_matrix, classification_report
)

# Imbalanced learning
from imblearn.over_sampling import SMOTE, ADASYN
from imblearn.under_sampling import RandomUnderSampler
from imblearn.combine import SMOTETomek
from imblearn.pipeline import Pipeline as ImbPipeline

# Utilities
import warnings
# Silence library warnings so the notebook output stays readable.
warnings.filterwarnings('ignore')

# Set style
# 'seaborn-v0_8' only exists in matplotlib >= 3.6; older releases ship the same
# style sheet under the name 'seaborn', so fall back to it instead of raising.
_preferred_style = 'seaborn-v0_8'
plt.style.use(_preferred_style if _preferred_style in plt.style.available else 'seaborn')
sns.set_palette("husl")

print("‚úÖ All libraries imported successfully!")
print("üí≥ Ready for fraud detection analysis!")

## 2. Data Generation and Exploration

In [None]:
# Build a synthetic stand-in for the credit card fraud dataset.
# (A real project would load the actual transaction file instead.)
#
# NOTE: the random draws below must stay in this exact order (Time, Amount,
# V1..V10 for normal rows, then the same for fraud rows); reordering them
# would change the generated values for the fixed seed.
np.random.seed(42)
n_samples = 10000
n_fraud = int(0.002 * n_samples)  # 0.2% fraud rate (realistic)
n_normal = n_samples - n_fraud

print(f"üè¶ Generating synthetic credit card transaction data...")
print(f"Total transactions: {n_samples:,}")
print(f"Fraudulent transactions: {n_fraud} ({n_fraud/n_samples:.1%})")

# Legitimate transactions: times spread over 48 hours (in seconds) and
# log-normally distributed amounts limited to the range [1, 5000].
normal_data = {
    'Time': np.random.uniform(0, 172800, n_normal),
    'Amount': np.clip(np.random.lognormal(3, 1.5, n_normal), 1, 5000),
}
# PCA-style features V1-V10: standard normal noise for legitimate rows.
normal_data.update(
    (f'V{idx}', np.random.normal(0, 1, n_normal)) for idx in range(1, 11)
)
normal_data['Class'] = np.zeros(n_normal)

# Fraudulent transactions: a wider amount distribution limited to [1, 10000].
fraud_data = {
    'Time': np.random.uniform(0, 172800, n_fraud),
    'Amount': np.clip(np.random.lognormal(2, 2, n_fraud), 1, 10000),
}
# Fraud rows shift V1-V5 up (mean 2) and V6-V10 down (mean -1), with a
# larger spread (1.5), so the two classes are separable on these features.
for idx in range(1, 11):
    shifted_mean = 2 if idx <= 5 else -1
    fraud_data[f'V{idx}'] = np.random.normal(shifted_mean, 1.5, n_fraud)
fraud_data['Class'] = np.ones(n_fraud)

# Stack normal rows on top of fraud rows, column by column.
all_data = {
    column: np.concatenate([normal_values, fraud_data[column]])
    for column, normal_values in normal_data.items()
}

# Shuffle with a fixed seed so fraud rows are not all at the end of the frame.
df = pd.DataFrame(all_data).sample(frac=1, random_state=42).reset_index(drop=True)

print(f"\nüí≥ Credit card dataset created!")
print(f"Dataset shape: {df.shape}")
print(f"Features: {list(df.columns[:-1])}")
print(f"Target: Class (0=Normal, 1=Fraud)")

In [None]:
# Dataset information: size, missing values, class balance and amount statistics.
print("üìä Dataset Information:")
print(f"Total transactions: {len(df):,}")
print(f"Features: {df.shape[1] - 1}")
print(f"Missing values: {df.isnull().sum().sum()}")

# Count rows as integers. When 'Class' holds floats, a plain .sum() yields a
# float and the counts below would print as "20.0" / "9,980.0".
fraud_mask = df['Class'] == 1
fraud_count = int(fraud_mask.sum())
normal_count = len(df) - fraud_count
fraud_rate = fraud_mask.mean()
# Python ints raise on division by zero, so report an infinite ratio when
# the data contains no fraud rows at all.
imbalance_ratio = normal_count / fraud_count if fraud_count else float('inf')

print(f"\nüö® Class Distribution:")
print(f"‚Ä¢ Normal transactions: {normal_count:,} ({(1-fraud_rate):.1%})")
print(f"‚Ä¢ Fraudulent transactions: {fraud_count:,} ({fraud_rate:.1%})")
print(f"‚Ä¢ Imbalance ratio: {imbalance_ratio:.1f}:1")

print(f"\nüí∞ Transaction Amount Statistics:")
print(f"‚Ä¢ Normal transactions - Mean: ${df.loc[df['Class'] == 0, 'Amount'].mean():.2f}")
print(f"‚Ä¢ Fraudulent transactions - Mean: ${df.loc[fraud_mask, 'Amount'].mean():.2f}")

print("\nüìà Statistical Summary:")
# Last expression of the cell, so the notebook renders the summary table.
df.describe()

## 3. Exploratory Data Analysis

In [None]:
# Class distribution visualization: a pie chart of class shares plus
# per-class histograms of transaction amount and transaction time.
fig, axes = plt.subplots(1, 3, figsize=(18, 6))
fig.suptitle('üö® Credit Card Fraud Analysis', fontsize=16, fontweight='bold')

# Class distribution pie chart
# value_counts() orders by frequency, which only lines up with the fixed
# ['Normal', 'Fraud'] labels while class 0 is the majority. Sorting by class
# label (0 first, then 1) keeps every wedge paired with the right label.
class_counts = df['Class'].value_counts().sort_index()
axes[0].pie(class_counts.values, labels=['Normal', 'Fraud'],
            autopct='%1.3f%%', startangle=90, colors=['#4ECDC4', '#FF6B6B'])
axes[0].set_title('üí≥ Transaction Distribution')

# Amount distribution by class
normal_amounts = df[df['Class'] == 0]['Amount']
fraud_amounts = df[df['Class'] == 1]['Amount']

# density=True normalizes each histogram separately, so the tiny fraud class
# remains visible next to the much larger normal class.
axes[1].hist(normal_amounts, bins=50, alpha=0.7, label='Normal',
             color='#4ECDC4', density=True)
axes[1].hist(fraud_amounts, bins=50, alpha=0.7, label='Fraud',
             color='#FF6B6B', density=True)
axes[1].set_xlabel('Transaction Amount ($)')
axes[1].set_ylabel('Density')
axes[1].set_title('üí∞ Amount Distribution by Class')
axes[1].legend()
axes[1].set_xlim(0, 1000)  # Focus on lower amounts for visibility

# Time distribution
axes[2].hist(df[df['Class'] == 0]['Time'], bins=50, alpha=0.7,
             label='Normal', color='#4ECDC4', density=True)
axes[2].hist(df[df['Class'] == 1]['Time'], bins=50, alpha=0.7,
             label='Fraud', color='#FF6B6B', density=True)
axes[2].set_xlabel('Time (seconds)')
axes[2].set_ylabel('Density')
axes[2].set_title('‚è∞ Time Distribution by Class')
axes[2].legend()

plt.tight_layout()
plt.show()

# normal_count and fraud_count come from the dataset-information cell,
# which must have been run before this one.
print(f"üìä Key Observations:")
print(f"‚Ä¢ Extreme class imbalance: {normal_count/fraud_count:.1f}:1 ratio")
print(f"‚Ä¢ Average fraud amount: ${fraud_amounts.mean():.2f}")
print(f"‚Ä¢ Average normal amount: ${normal_amounts.mean():.2f}")
print(f"‚Ä¢ This represents a realistic fraud detection challenge!")

In [None]:
# PCA feature analysis: one histogram panel per component, split by class,
# followed by a ranking of how far apart the two class means are.
fig, axes = plt.subplots(2, 5, figsize=(20, 10))
fig.suptitle('üîç PCA Feature Analysis by Class', fontsize=16, fontweight='bold')

pca_features = [f'V{i}' for i in range(1, 11)]

# axes.flat walks the 2x5 grid row by row, so V1-V5 fill the top row and
# V6-V10 the bottom row.
for panel, feature in zip(axes.flat, pca_features):
    normal_values = df.loc[df['Class'] == 0, feature]
    fraud_values = df.loc[df['Class'] == 1, feature]

    # Draw the normal class first, then overlay the fraud class.
    for values, label, color in (
        (normal_values, 'Normal', '#4ECDC4'),
        (fraud_values, 'Fraud', '#FF6B6B'),
    ):
        panel.hist(values, bins=30, alpha=0.7,
                   label=label, color=color, density=True)

    panel.set_title(f'{feature}', fontweight='bold')
    panel.set_xlabel(feature)
    panel.set_ylabel('Density')
    panel.legend()
    panel.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

# Separability score: absolute gap between the class means, measured in units
# of the feature's standard deviation over the whole dataset.
print("üîç Feature Separability Analysis:")
unsorted_scores = []
for feature in pca_features:
    mean_gap = abs(
        df.loc[df['Class'] == 0, feature].mean()
        - df.loc[df['Class'] == 1, feature].mean()
    )
    unsorted_scores.append((feature, mean_gap / df[feature].std()))

# Highest score first.
separability_scores = sorted(unsorted_scores, key=lambda pair: pair[1], reverse=True)

print("Top 5 most separable features:")
for feature, score in separability_scores[:5]:
    print(f"‚Ä¢ {feature}: {score:.3f}")