In [None]:
# Import necessary libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer

# Set plotting style.
# Matplotlib 3.6 renamed its bundled seaborn style sheets to 'seaborn-v0_8-*',
# so the hard-coded new name raises on older installs. Pick the first spelling
# this installation actually provides, and fall back to Matplotlib's built-in
# 'default' style if neither is present.
_STYLE_CANDIDATES = ('seaborn-v0_8-whitegrid', 'seaborn-whitegrid')
plot_style = next(
    (name for name in _STYLE_CANDIDATES if name in plt.style.available),
    'default',
)
plt.style.use(plot_style)
sns.set_palette('viridis')


In [None]:
# Normally, you would download the dataset from Kaggle or another source.
# For this example we simulate it with a small synthetic sample.
# In a real project, you would use:
# df = pd.read_csv('creditcard.csv')

# Create a sample dataset for demonstration (seeded so every run is identical)
np.random.seed(42)
n_samples = 10000

# Feature columns: Time, V1..V28, Amount
cols = ['Time'] + [f'V{i}' for i in range(1, 29)] + ['Amount']
# Derived from the column list so the matrix width can never drift from it
n_features = len(cols)

# Generate feature data (standard normal). The Time and Amount columns are
# overwritten below; they are still drawn here so the random stream, and
# therefore the generated V1..V28 values, stay the same as before.
X = np.random.randn(n_samples, n_features)

# Generate target variable (fraud=1, normal=0) with imbalance (0.2% fraud)
fraud_ratio = 0.002
# round() before int() so float round-off in the product cannot drop a sample
n_fraud = int(round(n_samples * fraud_ratio))
# Integer labels: counts print as "20" rather than "20.0"
y = np.zeros(n_samples, dtype=int)
fraud_indices = np.random.choice(n_samples, size=n_fraud, replace=False)
y[fraud_indices] = 1

# Create DataFrame
df = pd.DataFrame(X, columns=cols)
df['Class'] = y

# Make Time and Amount more realistic
df['Time'] = np.random.uniform(0, 172800, n_samples)  # Time in seconds (2 days)
df['Amount'] = np.exp(np.random.normal(3, 1, n_samples))  # Log-normal distribution for amounts

# Display info about the dataset
print(f"Dataset shape: {df.shape}")
print(f"Number of fraudulent transactions: {df['Class'].sum()}")
print(f"Fraud percentage: {df['Class'].mean() * 100:.3f}%")

# Display first few rows
df.head()


In [None]:
# Planned repository layout: top-level directory -> entries listed under it
project_structure = {
    'data': ['raw', 'processed', 'interim'],
    'notebooks': [
        '01_exploratory_analysis.ipynb',
        '02_preprocessing_pipeline.ipynb',
        '03_baseline_models.ipynb',
        '04_model_comparison.ipynb',
        '05_advanced_optimization.ipynb',
        '06_deployment_pipeline.ipynb',
    ],
    'src': ['data', 'features', 'models', 'visualization', 'utils'],
    'models': ['saved_models'],
    'reports': ['figures', 'final_report.md'],
    'app': ['api.py', 'templates', 'static'],
}

# Render the layout as an indented listing, one section per top-level directory
print("Credit Card Fraud Detection Project Structure:")
print("=" * 50)
for top_level, entries in project_structure.items():
    section = [f"/{top_level}"] + [f"  ├── {entry}" for entry in entries]
    # end="\n\n" leaves one blank line after each section
    print("\n".join(section), end="\n\n")

# Nothing is created on disk here; this cell only prints the layout.
# In a real project, you would create these directories.


In [None]:
# Summary statistics, transposed so each column of df becomes one row
summary_stats = df.describe().T
print("Dataset summary statistics:")
print(summary_stats)

# Count nulls per column and report only the columns that have any
missing_values = df.isnull().sum()
print("\nMissing values per column:")
if missing_values.sum() > 0:
    print(missing_values[missing_values > 0])
else:
    print("No missing values found")


In [None]:
# Visualize class distribution
plt.figure(figsize=(10, 6))
ax = sns.countplot(x='Class', data=df)
ax.set_title('Class Distribution (0: Normal, 1: Fraud)')
ax.set_xlabel('Class')
ax.set_ylabel('Count')

# Add count labels on the bars.
# Bar heights come back as floats, so format them without decimals;
# otherwise the labels read "9980.0" instead of "9980".
for bar in ax.patches:
    bar_height = bar.get_height()
    ax.annotate(
        f'{bar_height:.0f}',
        (bar.get_x() + bar.get_width() / 2., bar_height),
        ha='center', va='center',
        xytext=(0, 10),  # shift the label 10 points above the bar top
        textcoords='offset points',
    )

plt.show()

# Show counts and percentages; the mean of the 0/1 label is the fraud share
fraud_share = df['Class'].mean()
print(f"Normal transactions: {(df['Class'] == 0).sum()} ({(1 - fraud_share) * 100:.3f}%)")
print(f"Fraudulent transactions: {(df['Class'] == 1).sum()} ({fraud_share * 100:.3f}%)")


In [None]:
# Analyze transaction amounts, split by class
normal_amounts = df[df['Class'] == 0]['Amount']
fraud_amounts = df[df['Class'] == 1]['Amount']

# One panel per class: (data, title, extra histplot keyword arguments)
amount_panels = [
    (normal_amounts, 'Amount Distribution - Normal Transactions', {}),
    (fraud_amounts, 'Amount Distribution - Fraudulent Transactions', {'color': 'red'}),
]

plt.figure(figsize=(12, 5))
for panel_no, (amounts, panel_title, hist_kwargs) in enumerate(amount_panels, start=1):
    plt.subplot(1, 2, panel_no)
    sns.histplot(amounts, bins=50, kde=True, **hist_kwargs)
    plt.title(panel_title)
    plt.xlabel('Transaction Amount')
    plt.ylabel('Frequency')
    plt.xlim(0, 500)  # Focus on the main distribution

plt.tight_layout()
plt.show()

# Compare statistics of normal vs fraud transactions
print("Normal transactions amount statistics:")
print(normal_amounts.describe())

print("\nFraudulent transactions amount statistics:")
print(fraud_amounts.describe())


In [None]:
# Analyze time distribution

# Add an hours column for easier reading (Time is in seconds).
# Later cells drop 'TimeHour' again before modelling.
df['TimeHour'] = df['Time'] / 3600

normal_hours = df[df['Class'] == 0]['TimeHour']
fraud_hours = df[df['Class'] == 1]['TimeHour']

# Two side-by-side panels, one per class
fig, (ax_normal, ax_fraud) = plt.subplots(1, 2, figsize=(12, 5))

sns.histplot(normal_hours, bins=48, kde=True,
             label='Normal', alpha=0.7, ax=ax_normal)
ax_normal.set_title('Time Distribution - Normal Transactions')
ax_normal.set_xlabel('Time (hours)')
ax_normal.set_ylabel('Frequency')

sns.histplot(fraud_hours, bins=48, kde=True,
             color='red', label='Fraud', alpha=0.7, ax=ax_fraud)
ax_fraud.set_title('Time Distribution - Fraudulent Transactions')
ax_fraud.set_xlabel('Time (hours)')
ax_fraud.set_ylabel('Frequency')

plt.tight_layout()
plt.show()

# Correlation between the hour of the transaction and the fraud label
time_class_corr_matrix = df[['TimeHour', 'Class']].corr()
time_fraud_corr = time_class_corr_matrix.loc['TimeHour', 'Class']
print(f"Correlation between Time and Fraud: {time_fraud_corr:.4f}")


In [None]:
# Feature correlation with target
# Select only a subset of features for clarity
selected_features = ['V1', 'V2', 'V3', 'V4', 'V5', 'Amount', 'Class']
correlation = df[selected_features].corr()

# Plot correlation heatmap
plt.figure(figsize=(10, 8))
sns.heatmap(correlation, annot=True, cmap='coolwarm', fmt='.2f', linewidths=0.5)
plt.title('Correlation Matrix for Selected Features')
plt.tight_layout()
plt.show()

# Correlation of every column with the fraud label, strongest positive first.
# 'TimeHour' is left out because it is just 'Time' rescaled.
feature_corrs = df.drop(['TimeHour'], axis=1).corr()['Class'].sort_values(ascending=False)

# 'Class' correlates perfectly with itself (1.0) and would always top the
# list, so exclude it from the ranking that is shown. feature_corrs itself is
# left unchanged.
ranked_feature_corrs = feature_corrs.drop('Class')

print("Top positively correlated features with fraud:")
print(ranked_feature_corrs.head())
print("\nTop negatively correlated features with fraud:")
print(ranked_feature_corrs.tail())


In [None]:
# Prepare features and target ('TimeHour' is dropped alongside the label)
y = df['Class']
X = df.drop(columns=['Class', 'TimeHour'])

# Hold out 25% for testing; stratify on y so both splits keep the class ratio
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=42, stratify=y
)

# Print split information: size and class breakdown for each split
for split_header, split_features, split_labels in (
    ("Training set", X_train, y_train),
    ("\nTest set", X_test, y_test),
):
    normal_count = (split_labels == 0).sum()
    fraud_count = (split_labels == 1).sum()
    split_size = len(split_labels)
    print(f"{split_header}: {split_features.shape[0]} samples")
    print(f"  - Normal transactions: {normal_count} ({normal_count / split_size * 100:.2f}%)")
    print(f"  - Fraudulent transactions: {fraud_count} ({fraud_count / split_size * 100:.2f}%)")
