In [None]:
# Import necessary libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler, RobustScaler
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline

# For visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Set plotting style: apply the 'seaborn-v0_8-whitegrid' matplotlib style sheet
# and make 'viridis' the default seaborn colour palette for the figures below.
# NOTE(review): the 'seaborn-v0_8-*' style names are presumably only available
# in recent matplotlib releases — confirm against the target environment.
plt.style.use('seaborn-v0_8-whitegrid')
sns.set_palette('viridis')


In [None]:
# Create a sample dataset for demonstration (similar to the Credit Card Fraud Detection dataset)
np.random.seed(42)
n_samples = 10000

# Create feature columns (V1-V28 plus Time and Amount)
cols = ['Time'] + [f'V{i}' for i in range(1, 29)] + ['Amount']
# Derive the feature count from the column list so the two can never drift apart
n_features = len(cols)

# Generate feature data (standard-normal noise, one column per entry in `cols`)
X = np.random.randn(n_samples, n_features)

# Generate target variable (fraud=1, normal=0) with imbalance (0.2% fraud)
fraud_ratio = 0.002
n_fraud = int(n_samples * fraud_ratio)
# Integer labels: a float target would print the fraud count as "20.0" and
# hand float class labels to every downstream consumer.
y = np.zeros(n_samples, dtype=int)
# Sampling without replacement guarantees exactly `n_fraud` distinct fraud rows
fraud_indices = np.random.choice(n_samples, size=n_fraud, replace=False)
y[fraud_indices] = 1

# Create DataFrame
df = pd.DataFrame(X, columns=cols)
df['Class'] = y

# Make Time and Amount more realistic (these overwrite the noise generated above)
df['Time'] = np.random.uniform(0, 172800, n_samples)  # Time in seconds (2 days)
df['Amount'] = np.exp(np.random.normal(3, 1, n_samples))  # Log-normal distribution for amounts

# Display info about the dataset
print(f"Dataset shape: {df.shape}")
print(f"Number of fraudulent transactions: {df['Class'].sum()}")
print(f"Fraud percentage: {df['Class'].mean() * 100:.3f}%")

# Display first few rows
df.head()


In [None]:
# Check basic statistics and missing values
print("Basic statistics for Amount:")
print(df['Amount'].describe())

print("\nBasic statistics for Time:")
print(df['Time'].describe())

# Check for missing values
missing_values = df.isnull().sum()
print("\nMissing values per column:")
print(missing_values[missing_values > 0] if missing_values.sum() > 0 else "No missing values found")

# Create some artificial missing values for demonstration purposes.
# - replace=False: each column gets exactly the stated number of distinct NaN
#   rows (sampling with replacement can pick the same row twice).
# - a locally seeded generator: re-running this cell selects the same rows
#   again, so NaNs do not accumulate in `df` across repeated executions.
missing_rng = np.random.default_rng(42)
for column, n_missing in (('V1', 50), ('V2', 30), ('Amount', 20)):
    rows_to_blank = missing_rng.choice(df.index, size=n_missing, replace=False)
    df.loc[rows_to_blank, column] = np.nan

print("\nAfter introducing artificial missing values:")
missing_values = df.isnull().sum()
print(missing_values[missing_values > 0])


In [None]:
# Function for feature engineering
def add_features(df):
    """Return a copy of `df` extended with the engineered columns.

    Reads the 'Time', 'Amount', 'V1', 'V2' and 'V3' columns and adds, in this
    order: 'Hour', 'Hour_sin', 'Hour_cos', 'Amount_log', 'V1_V2', 'V1_V3'
    and 'V2_V3'. The input frame itself is left unmodified.
    """
    # Whole hours elapsed since the start of the recording (seconds -> hours)
    elapsed_hours = df['Time'] // 3600
    # Angle on a 24-hour circle, used for the cyclical sin/cos encoding
    day_angle = 2 * np.pi * elapsed_hours / 24

    return df.assign(
        Hour=elapsed_hours,
        Hour_sin=np.sin(day_angle),
        Hour_cos=np.cos(day_angle),
        # log(1 + amount): compresses the long right tail of the amounts
        Amount_log=np.log1p(df['Amount']),
        # Pairwise interaction terms between the first three V features
        V1_V2=df['V1'] * df['V2'],
        V1_V3=df['V1'] * df['V3'],
        V2_V3=df['V2'] * df['V3'],
    )

# Show the effect of feature engineering on the first five rows only
preview_rows = df.head(5)
sample_df = add_features(preview_rows)

raw_view = ['Time', 'Amount', 'V1', 'V2', 'V3']
engineered_view = ['Time', 'Hour', 'Hour_sin', 'Hour_cos', 'Amount', 'Amount_log', 'V1_V2', 'V1_V3', 'V2_V3']

print("Original features:")
print(preview_rows[raw_view].to_string())
print("\nWith engineered features:")
print(sample_df[engineered_view].to_string())


In [None]:
# Separate predictors from the target, then hold out 25% of the rows for testing.
# The split happens before any preprocessing and is stratified on the label.
y = df['Class']
X = df.drop(columns='Class')

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=42,
    stratify=y,
)

for split_name, split_features, split_labels in (
    ("Training", X_train, y_train),
    ("Test", X_test, y_test),
):
    print(f"{split_name} set: {split_features.shape[0]} samples, {split_labels.sum()} frauds ({split_labels.mean() * 100:.3f}%)")


In [None]:
# Column groups present in the raw training frame
amount_columns = ['Amount']
time_columns = ['Time']
v_columns = X_train.columns[X_train.columns.str.startswith('V')].tolist()

# Column groups produced by add_features()
hour_columns = ['Hour', 'Hour_sin', 'Hour_cos']
amount_derived_columns = ['Amount_log']
interaction_columns = ['V1_V2', 'V1_V3', 'V2_V3']

# 1. Feature engineering step (kept outside the scikit-learn pipeline here)
X_train_feat = add_features(X_train)
X_test_feat = add_features(X_test)


def _median_impute_then(scaler):
    """Build a Pipeline that median-imputes missing values, then applies `scaler`."""
    return Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', scaler),
    ])


# 2. One imputing/scaling pipeline per column group; only the amount group
#    uses RobustScaler, every other group uses StandardScaler.
amount_processor = _median_impute_then(RobustScaler())
time_processor = _median_impute_then(StandardScaler())
v_processor = _median_impute_then(StandardScaler())
hour_processor = _median_impute_then(StandardScaler())

# 3. Route each column group to its pipeline. The order of this list fixes the
#    column order of the transformed output.
preprocessor = ColumnTransformer([
    ('amount', amount_processor, amount_columns + amount_derived_columns),
    ('time', time_processor, time_columns),
    ('v_features', v_processor, v_columns + interaction_columns),
    ('hour', hour_processor, hour_columns),
])

# 4. Fit on the training split only, then reuse the fitted transformer on the test split
X_train_preprocessed = preprocessor.fit_transform(X_train_feat)
X_test_preprocessed = preprocessor.transform(X_test_feat)

print(f"Preprocessed training data shape: {X_train_preprocessed.shape}")
print(f"Preprocessed test data shape: {X_test_preprocessed.shape}")


In [None]:
# Oversample the minority class of the preprocessed training data with SMOTE
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train_preprocessed, y_train)

# Class counts and fraud share before and after resampling
fraud_share_before = y_train.mean()
normal_count_before = (y_train == 0).sum()
fraud_count_before = (y_train == 1).sum()

fraud_share_after = y_train_resampled.mean()
normal_count_after = (y_train_resampled == 0).sum()
fraud_count_after = (y_train_resampled == 1).sum()

print("Class distribution before SMOTE:")
print(f"Class 0 (Normal): {normal_count_before} ({(1 - fraud_share_before) * 100:.2f}%)")
print(f"Class 1 (Fraud): {fraud_count_before} ({fraud_share_before * 100:.2f}%)")

print("\nClass distribution after SMOTE:")
print(f"Class 0 (Normal): {normal_count_after} ({(1 - fraud_share_after) * 100:.2f}%)")
print(f"Class 1 (Fraud): {fraud_count_after} ({fraud_share_after * 100:.2f}%)")

print(f"\nResampled training data shape: {X_train_resampled.shape}")


In [None]:
# Define a custom transformer for feature engineering
from sklearn.base import BaseEstimator, TransformerMixin

class FeatureEngineer(BaseEstimator, TransformerMixin):
    """Pipeline-compatible wrapper around `add_features`.

    Lets the feature-engineering step be placed inside a scikit-learn
    Pipeline. The transformer learns nothing from the data: `fit` stores no
    state and `transform` only derives new columns from existing ones.
    """

    def fit(self, X, y=None):
        """Do nothing and return `self` (there is nothing to learn)."""
        return self

    def transform(self, X):
        """Return a copy of `X` with the engineered columns appended.

        Delegates to `add_features` so the notebook has a single definition
        of the engineered features (Hour, Hour_sin, Hour_cos, Amount_log,
        V1_V2, V1_V3, V2_V3) instead of two copies that could drift apart.
        `X` must be a DataFrame containing 'Time', 'Amount', 'V1', 'V2', 'V3'.
        """
        return add_features(X)

# Build one pipeline that performs feature engineering AND per-group preprocessing.
# Column groups are (re)declared here so this cell does not rely on earlier ones.
amount_columns = ['Amount']
time_columns = ['Time']
v_columns = [name for name in X.columns if name.startswith('V')]
hour_columns = ['Hour', 'Hour_sin', 'Hour_cos']
amount_derived_columns = ['Amount_log']
interaction_columns = ['V1_V2', 'V1_V3', 'V2_V3']

# (transformer name, scaler, columns) — the order here fixes the output column order
column_groups = [
    ('amount', RobustScaler(), amount_columns + amount_derived_columns),
    ('time', StandardScaler(), time_columns),
    ('hour', StandardScaler(), hour_columns),
    ('v_features', StandardScaler(), v_columns + interaction_columns),
]

# Every group gets the same recipe: median imputation followed by its scaler
group_transformers = [
    (
        group_name,
        Pipeline(steps=[
            ('imputer', SimpleImputer(strategy='median')),
            ('scaler', group_scaler),
        ]),
        group_columns,
    )
    for group_name, group_scaler, group_columns in column_groups
]

# Complete preprocessing pipeline
full_preprocessor = Pipeline(steps=[
    ('feature_engineer', FeatureEngineer()),
    ('column_transformer', ColumnTransformer(transformers=group_transformers)),
])

# Fit on the training split, then apply the fitted pipeline to the test split
X_train_full_processed = full_preprocessor.fit_transform(X_train)
X_test_full_processed = full_preprocessor.transform(X_test)

print(f"Fully processed training data shape: {X_train_full_processed.shape}")
print(f"Fully processed test data shape: {X_test_full_processed.shape}")


In [None]:
# Chain preprocessing and SMOTE in a single imbalanced-learn Pipeline
# (ImbPipeline is used because its steps may include a resampler such as SMOTE).
balancing_steps = [
    ('preprocessor', full_preprocessor),
    ('smote', SMOTE(random_state=42)),
]
imb_pipeline = ImbPipeline(steps=balancing_steps)

# A single call runs the preprocessing and then the SMOTE resampling
X_train_processed_balanced, y_train_balanced = imb_pipeline.fit_resample(X_train, y_train)

balanced_fraud_share = y_train_balanced.mean()
balanced_normal_count = (y_train_balanced == 0).sum()
balanced_fraud_count = (y_train_balanced == 1).sum()

print("Final preprocessed and balanced data:")
print(f"Training data shape: {X_train_processed_balanced.shape}")
print("Class distribution in balanced training set:")
print(f"Class 0 (Normal): {balanced_normal_count} ({(1 - balanced_fraud_share) * 100:.2f}%)")
print(f"Class 1 (Fraud): {balanced_fraud_count} ({balanced_fraud_share * 100:.2f}%)")


In [None]:
# Create a simple visualization of our pipeline
import matplotlib.pyplot as plt
from matplotlib.patches import Rectangle, FancyArrowPatch


# Helper function to add an arrow
def add_arrow(ax, x1, y1, x2, y2, width=0.03):
    """Draw a black '->' arrow on `ax` from (x1, y1) to (x2, y2).

    `width` is kept in the signature for backward compatibility only; it does
    not influence the arrow that is drawn.
    """
    arrow = FancyArrowPatch((x1, y1), (x2, y2),
                            arrowstyle='->',
                            color='black',
                            linewidth=1,
                            mutation_scale=20)
    ax.add_patch(arrow)


fig, ax = plt.subplots(figsize=(12, 8))
ax.set_xlim(0, 10)
ax.set_ylim(0, 7)
ax.axis('off')

BOX_WIDTH, BOX_HEIGHT = 2, 1

# (label, lower-left x, lower-left y, fill colour), listed in execution order.
# The layout mirrors the pipeline built above: feature engineering, then
# imputation and scaling (inside the ColumnTransformer), and only then SMOTE.
stage_boxes = [
    ('Raw Data', 0.5, 5.5, 'lightblue'),
    ('Feature Engineering', 3.5, 5.5, 'lightgreen'),
    ('Missing Value\nImputation', 6.5, 5.5, 'lightcoral'),
    ('Feature Scaling', 6.5, 3.5, 'lightyellow'),
    ('SMOTE\nResampling', 3.5, 3.5, 'lavender'),
    ('Processed Data\nReady for Models', 3.5, 1.5, 'lightgreen'),
]

for stage_label, box_left, box_bottom, box_color in stage_boxes:
    rect = Rectangle((box_left, box_bottom), BOX_WIDTH, BOX_HEIGHT,
                     facecolor=box_color, edgecolor='black')
    ax.add_patch(rect)
    # Centre the label inside its box
    ax.text(box_left + BOX_WIDTH / 2, box_bottom + BOX_HEIGHT / 2, stage_label,
            ha='center', va='center', fontsize=12)

# One arrow per hand-off, forming a single linear flow (x1, y1, x2, y2)
flow_arrows = [
    (2.5, 6, 3.5, 6),      # Raw Data -> Feature Engineering
    (5.5, 6, 6.5, 6),      # Feature Engineering -> Missing Value Imputation
    (7.5, 5.5, 7.5, 4.5),  # Missing Value Imputation -> Feature Scaling
    (6.5, 4, 5.5, 4),      # Feature Scaling -> SMOTE
    (4.5, 3.5, 4.5, 2.5),  # SMOTE -> Processed Data
]
for arrow_coords in flow_arrows:
    add_arrow(ax, *arrow_coords)

ax.set_title('Credit Card Fraud Detection Preprocessing Pipeline', fontsize=14)
plt.tight_layout()
plt.show()
