In [None]:
# Import necessary libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler, RobustScaler
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.metrics import (accuracy_score, precision_score, recall_score, f1_score,
                            confusion_matrix, classification_report, roc_auc_score,
                            roc_curve, precision_recall_curve, average_precision_score)
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import (RandomForestClassifier, GradientBoostingClassifier, 
                             VotingClassifier, AdaBoostClassifier)
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline
import scipy.stats as stats

# Set plotting style: apply the 'seaborn-v0_8-whitegrid' matplotlib style sheet
# and make seaborn's 'viridis' palette the default for the figures drawn below
plt.style.use('seaborn-v0_8-whitegrid')
sns.set_palette('viridis')


In [None]:
# Create a sample dataset for demonstration (similar to the Credit Card Fraud Detection dataset)
np.random.seed(42)
n_samples = 10000

# Feature columns: Time, the components V1-V28, and Amount
cols = ['Time'] + [f'V{i}' for i in range(1, 29)] + ['Amount']
# Derived from the column list so the matrix width and the labels cannot drift apart
n_features = len(cols)

# Generate feature data (standard-normal noise; Time and Amount are overwritten below)
X = np.random.randn(n_samples, n_features)

# Generate target variable (fraud=1, normal=0) with imbalance (0.2% fraud).
# Labels are integers so counts print as "20" rather than "20.0" and the
# classifiers/reports see classes 0/1 instead of 0.0/1.0.
fraud_ratio = 0.002
n_fraud = int(n_samples * fraud_ratio)
y = np.zeros(n_samples, dtype=int)
fraud_indices = np.random.choice(range(n_samples), size=n_fraud, replace=False)
y[fraud_indices] = 1

# Create DataFrame
df = pd.DataFrame(X, columns=cols)
df['Class'] = y

# Make Time and Amount more realistic
df['Time'] = np.random.uniform(0, 172800, n_samples)  # Time in seconds (2 days)
df['Amount'] = np.exp(np.random.normal(3, 1, n_samples))  # Log-normal distribution for amounts

# Display info about the dataset
print(f"Dataset shape: {df.shape}")
print(f"Number of fraudulent transactions: {df['Class'].sum()}")
print(f"Fraud percentage: {df['Class'].mean() * 100:.3f}%")


In [None]:
# Define feature engineering function
class FeatureEngineer(BaseEstimator, TransformerMixin):
    """Stateless transformer that appends derived columns to a DataFrame.

    Expects the columns 'Time', 'Amount', 'V1', 'V2' and 'V3' to be present.
    The input frame is not modified; a new frame with the extra columns
    appended (in the order listed in ``transform``) is returned.
    """

    def fit(self, X, y=None):
        """Nothing is learned from the data; returns ``self`` unchanged."""
        return self

    def transform(self, X):
        """Return a copy of ``X`` with time, amount and interaction features added.

        Added columns:
          * Hour       - whole hours elapsed, ``Time // 3600`` (not wrapped to 0-23)
          * Hour_sin / Hour_cos - sine/cosine of the hour on a 24-hour cycle
          * Amount_log - ``log1p`` of Amount
          * V1_V2, V1_V3, V2_V3 - pairwise products of V1, V2 and V3
        """
        elapsed_hours = X['Time'] // 3600  # seconds -> whole hours
        phase = 2 * np.pi * elapsed_hours / 24  # angle on a 24-hour circle

        return X.assign(
            Hour=elapsed_hours,
            Hour_sin=np.sin(phase),
            Hour_cos=np.cos(phase),
            Amount_log=np.log1p(X['Amount']),
            V1_V2=X['V1'] * X['V2'],
            V1_V3=X['V1'] * X['V3'],
            V2_V3=X['V2'] * X['V3'],
        )

# Split the data
X = df.drop('Class', axis=1)
y = df['Class']

# Stratify so the rare fraud class keeps the same proportion in both splits
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=42, stratify=y
)

print(f"Training set: {X_train.shape[0]} samples, {y_train.sum()} frauds ({y_train.mean() * 100:.3f}%)")
print(f"Test set: {X_test.shape[0]} samples, {y_test.sum()} frauds ({y_test.mean() * 100:.3f}%)")

# Define column types. These lists are the single source of truth for the
# ColumnTransformer below; the derived names must match the columns that
# FeatureEngineer appends.
amount_columns = ['Amount']
time_columns = ['Time']
v_columns = [col for col in X.columns if col.startswith('V')]
hour_columns = ['Hour', 'Hour_sin', 'Hour_cos']
amount_derived_columns = ['Amount_log']
interaction_columns = ['V1_V2', 'V1_V3', 'V2_V3']


def _impute_then_scale(scaler):
    """Build the median-impute -> scale sub-pipeline shared by every column group."""
    return Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', scaler),
    ])


# Create preprocessing pipeline: engineer features first, then impute and scale
# each column group (RobustScaler for the amount columns, StandardScaler elsewhere).
preprocessor = Pipeline(steps=[
    ('feature_engineer', FeatureEngineer()),
    ('column_transformer', ColumnTransformer(
        transformers=[
            ('amount', _impute_then_scale(RobustScaler()),
             amount_columns + amount_derived_columns),
            ('time', _impute_then_scale(StandardScaler()), time_columns),
            ('hour', _impute_then_scale(StandardScaler()), hour_columns),
            ('v_features', _impute_then_scale(StandardScaler()),
             v_columns + interaction_columns),
        ]
    ))
])

# Process the training and test data (statistics are fitted on the training split only)
X_train_processed = preprocessor.fit_transform(X_train)
X_test_processed = preprocessor.transform(X_test)

# Apply SMOTE to the processed training data. The test split is never resampled.
# NOTE(review): these resampled arrays are appropriate for a final fit; running
# cross-validation directly on them lets synthetic points interpolated from a
# validation-fold sample end up in the training folds.
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train_processed, y_train)

print(f"Processed training data shape: {X_train_processed.shape}")
print(f"Resampled training data shape: {X_train_resampled.shape}")
print(f"Processed test data shape: {X_test_processed.shape}")

# Print class distribution after resampling
print("\nClass distribution after SMOTE:")
print(f"Class 0 (Normal): {(y_train_resampled == 0).sum()} ({(1 - y_train_resampled.mean()) * 100:.2f}%)")
print(f"Class 1 (Fraud): {(y_train_resampled == 1).sum()} ({y_train_resampled.mean() * 100:.2f}%)")


In [None]:
# Function to evaluate a model with various metrics
def evaluate_model(model, X_train, X_test, y_train, y_test, model_name):
    """
    Evaluate an already-fitted model on the test set and return various metrics.

    Parameters:
    -----------
    model : scikit-learn model
        The trained model to evaluate; must implement ``predict`` and may
        implement ``predict_proba``
    X_train : array-like
        Training features (accepted for interface compatibility; not used)
    X_test : array-like
        Test features
    y_train : array-like
        Training labels (accepted for interface compatibility; not used)
    y_test : array-like
        Test labels
    model_name : str
        Name of the model for reporting

    Returns:
    --------
    dict
        Keys: 'model_name', 'accuracy', 'precision', 'recall', 'f1',
        'roc_auc', 'pr_auc', 'y_pred', 'y_pred_proba'. The last two hold the
        raw predictions; 'roc_auc', 'pr_auc' and 'y_pred_proba' are None when
        the model exposes no ``predict_proba``.
    """
    # Hard class predictions
    y_pred = model.predict(X_test)

    # For ROC and precision-recall curves, we need probabilities.
    # Only "this model has no usable predict_proba" is tolerated; any other
    # failure (bad input shape, interrupt, ...) must surface instead of being
    # silently turned into missing metrics.
    try:
        y_pred_proba = model.predict_proba(X_test)[:, 1]  # column 1 = class-1 score
    except (AttributeError, NotImplementedError):
        y_pred_proba = None

    # Threshold-based metrics. zero_division=0 keeps the score at 0 (the same
    # value scikit-learn falls back to) without a warning when a model predicts
    # no positives, which is common on heavily imbalanced data.
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, zero_division=0)
    recall = recall_score(y_test, y_pred, zero_division=0)
    f1 = f1_score(y_test, y_pred, zero_division=0)

    # Ranking metrics need scores, so they are skipped without probabilities
    if y_pred_proba is not None:
        roc_auc = roc_auc_score(y_test, y_pred_proba)
        pr_auc = average_precision_score(y_test, y_pred_proba)
    else:
        roc_auc = None
        pr_auc = None

    # Return metrics dictionary for later comparison
    metrics_dict = {
        'model_name': model_name,
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'f1': f1,
        'roc_auc': roc_auc,
        'pr_auc': pr_auc,
        'y_pred': y_pred,
        'y_pred_proba': y_pred_proba
    }

    return metrics_dict

def display_model_metrics(metrics_dict, y_true=None):
    """
    Display metrics and visualizations for a model evaluation.

    Prints the scalar metrics, the confusion matrix and the classification
    report, then draws a confusion-matrix heatmap and, when probabilities are
    available, the ROC and precision-recall curves.

    Parameters:
    -----------
    metrics_dict : dict
        Dictionary containing evaluation metrics from evaluate_model
    y_true : array-like, optional
        True labels that the predictions in ``metrics_dict`` were made for.
        When omitted, the notebook-level ``y_test`` is used, which is what
        this function always relied on before the parameter existed.
    """
    if y_true is None:
        # Backward-compatible fallback to the notebook's global test labels
        y_true = y_test

    model_name = metrics_dict['model_name']
    y_pred = metrics_dict['y_pred']
    y_pred_proba = metrics_dict['y_pred_proba']

    # Print results
    print(f"=== {model_name} Evaluation ===")
    print(f"Accuracy: {metrics_dict['accuracy']:.4f}")
    print(f"Precision: {metrics_dict['precision']:.4f}")
    print(f"Recall: {metrics_dict['recall']:.4f}")
    print(f"F1 Score: {metrics_dict['f1']:.4f}")
    if metrics_dict['roc_auc'] is not None:
        print(f"ROC AUC: {metrics_dict['roc_auc']:.4f}")
        print(f"PR AUC: {metrics_dict['pr_auc']:.4f}")

    print("\nConfusion Matrix:")
    cm = confusion_matrix(y_true, y_pred)
    print(cm)

    print("\nClassification Report:")
    print(classification_report(y_true, y_pred))

    # Visualize confusion matrix
    plt.figure(figsize=(8, 6))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
                xticklabels=['Normal', 'Fraud'],
                yticklabels=['Normal', 'Fraud'])
    plt.xlabel('Predicted')
    plt.ylabel('Actual')
    plt.title(f'Confusion Matrix - {model_name}')
    plt.show()

    # Plot ROC curve if probabilities are available
    if y_pred_proba is not None:
        plt.figure(figsize=(8, 6))
        fpr, tpr, _ = roc_curve(y_true, y_pred_proba)
        plt.plot(fpr, tpr, label=f'ROC Curve (AUC = {metrics_dict["roc_auc"]:.4f})')
        plt.plot([0, 1], [0, 1], 'k--', label='Random')
        plt.xlabel('False Positive Rate')
        plt.ylabel('True Positive Rate')
        plt.title(f'ROC Curve - {model_name}')
        plt.legend()
        plt.show()

        # Plot Precision-Recall curve
        plt.figure(figsize=(8, 6))
        precision_curve, recall_curve, _ = precision_recall_curve(y_true, y_pred_proba)
        plt.plot(recall_curve, precision_curve, label=f'PR Curve (AUC = {metrics_dict["pr_auc"]:.4f})')
        # Add a line for the baseline (fraction of positive labels)
        baseline = np.mean(y_true)
        plt.axhline(y=baseline, color='r', linestyle='--', label=f'Baseline ({baseline:.4f})')
        plt.xlabel('Recall')
        plt.ylabel('Precision')
        plt.title(f'Precision-Recall Curve - {model_name}')
        plt.legend()
        plt.show()


In [None]:
# Define models to compare.
# Each entry pairs a display name with an unfitted estimator; insertion order
# is preserved in the resulting dict. SVC is built with probability=True so
# that it exposes predict_proba like the other candidates.
candidate_estimators = [
    ('Logistic Regression', LogisticRegression(C=1.0, max_iter=1000, random_state=42)),
    ('Decision Tree', DecisionTreeClassifier(max_depth=5, random_state=42)),
    ('Random Forest', RandomForestClassifier(n_estimators=100, max_depth=10, random_state=42)),
    ('Gradient Boosting', GradientBoostingClassifier(n_estimators=100, learning_rate=0.1, random_state=42)),
    ('AdaBoost', AdaBoostClassifier(n_estimators=100, random_state=42)),
    ('SVM (Linear)', SVC(kernel='linear', C=1.0, probability=True, random_state=42)),
    ('K-Nearest Neighbors', KNeighborsClassifier(n_neighbors=5)),
]
models = dict(candidate_estimators)

# Create a function for cross-validation
def cross_validate_models(models, X, y, cv=5, scoring='f1'):
    """
    Perform cross-validation on multiple models and return results.

    Parameters:
    -----------
    models : dict
        Dictionary of model name -> model
    X : array-like
        Features
    y : array-like
        Target
    cv : int or cross-validation generator
        Cross-validation strategy. An integer is the number of folds for a
        shuffled StratifiedKFold (random_state=42); any other object is passed
        to cross_val_score unchanged, so a ready-made splitter can be supplied.
    scoring : str
        Scoring metric

    Returns:
    --------
    dict
        Dictionary of model name -> array of per-fold CV scores
    """
    cv_results = {}

    # Only wrap integers in StratifiedKFold; previously a splitter object was
    # also fed in as n_splits, which contradicted the documented contract.
    if isinstance(cv, (int, np.integer)):
        splitter = StratifiedKFold(n_splits=cv, shuffle=True, random_state=42)
    else:
        splitter = cv

    # The same splitter is reused for every model so fold scores are paired
    for name, model in models.items():
        print(f"Cross-validating {name}...")
        scores = cross_val_score(model, X, y, cv=splitter, scoring=scoring)
        cv_results[name] = scores
        print(f"  Mean {scoring}: {scores.mean():.4f} (±{scores.std():.4f})\n")

    return cv_results

# Perform cross-validation with F1 score.
# SMOTE must run inside each training fold: oversampling the whole training
# set first and then splitting it puts synthetic points, interpolated from
# samples in the validation fold, into the training folds, which inflates the
# scores. Wrapping every model in an imblearn pipeline makes cross_val_score
# resample only the training part of each split and score on untouched,
# still-imbalanced validation data.
# NOTE: X_train_processed was scaled using statistics from the full training
# split, so a small amount of preprocessing leakage across folds remains.
cv_pipelines = {
    name: ImbPipeline(steps=[('smote', SMOTE(random_state=42)), ('model', model)])
    for name, model in models.items()
}
cv_results_f1 = cross_validate_models(cv_pipelines, X_train_processed, y_train, cv=5, scoring='f1')


In [None]:
# Visualize cross-validation results
cv_means = {name: scores.mean() for name, scores in cv_results_f1.items()}
cv_stds = {name: scores.std() for name, scores in cv_results_f1.items()}

# Sort by mean F1 score (best first); the std dict follows the same order
cv_means_sorted = dict(sorted(cv_means.items(), key=lambda item: item[1], reverse=True))
cv_stds_sorted = {name: cv_stds[name] for name in cv_means_sorted}

# Plot
plt.figure(figsize=(12, 6))
plt.bar(range(len(cv_means_sorted)), list(cv_means_sorted.values()),
        yerr=list(cv_stds_sorted.values()),
        tick_label=list(cv_means_sorted.keys()))
plt.xticks(rotation=45, ha='right')
plt.title('Cross-Validation Results (F1 Score)')
plt.ylabel('Mean F1 Score')
# Show the full F1 range: an axis starting at 0.7 hides any model scoring
# below that and exaggerates small gaps between bars. The upper limit leaves
# room for error bars on near-perfect scores.
plt.ylim(0, 1.05)
plt.tight_layout()
plt.show()

# Create a DataFrame for better visualization
cv_df = pd.DataFrame({
    'Model': list(cv_means_sorted.keys()),
    'Mean F1': list(cv_means_sorted.values()),
    'Std F1': list(cv_stds_sorted.values())
})

# Display table of results
cv_df


In [None]:
# Get the top 3 models based on cross-validation
top_models = list(cv_means_sorted.keys())[:3]
print(f"Top 3 models: {top_models}")

# Perform paired t-tests between the top models. The per-fold scores are
# compared pairwise, one test per unordered pair of models.
print("\nPaired t-tests for statistical significance:")
for i, model_i in enumerate(top_models):
    for model_j in top_models[i + 1:]:
        # Perform paired t-test
        t_stat, p_value = stats.ttest_rel(cv_results_f1[model_i], cv_results_f1[model_j])

        # Determine if the difference is significant (α = 0.05).
        # ttest_rel yields NaN when the test is undefined (e.g. both models
        # have identical scores on every fold); `nan < 0.05` is False, so
        # without this check that case would be reported as a genuine
        # "not significant" result.
        if np.isnan(p_value):
            significant = "undetermined (test undefined, e.g. identical fold scores)"
        elif p_value < 0.05:
            significant = "significant"
        else:
            significant = "not significant"

        print(f"{model_i} vs {model_j}:")
        print(f"  t-statistic: {t_stat:.4f}")
        print(f"  p-value: {p_value:.4f}")
        print(f"  Difference is {significant}")
        print("")


In [None]:
# Train and evaluate top models
# Each selected estimator is fitted on the resampled training data and scored
# on the processed test split; the resulting metric dicts are collected in
# selection order.
top_metrics = []

for model_name in top_models:
    print(f"\nTraining and evaluating {model_name}...")

    # fit() returns the estimator itself, so `model` is the entry stored in `models`
    model = models[model_name].fit(X_train_resampled, y_train_resampled)

    # Evaluate model
    metrics = evaluate_model(
        model,
        X_train_resampled,
        X_test_processed,
        y_train_resampled,
        y_test,
        model_name,
    )
    top_metrics.append(metrics)

    # Display metrics
    display_model_metrics(metrics)


In [None]:
# Compare ROC curves of top models
# Only models that produced probability scores can be drawn on either chart.
fig, ax = plt.subplots(figsize=(10, 8))

for metrics in top_metrics:
    if metrics['y_pred_proba'] is None:
        continue
    fpr, tpr, _ = roc_curve(y_test, metrics['y_pred_proba'])
    ax.plot(fpr, tpr, label=f"{metrics['model_name']} (AUC = {metrics['roc_auc']:.4f})")

# Add random baseline
ax.plot([0, 1], [0, 1], 'k--', label='Random')
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.set_title('ROC Curve Comparison')
ax.legend(loc='lower right')
ax.grid(True, alpha=0.3)
plt.show()

# Compare Precision-Recall curves
fig, ax = plt.subplots(figsize=(10, 8))

for metrics in top_metrics:
    if metrics['y_pred_proba'] is None:
        continue
    precision_curve, recall_curve, _ = precision_recall_curve(y_test, metrics['y_pred_proba'])
    ax.plot(recall_curve, precision_curve,
            label=f"{metrics['model_name']} (AUC = {metrics['pr_auc']:.4f})")

# Add baseline (share of positive labels in the test split)
ax.axhline(y=y_test.mean(), color='r', linestyle='--', label=f'Baseline ({y_test.mean():.4f})')
ax.set_xlabel('Recall')
ax.set_ylabel('Precision')
ax.set_title('Precision-Recall Curve Comparison')
ax.legend(loc='upper right')
ax.grid(True, alpha=0.3)
plt.show()


In [None]:
# Create voting classifier with top models
top_model_objects = [models[model_name] for model_name in top_models]
voting_clf = VotingClassifier(
    estimators=list(zip(top_models, top_model_objects)),
    voting='soft'  # Use probability estimates for voting
)

# Train the voting classifier
print("Training Voting Classifier...")
voting_clf.fit(X_train_resampled, y_train_resampled)

# Evaluate the voting classifier
voting_metrics = evaluate_model(
    voting_clf, X_train_resampled, X_test_processed,
    y_train_resampled, y_test, "Voting Ensemble"
)
display_model_metrics(voting_metrics)

# Add to metrics list for later comparison.
# Any entry left by a previous run of this cell is dropped first, so that
# re-running the cell does not list "Voting Ensemble" twice in the final
# comparison.
top_metrics = [entry for entry in top_metrics if entry['model_name'] != "Voting Ensemble"]
top_metrics.append(voting_metrics)


In [None]:
# Create a final comparison DataFrame
# One row per evaluated model; each display column is filled from the matching
# key of the metric dicts collected in top_metrics.
final_models = [metrics['model_name'] for metrics in top_metrics]

metric_columns = {
    'Accuracy': 'accuracy',
    'Precision': 'precision',
    'Recall': 'recall',
    'F1 Score': 'f1',
    'ROC AUC': 'roc_auc',
    'PR AUC': 'pr_auc',
}

comparison_data = {'Model': final_models}
for column, key in metric_columns.items():
    comparison_data[column] = [entry[key] for entry in top_metrics]

# Sort by F1 score (best first) and renumber the rows
final_df = (
    pd.DataFrame(comparison_data)
    .sort_values('F1 Score', ascending=False)
    .reset_index(drop=True)
)

# Display the final comparison
print("Final Model Comparison:")
final_df


In [None]:
# Visualize final model comparison
metrics_to_plot = ['Accuracy', 'Precision', 'Recall', 'F1 Score', 'ROC AUC', 'PR AUC']
# Use a dedicated name for the tick labels: assigning this list to `models`
# would overwrite the name -> estimator dict that the training and ensemble
# cells index by model name, so they would fail if re-run after this cell.
model_labels = final_df['Model'].tolist()

# Create a grouped bar chart: one group per model, one bar per metric
plt.figure(figsize=(14, 8))
x = np.arange(len(model_labels))
width = 0.15

for position, metric in enumerate(metrics_to_plot):
    # Shift each metric's bars right by one bar width within the group
    plt.bar(x + width * position, final_df[metric], width, label=metric)

# Add labels and legend
plt.xlabel('Model')
plt.ylabel('Score')
plt.title('Final Model Comparison')
# Centre each tick under its group of bars
plt.xticks(x + width * (len(metrics_to_plot) - 1) / 2, model_labels, rotation=45, ha='right')
plt.legend(loc='lower right')
# Full score range: with a rare positive class, precision, recall, F1 and
# PR AUC can fall far below 0.7 and would be invisible on a 0.7-1.0 axis.
plt.ylim(0, 1.05)
plt.tight_layout()
plt.grid(axis='y', alpha=0.3)
plt.show()

# Identify the best model (highest test-set F1 score)
best_model_idx = final_df['F1 Score'].idxmax()
best_model_name = final_df.loc[best_model_idx, 'Model']
best_model_f1 = final_df.loc[best_model_idx, 'F1 Score']

print(f"The best model is {best_model_name} with an F1 Score of {best_model_f1:.4f}")
