In [None]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import RandomOverSampler

# Notebook-wide display configuration:
#   - pandas: never truncate the column list when a frame is shown
#   - seaborn: white background with grid lines for every figure
pd.options.display.max_columns = None
sns.set_style(style='whitegrid')

print('Libraries imported successfully')

## 1. Load Data and Prepare Train/Test Split

Load the engineered features and recreate the train/test split used for the baseline (stratified, 80/20, `random_state=42`).

In [None]:
# Load features
# The processed feature table is expected at ../data/processed/features.csv,
# resolved relative to the notebook's working directory.
PROCESSED = os.path.abspath(os.path.join('..', 'data', 'processed'))
features_path = os.path.join(PROCESSED, 'features.csv')

# Fail fast: every later cell needs `df`, so a missing file must stop the run
# here with a clear message instead of surfacing later as a NameError.
if not os.path.exists(features_path):
    raise FileNotFoundError(f'features.csv not found at {features_path}')

df = pd.read_csv(features_path)
print('Features loaded successfully')
print(f'Dataset shape: {df.shape}')

In [None]:
# Identify target column
# Candidate names are tried in priority order; the first one present in the
# frame becomes the label column.
possible_target_names = ['is_laundering', 'is_fraud', 'label', 'target', 'fraud', 'laundering']
target_col = next(
    (name for name in possible_target_names if name in df.columns),
    None,
)

# Without a label column nothing downstream can run; stop with an actionable
# message rather than letting `df[None]` raise an opaque KeyError.
if target_col is None:
    raise ValueError(
        f'No target column found. Expected one of {possible_target_names}; '
        f'available columns: {list(df.columns)}'
    )

print(f'Target column: {target_col}')

# Check class distribution
print(f'\nOriginal class distribution:')
print(df[target_col].value_counts())
print(f'\nClass proportions:')
print(df[target_col].value_counts(normalize=True))

In [None]:
# Split the frame into predictors (X) and label (y).
# The label itself and any identifier-like columns are kept out of the
# predictor set.
exclude_cols = [target_col, 'id', 'transaction_id', 'account_id', 'customer_id']
feature_cols = list(filter(lambda name: name not in exclude_cols, df.columns))

y = df[target_col]
X = df[feature_cols]

print(f'Features shape: {X.shape}')
print(f'Target shape: {y.shape}')

In [None]:
# Hold out 20% of the rows for testing. Stratifying on y keeps the class
# ratio the same in both partitions; the fixed seed makes the split repeatable.
split_parts = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = split_parts

print(f'Training set: {len(X_train)} samples')
print(f'Test set: {len(X_test)} samples')
print(f'\nTraining set class distribution:')
print(y_train.value_counts())

## 2. Baseline (No Imbalance Handling)

First, let's train a standard Logistic Regression as our baseline for comparison.

In [None]:
# Train baseline Logistic Regression
# Reference model: plain logistic regression fitted on the untouched training
# split and scored on the held-out test split. Every later technique is
# compared against these numbers.
print('Training baseline Logistic Regression (no imbalance handling)...')
baseline_model = LogisticRegression(random_state=42, max_iter=1000)
baseline_model.fit(X_train, y_train)

# Predictions
y_pred_baseline = baseline_model.predict(X_test)

# Metrics
# An unweighted model on imbalanced data may predict no positives at all,
# which makes precision (and therefore F1) a 0/0 ratio. zero_division=0 makes
# that case an explicit 0.0 instead of an UndefinedMetricWarning.
baseline_accuracy = accuracy_score(y_test, y_pred_baseline)
baseline_precision = precision_score(y_test, y_pred_baseline, zero_division=0)
baseline_recall = recall_score(y_test, y_pred_baseline)
baseline_f1 = f1_score(y_test, y_pred_baseline, zero_division=0)

print('\n' + '=' * 60)
print('BASELINE (No Imbalance Handling)')
print('=' * 60)
print(f'Accuracy:  {baseline_accuracy:.4f}')
print(f'Precision: {baseline_precision:.4f}')
print(f'Recall:    {baseline_recall:.4f}')
print(f'F1 Score:  {baseline_f1:.4f}')
print('=' * 60)

## 3. Technique 1: Class Weighting (Balanced)

Use `class_weight='balanced'` to automatically adjust weights inversely proportional to class frequencies.

In [None]:
# Logistic regression with class_weight='balanced', fitted on the original
# (un-resampled) training split.
print('Training Logistic Regression with class_weight="balanced"...')
balanced_model = LogisticRegression(
    random_state=42,
    max_iter=1000,
    class_weight='balanced',
).fit(X_train, y_train)

# Score the held-out test split
y_pred_balanced = balanced_model.predict(X_test)

# Same four metrics as every other technique, computed in a fixed order
balanced_accuracy, balanced_precision, balanced_recall, balanced_f1 = (
    scorer(y_test, y_pred_balanced)
    for scorer in (accuracy_score, precision_score, recall_score, f1_score)
)

# Emit the report as a single write; the text matches line-by-line printing
rule = '=' * 60
report_lines = [
    rule,
    'CLASS WEIGHTING (Balanced)',
    rule,
    f'Accuracy:  {balanced_accuracy:.4f}',
    f'Precision: {balanced_precision:.4f}',
    f'Recall:    {balanced_recall:.4f}',
    f'F1 Score:  {balanced_f1:.4f}',
    rule,
]
print('\n' + '\n'.join(report_lines))

In [None]:
# Confusion matrix for the class-weighted model (rows = actual, cols = predicted)
balanced_cm = confusion_matrix(y_test, y_pred_balanced)

fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(balanced_cm, annot=True, fmt='d', cmap='Blues', cbar=False, ax=ax)
ax.set_title('Confusion Matrix - Class Weighting (Balanced)')
ax.set_ylabel('Actual')
ax.set_xlabel('Predicted')
plt.show()

# Binary problem: the 2x2 matrix flattens row-major to TN, FP, FN, TP
tn, fp, fn, tp = balanced_cm.ravel()
print(f'True Negatives:  {tn}')
print(f'False Positives: {fp}')
print(f'False Negatives: {fn}')
print(f'True Positives:  {tp}')

## 4. Technique 2: Random Under-Sampling

Reduce the majority class by randomly removing samples to match the minority class size.

In [None]:
# Resample the training split only with a seeded RandomUnderSampler;
# the test split is left untouched.
print('Applying Random Under-Sampling...')
rus = RandomUnderSampler(random_state=42)
resampled_rus = rus.fit_resample(X_train, y_train)
X_train_rus, y_train_rus = resampled_rus

print(f'Original training set: {len(X_train)} samples')
print(f'After under-sampling: {len(X_train_rus)} samples')
print(f'\nClass distribution after under-sampling:')
print(y_train_rus.value_counts())

In [None]:
# Logistic regression fitted on the under-sampled training data
print('Training Logistic Regression on under-sampled data...')
rus_model = LogisticRegression(random_state=42, max_iter=1000).fit(
    X_train_rus, y_train_rus
)

# Evaluation always uses the original, un-resampled test split
y_pred_rus = rus_model.predict(X_test)

# Same four metrics as every other technique, computed in a fixed order
rus_accuracy, rus_precision, rus_recall, rus_f1 = (
    scorer(y_test, y_pred_rus)
    for scorer in (accuracy_score, precision_score, recall_score, f1_score)
)

# Emit the report as a single write; the text matches line-by-line printing
rule = '=' * 60
report_lines = [
    rule,
    'RANDOM UNDER-SAMPLING',
    rule,
    f'Accuracy:  {rus_accuracy:.4f}',
    f'Precision: {rus_precision:.4f}',
    f'Recall:    {rus_recall:.4f}',
    f'F1 Score:  {rus_f1:.4f}',
    rule,
]
print('\n' + '\n'.join(report_lines))

In [None]:
# Confusion matrix for the under-sampled model (rows = actual, cols = predicted)
rus_cm = confusion_matrix(y_test, y_pred_rus)

fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(rus_cm, annot=True, fmt='d', cmap='Greens', cbar=False, ax=ax)
ax.set_title('Confusion Matrix - Random Under-Sampling')
ax.set_ylabel('Actual')
ax.set_xlabel('Predicted')
plt.show()

# Binary problem: the 2x2 matrix flattens row-major to TN, FP, FN, TP
tn, fp, fn, tp = rus_cm.ravel()
print(f'True Negatives:  {tn}')
print(f'False Positives: {fp}')
print(f'False Negatives: {fn}')
print(f'True Positives:  {tp}')

## 5. Technique 3: Random Over-Sampling

Increase the minority class by randomly duplicating samples to match the majority class size.

In [None]:
# Resample the training split only with a seeded RandomOverSampler;
# the test split is left untouched.
print('Applying Random Over-Sampling...')
ros = RandomOverSampler(random_state=42)
resampled_ros = ros.fit_resample(X_train, y_train)
X_train_ros, y_train_ros = resampled_ros

print(f'Original training set: {len(X_train)} samples')
print(f'After over-sampling: {len(X_train_ros)} samples')
print(f'\nClass distribution after over-sampling:')
print(y_train_ros.value_counts())

In [None]:
# Logistic regression fitted on the over-sampled training data
print('Training Logistic Regression on over-sampled data...')
ros_model = LogisticRegression(random_state=42, max_iter=1000).fit(
    X_train_ros, y_train_ros
)

# Evaluation always uses the original, un-resampled test split
y_pred_ros = ros_model.predict(X_test)

# Same four metrics as every other technique, computed in a fixed order
ros_accuracy, ros_precision, ros_recall, ros_f1 = (
    scorer(y_test, y_pred_ros)
    for scorer in (accuracy_score, precision_score, recall_score, f1_score)
)

# Emit the report as a single write; the text matches line-by-line printing
rule = '=' * 60
report_lines = [
    rule,
    'RANDOM OVER-SAMPLING',
    rule,
    f'Accuracy:  {ros_accuracy:.4f}',
    f'Precision: {ros_precision:.4f}',
    f'Recall:    {ros_recall:.4f}',
    f'F1 Score:  {ros_f1:.4f}',
    rule,
]
print('\n' + '\n'.join(report_lines))

In [None]:
# Confusion matrix for the over-sampled model (rows = actual, cols = predicted)
ros_cm = confusion_matrix(y_test, y_pred_ros)

fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(ros_cm, annot=True, fmt='d', cmap='Oranges', cbar=False, ax=ax)
ax.set_title('Confusion Matrix - Random Over-Sampling')
ax.set_ylabel('Actual')
ax.set_xlabel('Predicted')
plt.show()

# Binary problem: the 2x2 matrix flattens row-major to TN, FP, FN, TP
tn, fp, fn, tp = ros_cm.ravel()
print(f'True Negatives:  {tn}')
print(f'False Positives: {fp}')
print(f'False Negatives: {fn}')
print(f'True Positives:  {tp}')

## 6. Compare All Imbalance Handling Techniques

Side-by-side comparison of all approaches.

In [None]:
# One row per technique, one column per metric, in the order they were run
comparison_rows = [
    ('Baseline (No Handling)', baseline_accuracy, baseline_precision, baseline_recall, baseline_f1),
    ('Class Weighting', balanced_accuracy, balanced_precision, balanced_recall, balanced_f1),
    ('Under-Sampling', rus_accuracy, rus_precision, rus_recall, rus_f1),
    ('Over-Sampling', ros_accuracy, ros_precision, ros_recall, ros_f1),
]
comparison = pd.DataFrame(
    comparison_rows,
    columns=['Technique', 'Accuracy', 'Precision', 'Recall', 'F1 Score'],
)

wide_rule = '=' * 90
print('\n' + wide_rule)
print('COMPARISON OF IMBALANCE HANDLING TECHNIQUES')
print(wide_rule)
print(comparison.to_string(index=False))
print(wide_rule)

In [None]:
# Two side-by-side bar charts: recall (left) and F1 (right), one bar per technique
techniques = ['Baseline', 'Balanced', 'Under-Sample', 'Over-Sample']
x_pos = np.arange(len(techniques))
recalls = [baseline_recall, balanced_recall, rus_recall, ros_recall]
f1_scores = [baseline_f1, balanced_f1, rus_f1, ros_f1]

fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# (values, y-axis label, panel title) for each panel, left to right
panel_specs = [
    (recalls, 'Recall', 'Recall Comparison (Higher is Better for AML)'),
    (f1_scores, 'F1 Score', 'F1 Score Comparison'),
]

for panel, (values, y_label, panel_title) in zip(axes, panel_specs):
    panel.bar(x_pos, values, color=['gray', 'blue', 'green', 'orange'], alpha=0.7)
    panel.set_ylabel(y_label)
    panel.set_title(panel_title)
    panel.set_xticks(x_pos)
    panel.set_xticklabels(techniques, rotation=15, ha='right')
    panel.set_ylim([0, 1])  # scores are fractions, so fix a common 0-1 scale
    panel.grid(axis='y', alpha=0.3)

plt.tight_layout()
plt.show()

In [None]:
# Grouped bar chart: four metric groups on the x-axis, one bar per technique
metrics = ['Accuracy', 'Precision', 'Recall', 'F1 Score']
x = np.arange(len(metrics))
width = 0.2

# (offset in bar-widths from the group centre, legend label, metric values)
bar_series = [
    (-1.5, 'Baseline', [baseline_accuracy, baseline_precision, baseline_recall, baseline_f1]),
    (-0.5, 'Class Weighting', [balanced_accuracy, balanced_precision, balanced_recall, balanced_f1]),
    (0.5, 'Under-Sampling', [rus_accuracy, rus_precision, rus_recall, rus_f1]),
    (1.5, 'Over-Sampling', [ros_accuracy, ros_precision, ros_recall, ros_f1]),
]

fig, ax = plt.subplots(figsize=(12, 6))
for shift, series_label, series_values in bar_series:
    ax.bar(x + shift * width, series_values, width, label=series_label, alpha=0.8)

ax.set_ylabel('Score')
ax.set_title('All Metrics Comparison')
ax.set_xticks(x)
ax.set_xticklabels(metrics)
ax.legend()
ax.set_ylim([0, 1])
ax.grid(axis='y', alpha=0.3)

plt.tight_layout()
plt.show()

## Summary and Recommendations

### Key Findings:

**1. Baseline (No Imbalance Handling):**
- May have high accuracy but likely poor recall for the minority class (fraud cases)
- Not suitable for AML detection where missing fraud is costly

**2. Class Weighting (Balanced):**
- Simple to implement - just add `class_weight='balanced'` parameter
- Usually improves recall significantly
- No change to training data size
- Good first approach for imbalanced problems

**3. Random Under-Sampling:**
- Reduces training data size significantly
- May lose important information from majority class
- Often improves recall but can reduce precision
- Fast training due to smaller dataset

**4. Random Over-Sampling:**
- Increases training data size by duplicating minority samples
- Risk of overfitting to minority class
- Usually improves recall
- Slower training due to larger dataset

### Which Method Helped Most?

For **AML detection**, we prioritize **recall** (catching as many fraud cases as possible) over accuracy.

As general guidance (verify against the comparison table in Section 6 for this dataset):
- **Class weighting** is often the best starting point - simple and effective
- **Over-sampling** can provide better recall if you have sufficient compute resources
- **Under-sampling** works well when you have a very large dataset

### Recommendation:

Use **class weighting** or **over-sampling** for the next steps with advanced models. These techniques balance the trade-off between catching fraud cases and maintaining reasonable precision.