In [None]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score, 
    confusion_matrix, classification_report, roc_auc_score, roc_curve
)

# XGBoost is an optional dependency. Record whether it could be imported so the
# XGBoost cells can skip themselves instead of failing.
try:
    import xgboost as xgb
except ImportError:
    XGBOOST_AVAILABLE = False
    print('XGBoost not installed. Will skip XGBoost model.')
    print('To install: pip install xgboost')
else:
    XGBOOST_AVAILABLE = True
    print('XGBoost is available')

# Notebook-wide presentation settings: never truncate DataFrame columns and
# draw every figure with seaborn's white grid style.
sns.set_style('whitegrid')
pd.set_option('display.max_columns', None)

print('Libraries imported successfully')

## 1. Load Data and Prepare Train/Test Split

Load the engineered features and recreate the same stratified train/test split used in the previous notebooks.

In [None]:
# Load features
# The path is resolved relative to the current working directory.
PROCESSED = os.path.abspath(os.path.join('..', 'data', 'processed'))
features_path = os.path.join(PROCESSED, 'features.csv')

# Fail fast: every later cell reads `df`, so a missing file has to stop the run
# here with a clear message instead of surfacing later as a NameError.
if not os.path.exists(features_path):
    raise FileNotFoundError(f'ERROR: features.csv not found at {features_path}')

df = pd.read_csv(features_path)
print('Features loaded successfully')
print(f'Dataset shape: {df.shape}')

In [None]:
# Identify target column
# Candidates are checked in priority order; the first one present in the data wins.
possible_target_names = ['is_laundering', 'is_fraud', 'label', 'target', 'fraud', 'laundering']
target_col = next((name for name in possible_target_names if name in df.columns), None)

# Without a target nothing downstream can run, so stop with an explicit error
# rather than letting `df[None]` raise an opaque KeyError.
if target_col is None:
    raise ValueError(
        f'No target column found. Expected one of {possible_target_names}, '
        f'but the dataset has columns: {list(df.columns)}'
    )

print(f'Target column: {target_col}')
print('\nClass distribution:')
print(df[target_col].value_counts())

In [None]:
# Split the frame into model inputs (X) and the label (y).
# The label and the identifier-like columns listed here are kept out of X.
exclude_cols = [target_col, 'id', 'transaction_id', 'account_id', 'customer_id']
is_feature = ~df.columns.isin(exclude_cols)
feature_cols = df.columns[is_feature].tolist()

y = df[target_col]
X = df[feature_cols]

print(f'Features shape: {X.shape}')
print(f'Target shape: {y.shape}')

In [None]:
# Hold out 20% of the rows for evaluation. Stratifying on y keeps the class
# proportions the same in both partitions; the fixed seed makes the split
# repeatable across runs.
split_options = dict(test_size=0.2, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

print(f'Training set: {len(X_train)} samples')
print(f'Test set: {len(X_test)} samples')

## 2. Random Forest Model

Random Forest is an ensemble of decision trees that reduces overfitting and improves accuracy.

In [None]:
# Fit a class-weighted Random Forest on the training partition.
print('Training Random Forest...')
rf_params = {
    'n_estimators': 100,
    'max_depth': 15,
    'random_state': 42,
    'class_weight': 'balanced',  # reweight classes to counter the imbalance
    'n_jobs': -1,                # use every available CPU core
}
rf_model = RandomForestClassifier(**rf_params)
rf_model.fit(X_train, y_train)
print('Training complete.')

In [None]:
# Score the held-out set: hard labels for the threshold metrics, and the
# probability in column 1 of predict_proba for ROC-AUC.
y_pred_rf = rf_model.predict(X_test)
y_pred_proba_rf = rf_model.predict_proba(X_test)[:, 1]

# Each metric keeps its own name because later cells reuse them.
rf_accuracy = accuracy_score(y_test, y_pred_rf)
rf_precision = precision_score(y_test, y_pred_rf)
rf_recall = recall_score(y_test, y_pred_rf)
rf_f1 = f1_score(y_test, y_pred_rf)
rf_roc_auc = roc_auc_score(y_test, y_pred_proba_rf)

rf_banner = '=' * 60
rf_metric_lines = [
    ('Accuracy:  ', rf_accuracy),
    ('Precision: ', rf_precision),
    ('Recall:    ', rf_recall),
    ('F1 Score:  ', rf_f1),
    ('ROC-AUC:   ', rf_roc_auc),
]

print(rf_banner)
print('RANDOM FOREST - EVALUATION METRICS')
print(rf_banner)
for metric_label, metric_value in rf_metric_lines:
    print(f'{metric_label}{metric_value:.4f}')
print(rf_banner)

In [None]:
# Per-class precision / recall / F1 for the Random Forest predictions.
rf_report = classification_report(y_test, y_pred_rf)
print('\nDetailed Classification Report:')
print(rf_report)

In [None]:
# Confusion matrix for the Random Forest: rows are actual classes, columns are
# predicted classes.
rf_cm = confusion_matrix(y_test, y_pred_rf)

plt.figure(figsize=(8, 6))
sns.heatmap(rf_cm, annot=True, fmt='d', cmap='Blues', cbar=False)
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.title('Confusion Matrix - Random Forest')
plt.show()

# Spell out the four cells by name; each entry maps a label to its
# (actual, predicted) position in the matrix.
rf_cm_cells = [
    ('True Negatives:  ', (0, 0)),
    ('False Positives: ', (0, 1)),
    ('False Negatives: ', (1, 0)),
    ('True Positives:  ', (1, 1)),
]
print('\nConfusion Matrix:')
for cell_label, (actual_idx, predicted_idx) in rf_cm_cells:
    print(f'{cell_label}{rf_cm[actual_idx, predicted_idx]}')

In [None]:
# Rank features by the forest's importances, highest first.
importance_table = {
    'Feature': feature_cols,
    'Importance': rf_model.feature_importances_,
}
feature_importance = pd.DataFrame(importance_table).sort_values('Importance', ascending=False)
top_features = feature_importance.head(10)

print('\nTop 10 Most Important Features:')
print(top_features.to_string(index=False))

# Horizontal bar chart of the same ten features; the y-axis is inverted so the
# most important feature sits at the top.
plt.figure(figsize=(10, 6))
plt.barh(top_features['Feature'], top_features['Importance'])
plt.title('Top 10 Feature Importances - Random Forest')
plt.xlabel('Importance')
plt.gca().invert_yaxis()
plt.tight_layout()
plt.show()

## 3. XGBoost Model

XGBoost is a powerful gradient boosting algorithm. We'll train it if available.

In [None]:
if not XGBOOST_AVAILABLE:
    print('XGBoost is not available. Skipping this model.')
else:
    # Weight the positive class by the negative-to-positive ratio observed in
    # the training labels to compensate for the class imbalance.
    n_negative = (y_train == 0).sum()
    n_positive = (y_train == 1).sum()
    scale_pos_weight = n_negative / n_positive
    print(f'Scale pos weight: {scale_pos_weight:.2f}')

    # Fit the gradient-boosted model on the training partition.
    print('\nTraining XGBoost...')
    xgb_params = {
        'n_estimators': 100,
        'max_depth': 6,
        'learning_rate': 0.1,
        'random_state': 42,
        'scale_pos_weight': scale_pos_weight,
        'eval_metric': 'logloss',
    }
    xgb_model = xgb.XGBClassifier(**xgb_params)
    xgb_model.fit(X_train, y_train)
    print('Training complete.')

In [None]:
if not XGBOOST_AVAILABLE:
    print('XGBoost is not available.')
else:
    # Score the held-out set: hard labels for the threshold metrics, and the
    # probability in column 1 of predict_proba for ROC-AUC.
    y_pred_xgb = xgb_model.predict(X_test)
    y_pred_proba_xgb = xgb_model.predict_proba(X_test)[:, 1]

    # Each metric keeps its own name because later cells reuse them.
    xgb_accuracy = accuracy_score(y_test, y_pred_xgb)
    xgb_precision = precision_score(y_test, y_pred_xgb)
    xgb_recall = recall_score(y_test, y_pred_xgb)
    xgb_f1 = f1_score(y_test, y_pred_xgb)
    xgb_roc_auc = roc_auc_score(y_test, y_pred_proba_xgb)

    xgb_banner = '=' * 60
    xgb_metric_lines = [
        ('Accuracy:  ', xgb_accuracy),
        ('Precision: ', xgb_precision),
        ('Recall:    ', xgb_recall),
        ('F1 Score:  ', xgb_f1),
        ('ROC-AUC:   ', xgb_roc_auc),
    ]

    print(xgb_banner)
    print('XGBOOST - EVALUATION METRICS')
    print(xgb_banner)
    for metric_label, metric_value in xgb_metric_lines:
        print(f'{metric_label}{metric_value:.4f}')
    print(xgb_banner)

In [None]:
if not XGBOOST_AVAILABLE:
    print('XGBoost is not available.')
else:
    # Per-class precision / recall / F1 for the XGBoost predictions.
    xgb_report = classification_report(y_test, y_pred_xgb)
    print('\nDetailed Classification Report:')
    print(xgb_report)

In [None]:
if not XGBOOST_AVAILABLE:
    print('XGBoost is not available.')
else:
    # Confusion matrix for XGBoost: rows are actual classes, columns are
    # predicted classes.
    xgb_cm = confusion_matrix(y_test, y_pred_xgb)

    plt.figure(figsize=(8, 6))
    sns.heatmap(xgb_cm, annot=True, fmt='d', cmap='Greens', cbar=False)
    plt.xlabel('Predicted')
    plt.ylabel('Actual')
    plt.title('Confusion Matrix - XGBoost')
    plt.show()

    # Spell out the four cells by name; each entry maps a label to its
    # (actual, predicted) position in the matrix.
    xgb_cm_cells = [
        ('True Negatives:  ', (0, 0)),
        ('False Positives: ', (0, 1)),
        ('False Negatives: ', (1, 0)),
        ('True Positives:  ', (1, 1)),
    ]
    print('\nConfusion Matrix:')
    for cell_label, (actual_idx, predicted_idx) in xgb_cm_cells:
        print(f'{cell_label}{xgb_cm[actual_idx, predicted_idx]}')

In [None]:
if not XGBOOST_AVAILABLE:
    print('XGBoost is not available.')
else:
    # Rank features by the booster's importances, highest first.
    xgb_importance_table = {
        'Feature': feature_cols,
        'Importance': xgb_model.feature_importances_,
    }
    xgb_feature_importance = pd.DataFrame(xgb_importance_table).sort_values(
        'Importance', ascending=False
    )
    top_features_xgb = xgb_feature_importance.head(10)

    print('\nTop 10 Most Important Features (XGBoost):')
    print(top_features_xgb.to_string(index=False))

    # Horizontal bar chart of the same ten features; the y-axis is inverted so
    # the most important feature sits at the top.
    plt.figure(figsize=(10, 6))
    plt.barh(top_features_xgb['Feature'], top_features_xgb['Importance'])
    plt.title('Top 10 Feature Importances - XGBoost')
    plt.xlabel('Importance')
    plt.gca().invert_yaxis()
    plt.tight_layout()
    plt.show()

## 4. ROC Curves

Plot ROC curves to visualize the trade-off between true positive rate and false positive rate.

In [None]:
# Overlay the ROC curve of every trained model on a single figure.
plt.figure(figsize=(10, 7))

fpr_rf, tpr_rf, _ = roc_curve(y_test, y_pred_proba_rf)
rf_curve_label = f'Random Forest (AUC = {rf_roc_auc:.4f})'
plt.plot(fpr_rf, tpr_rf, linewidth=2, label=rf_curve_label)

# The XGBoost curve is drawn only when that model was actually trained.
if XGBOOST_AVAILABLE:
    fpr_xgb, tpr_xgb, _ = roc_curve(y_test, y_pred_proba_xgb)
    xgb_curve_label = f'XGBoost (AUC = {xgb_roc_auc:.4f})'
    plt.plot(fpr_xgb, tpr_xgb, linewidth=2, label=xgb_curve_label)

# Dashed diagonal from (0, 0) to (1, 1) as the chance-level reference.
plt.plot([0, 1], [0, 1], 'k--', linewidth=1, label='Random Classifier')

plt.title('ROC Curves - Advanced Models')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.legend()
plt.grid(alpha=0.3)
plt.tight_layout()
plt.show()

## 5. Compare Advanced Models

Side-by-side comparison of Random Forest and XGBoost.

In [None]:
# Build the comparison table one row per model; the XGBoost row is added only
# when that model was trained.
comparison_rows = [{
    'Model': 'Random Forest',
    'Accuracy': rf_accuracy,
    'Precision': rf_precision,
    'Recall': rf_recall,
    'F1 Score': rf_f1,
    'ROC-AUC': rf_roc_auc,
}]
if XGBOOST_AVAILABLE:
    comparison_rows.append({
        'Model': 'XGBoost',
        'Accuracy': xgb_accuracy,
        'Precision': xgb_precision,
        'Recall': xgb_recall,
        'F1 Score': xgb_f1,
        'ROC-AUC': xgb_roc_auc,
    })
comparison = pd.DataFrame(comparison_rows)

comparison_rule = '=' * 85
print('\n' + comparison_rule)
print('ADVANCED MODELS COMPARISON')
print(comparison_rule)
print(comparison.to_string(index=False))
print(comparison_rule)

In [None]:
# Bar chart of the headline metrics: grouped bars when both models exist,
# a single series when only the Random Forest was trained.
metrics = ['Accuracy', 'Precision', 'Recall', 'F1 Score', 'ROC-AUC']
rf_scores = [rf_accuracy, rf_precision, rf_recall, rf_f1, rf_roc_auc]

if XGBOOST_AVAILABLE:
    xgb_scores = [xgb_accuracy, xgb_precision, xgb_recall, xgb_f1, xgb_roc_auc]

    # Offset each model's bars by half a bar width around the shared tick.
    x = np.arange(len(metrics))
    width = 0.35

    fig, ax = plt.subplots(figsize=(12, 6))
    ax.bar(x - width/2, rf_scores, width, label='Random Forest', alpha=0.8)
    ax.bar(x + width/2, xgb_scores, width, label='XGBoost', alpha=0.8)

    ax.set_title('Advanced Models Performance Comparison')
    ax.set_xticks(x)
    ax.set_xticklabels(metrics)
    ax.legend()
else:
    print('Only Random Forest is available for comparison.')

    fig, ax = plt.subplots(figsize=(10, 6))
    ax.bar(metrics, rf_scores, alpha=0.8, color='steelblue')
    ax.set_title('Random Forest Performance')

# Axis styling shared by both layouts; the y-axis is pinned to [0, 1].
ax.set_ylabel('Score')
ax.set_ylim([0, 1])
ax.grid(axis='y', alpha=0.3)

plt.tight_layout()
plt.show()

## Summary

### Advanced Models Performance:

**Random Forest:**
- Ensemble of decision trees that reduces overfitting
- Uses class weighting to handle imbalance
- Provides feature importance rankings
- Generally more robust than single decision trees

**XGBoost** (if available):
- Gradient boosting algorithm - builds trees sequentially
- Often achieves state-of-the-art performance
- Uses `scale_pos_weight` to handle class imbalance
- More computationally intensive but often more accurate

### Key Observations:

1. **ROC-AUC** provides a threshold-independent measure of model performance
2. **Feature importance** helps understand which features drive predictions
3. Tree ensembles such as these often outperform simple baselines (Logistic Regression, Decision Tree); confirm this against the baseline notebook's metrics rather than assuming it
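4. For AML detection, prioritize **recall** - a missed laundering case is usually costlier than a false alert - while still tracking precision so the volume of false positives stays manageable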

### Next Steps:
- Optimize decision thresholds to improve recall
- Compare all models (baseline + advanced) in one place
- Select the best model for final deployment