In [None]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report

# Notebook-wide presentation settings: seaborn grid theme for every figure,
# and no cap on the number of DataFrame columns pandas will render.
sns.set_style('whitegrid')
pd.options.display.max_columns = None

print('Libraries imported successfully')

## 1. Load Features Dataset

Load the engineered features (`features.csv`) produced by the previous notebook, `03_feature_engineering.ipynb`.

In [None]:
# Define path to features file.
# The '..' segment is resolved against the kernel's current working directory,
# so the resulting absolute path depends on where the notebook is launched from.
PROCESSED = os.path.abspath(os.path.join('..', 'data', 'processed'))
features_path = os.path.join(PROCESSED, 'features.csv')

# Fail fast when the input is missing. Merely printing a message would leave
# `df` undefined, and a Run All would then die later with an unrelated
# NameError instead of pointing at the real cause.
# isfile (rather than exists) also rejects a directory that happens to carry
# the expected name.
if not os.path.isfile(features_path):
    raise FileNotFoundError(
        f'features.csv not found at {features_path}. '
        'Please run notebook 03_feature_engineering.ipynb first to generate the features file.'
    )

# Read the engineered features into the DataFrame used by all later cells.
df = pd.read_csv(features_path)
print(f'Features loaded successfully from {features_path}')
print(f'Dataset shape: {df.shape}')

In [None]:
# Preview the top five rows; leaving the expression last in the cell lets
# Jupyter render it as a rich table.
df.head(n=5)

In [None]:
# Schema overview: the column names as a plain list, then the dtype pandas
# inferred for each column.
print('Columns in dataset:', df.columns.tolist(), sep='\n')
print('\nData types:', df.dtypes, sep='\n')

In [None]:
# Count the null entries in every column (isna is the canonical spelling of
# isnull; both produce the same boolean mask).
print('Missing values per column:', df.isna().sum(), sep='\n')

## 2. Define Features and Target

Separate the features (X) from the target label (y). The target is typically named 'is_laundering', 'is_fraud', or similar.

In [None]:
# Identify the target column by probing a list of names commonly used for
# AML / fraud labels. Candidates are tried in order and the first one that is
# present in the dataset is selected.
possible_target_names = ['is_laundering', 'is_fraud', 'label', 'target', 'fraud', 'laundering']
target_col = next(
    (name for name in possible_target_names if name in df.columns),
    None,
)

# Fail fast when nothing matched. Carrying on with target_col = None would only
# surface later as a far less helpful lookup error when df is indexed by it.
if target_col is None:
    raise ValueError(
        'Could not find target column. Please specify manually. '
        f'Available columns: {df.columns.tolist()}'
    )

print(f'Target column identified: {target_col}')

# Check class distribution: absolute counts first, then relative frequencies.
print('\nClass distribution:')
print(df[target_col].value_counts())
print('\nClass proportions:')
print(df[target_col].value_counts(normalize=True))

In [None]:
# Build the modelling inputs.
# The label itself and the listed identifier-style columns are kept out of the
# feature matrix; every other column of df is used as a feature, in its
# original order.
exclude_cols = [target_col, 'id', 'transaction_id', 'account_id', 'customer_id']
feature_cols = list(filter(lambda name: name not in exclude_cols, df.columns))

X, y = df[feature_cols], df[target_col]

print(
    f'Features (X) shape: {X.shape}',
    f'Target (y) shape: {y.shape}',
    f'\nFeature columns ({len(feature_cols)}):',
    feature_cols,
    sep='\n',
)

## 3. Stratified Train/Test Split

Split the data into training and testing sets using stratified sampling to maintain class proportions.

In [None]:
# 80/20 hold-out split. Passing stratify=y keeps the class proportions of y in
# both partitions; the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Report partition sizes, then the per-class counts of each partition.
print(
    f'Training set size: {X_train.shape[0]} samples',
    f'Test set size: {X_test.shape[0]} samples',
    sep='\n',
)
print('\nTraining set class distribution:', y_train.value_counts(), sep='\n')
print('\nTest set class distribution:', y_test.value_counts(), sep='\n')

## 4. Baseline Model 1: Logistic Regression

Train a simple logistic regression model as our first baseline.

In [None]:
# Fit the first baseline: logistic regression with a raised iteration cap and a
# fixed seed. fit() returns the estimator itself, so construction and training
# can be chained.
print('Training Logistic Regression...')
lr_model = LogisticRegression(max_iter=1000, random_state=42).fit(X_train, y_train)
print('Training complete.')

In [None]:
# Score the logistic regression on the held-out test set.
y_pred_lr = lr_model.predict(X_test)

# Evaluate the four headline metrics in a fixed order and unpack them into the
# names that the comparison cells read later.
lr_accuracy, lr_precision, lr_recall, lr_f1 = (
    score_fn(y_test, y_pred_lr)
    for score_fn in (accuracy_score, precision_score, recall_score, f1_score)
)

# Framed summary, four decimals per metric.
print(
    '=' * 60,
    'LOGISTIC REGRESSION - EVALUATION METRICS',
    '=' * 60,
    f'Accuracy:  {lr_accuracy:.4f}',
    f'Precision: {lr_precision:.4f}',
    f'Recall:    {lr_recall:.4f}',
    f'F1 Score:  {lr_f1:.4f}',
    '=' * 60,
    sep='\n',
)

In [None]:
# Per-class precision / recall / F1 / support for the logistic regression.
print(
    '\nDetailed Classification Report:',
    classification_report(y_test, y_pred_lr),
    sep='\n',
)

In [None]:
# Confusion matrix for the logistic regression: rows are the actual classes,
# columns the predicted classes.
lr_cm = confusion_matrix(y_test, y_pred_lr)

# Draw the matrix as an annotated heatmap on an explicit Axes.
fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(lr_cm, annot=True, fmt='d', cmap='Blues', cbar=False, ax=ax)
ax.set_title('Confusion Matrix - Logistic Regression')
ax.set_ylabel('Actual')
ax.set_xlabel('Predicted')
plt.show()

# Spell out the four cells; the indexing assumes a 2x2 (binary) matrix.
print(
    '\nConfusion Matrix:',
    f'True Negatives:  {lr_cm[0, 0]}',
    f'False Positives: {lr_cm[0, 1]}',
    f'False Negatives: {lr_cm[1, 0]}',
    f'True Positives:  {lr_cm[1, 1]}',
    sep='\n',
)

## 5. Baseline Model 2: Decision Tree

Train a decision tree classifier as our second baseline model.

In [None]:
# Fit the second baseline: a decision tree capped at depth 10 with a fixed
# seed. fit() returns the estimator, so construction and training are chained.
print('Training Decision Tree...')
dt_model = DecisionTreeClassifier(max_depth=10, random_state=42).fit(X_train, y_train)
print('Training complete.')

In [None]:
# Score the decision tree on the held-out test set.
y_pred_dt = dt_model.predict(X_test)

# Evaluate the four headline metrics in a fixed order and unpack them into the
# names that the comparison cells read later.
dt_accuracy, dt_precision, dt_recall, dt_f1 = (
    score_fn(y_test, y_pred_dt)
    for score_fn in (accuracy_score, precision_score, recall_score, f1_score)
)

# Framed summary, four decimals per metric.
print(
    '=' * 60,
    'DECISION TREE - EVALUATION METRICS',
    '=' * 60,
    f'Accuracy:  {dt_accuracy:.4f}',
    f'Precision: {dt_precision:.4f}',
    f'Recall:    {dt_recall:.4f}',
    f'F1 Score:  {dt_f1:.4f}',
    '=' * 60,
    sep='\n',
)

In [None]:
# Per-class precision / recall / F1 / support for the decision tree.
print(
    '\nDetailed Classification Report:',
    classification_report(y_test, y_pred_dt),
    sep='\n',
)

In [None]:
# Confusion matrix for the decision tree: rows are the actual classes,
# columns the predicted classes.
dt_cm = confusion_matrix(y_test, y_pred_dt)

# Draw the matrix as an annotated heatmap on an explicit Axes.
fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(dt_cm, annot=True, fmt='d', cmap='Greens', cbar=False, ax=ax)
ax.set_title('Confusion Matrix - Decision Tree')
ax.set_ylabel('Actual')
ax.set_xlabel('Predicted')
plt.show()

# Spell out the four cells; the indexing assumes a 2x2 (binary) matrix.
print(
    '\nConfusion Matrix:',
    f'True Negatives:  {dt_cm[0, 0]}',
    f'False Positives: {dt_cm[0, 1]}',
    f'False Negatives: {dt_cm[1, 0]}',
    f'True Positives:  {dt_cm[1, 1]}',
    sep='\n',
)

## 6. Compare Baseline Models

Quick comparison of both baseline models.

In [None]:
# Side-by-side summary of both baselines: one record (row) per model, one
# column per metric.
comparison = pd.DataFrame([
    {
        'Model': 'Logistic Regression',
        'Accuracy': lr_accuracy,
        'Precision': lr_precision,
        'Recall': lr_recall,
        'F1 Score': lr_f1,
    },
    {
        'Model': 'Decision Tree',
        'Accuracy': dt_accuracy,
        'Precision': dt_precision,
        'Recall': dt_recall,
        'F1 Score': dt_f1,
    },
])

# Framed plain-text table without the integer index.
print(
    '\n' + '=' * 80,
    'BASELINE MODELS COMPARISON',
    '=' * 80,
    comparison.to_string(index=False),
    '=' * 80,
    sep='\n',
)

In [None]:
# Grouped bar chart: one group per metric, one bar per model.
metrics = ['Accuracy', 'Precision', 'Recall', 'F1 Score']
x = np.arange(len(metrics))
width = 0.35

fig, ax = plt.subplots(figsize=(10, 6))

# Each model's bars are shifted half a bar width to either side of the tick.
ax.bar(
    x - width/2,
    [lr_accuracy, lr_precision, lr_recall, lr_f1],
    width,
    label='Logistic Regression',
    alpha=0.8,
)
ax.bar(
    x + width/2,
    [dt_accuracy, dt_precision, dt_recall, dt_f1],
    width,
    label='Decision Tree',
    alpha=0.8,
)

# Axis cosmetics; the y-range is pinned to [0, 1] for the plotted scores.
ax.set_xticks(x)
ax.set_xticklabels(metrics)
ax.set(ylabel='Score', title='Baseline Models Performance Comparison', ylim=(0, 1))
ax.legend()
ax.grid(axis='y', alpha=0.3)

fig.tight_layout()
plt.show()

## Summary

We have successfully built and evaluated two baseline models:

1. **Logistic Regression**: Simple linear model that provides a good starting point
2. **Decision Tree**: Non-linear model that can capture more complex patterns

**Key Observations:**
- These baseline models give us a performance benchmark to improve upon
- In AML detection, **recall** is often more important than precision: we want to catch as many suspicious (laundering) transactions as possible, even at the cost of more false alerts
- The class imbalance may be affecting model performance

**Next Steps:**
- Handle class imbalance using techniques like class weighting or resampling
- Try more advanced models (Random Forest, XGBoost)
- Tune decision thresholds to optimize for recall