# Complete ML Pipeline - Fraud Detection

**Phases:**
1. Data Preparation (72 features)
2. Base Model Training (5 models)
3. Hyperparameter Tuning
4. Post-Tuning Evaluation
5. Threshold Optimization
6. Save Final Models
7. Visualization and Reporting

---

## Phase 1: Data Preparation

In [None]:
# Data-handling libraries used throughout the notebook.
import pandas as pd
import numpy as np
import warnings
# NOTE(review): this suppresses every warning category for the rest of the
# session, so any warning emitted by later cells will not be shown.
warnings.filterwarnings('ignore')

# Train/test splitting, feature standardisation and SMOTE oversampling.
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE

print("Libraries imported successfully")

In [None]:
# Read the feature table from disk and report its dimensions.
data = pd.read_csv('../../data/reduced_df.csv')
print(f"Dataset loaded: {data.shape}")

# Data-quality summary: total missing cells and fully duplicated rows.
# The null count is only reported here; nothing in this cell imputes or drops.
null_count = data.isnull().sum().sum()
duplicate_count = data.duplicated().sum()
print(f"Null values: {null_count}")
print(f"Duplicates: {duplicate_count}")

# Drop repeated rows (a no-op when the count above is zero).
data = data.drop_duplicates()
print(f"Final dataset: {data.shape}")

In [None]:
# Split the frame into the 'isFraud' label column and the predictor matrix.
y = data['isFraud']
X = data.drop(columns='isFraud')

sample_count, feature_count = X.shape
fraud_pct = y.mean()*100
print(f"Features: {feature_count}")
print(f"Samples: {sample_count}")
print(f"Fraud rate: {fraud_pct:.2f}%")

In [None]:
# Hold out 20% of the rows for testing. The split is stratified on y and
# seeded so it is reproducible.
split_options = {'test_size': 0.2, 'random_state': 42, 'stratify': y}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

train_rows = X_train.shape[0]
test_rows = X_test.shape[0]
train_fraud_pct = y_train.mean()*100
test_fraud_pct = y_test.mean()*100

print(f"Train: {train_rows} samples")
print(f"Test: {test_rows} samples")
print(f"Train fraud rate: {train_fraud_pct:.2f}%")
print(f"Test fraud rate: {test_fraud_pct:.2f}%")

In [None]:
# Standardised copies of the features for Logistic Regression. The scaler is
# fitted on the training partition only and then applied to both partitions.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)
print("Scaling completed")

In [None]:
# Oversample the training partition with SMOTE; the resampled copy is what the
# Random Forest cells train on. The test partition is left untouched.
smote = SMOTE(random_state=42)
resampled_pair = smote.fit_resample(X_train, y_train)
X_train_smote, y_train_smote = resampled_pair

smote_rows = X_train_smote.shape[0]
smote_fraud_pct = y_train_smote.mean()*100
print(f"SMOTE samples: {smote_rows}")
print(f"SMOTE fraud rate: {smote_fraud_pct:.2f}%")

## Phase 2: Base Model Training

In [None]:
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression

from sklearn.metrics import (
    roc_auc_score, f1_score, precision_score, 
    recall_score, confusion_matrix, classification_report
)

print("Model libraries imported")

In [None]:
def evaluate_model(name, y_true, y_pred, y_pred_proba):
    """Compute, print and return the headline classification metrics.

    Args:
        name: Label printed above the metrics and stored under ``'Model'``.
        y_true: Ground-truth labels.
        y_pred: Hard class predictions; used for F1, precision and recall.
        y_pred_proba: Positive-class scores; used for ROC-AUC.

    Returns:
        dict with the keys ``'Model'``, ``'ROC-AUC'``, ``'F1-Score'``,
        ``'Precision'`` and ``'Recall'``.
    """
    results = {
        'Model': name,
        'ROC-AUC': roc_auc_score(y_true, y_pred_proba),
        # zero_division=0 is passed to all three threshold metrics so that a
        # model predicting no positives is scored 0 the same way everywhere.
        'F1-Score': f1_score(y_true, y_pred, zero_division=0),
        'Precision': precision_score(y_true, y_pred, zero_division=0),
        'Recall': recall_score(y_true, y_pred, zero_division=0)
    }
    print(f"\n{name}")
    # Report in a fixed order, four decimals each.
    for metric in ('ROC-AUC', 'F1-Score', 'Precision', 'Recall'):
        print(f"{metric}: {results[metric]:.4f}")
    return results

In [None]:
# Ratio of label-0 to label-1 rows in the training partition, passed to the
# XGBoost classifiers as scale_pos_weight.
negative_count = (y_train == 0).sum()
positive_count = (y_train == 1).sum()
scale_pos_weight = negative_count / positive_count
print(f"Scale pos weight: {scale_pos_weight:.2f}")

In [None]:
# One metrics dict per baseline model is appended here by the cells below.
baseline_results = []

# XGBoost baseline: library defaults plus scale_pos_weight.
print("Training XGBoost...")
xgb_base = XGBClassifier(
    scale_pos_weight=scale_pos_weight,
    random_state=42,
    eval_metric='logloss',
)
xgb_base.fit(X_train, y_train)
y_pred_proba_xgb = xgb_base.predict_proba(X_test)[:, 1]  # positive-class column
y_pred_xgb = xgb_base.predict(X_test)
xgb_base_metrics = evaluate_model('XGBoost', y_test, y_pred_xgb, y_pred_proba_xgb)
baseline_results.append(xgb_base_metrics)

In [None]:
# CatBoost baseline: library defaults with auto_class_weights='Balanced'.
print("Training CatBoost...")
cat_base = CatBoostClassifier(
    auto_class_weights='Balanced',
    random_state=42,
    verbose=False,
)
cat_base.fit(X_train, y_train)
y_pred_proba_cat = cat_base.predict_proba(X_test)[:, 1]  # positive-class column
y_pred_cat = cat_base.predict(X_test)
cat_base_metrics = evaluate_model('CatBoost', y_test, y_pred_cat, y_pred_proba_cat)
baseline_results.append(cat_base_metrics)

In [None]:
# LightGBM baseline: library defaults with class_weight='balanced'.
print("Training LightGBM...")
lgbm_base = LGBMClassifier(
    class_weight='balanced',
    random_state=42,
    verbose=-1,
)
lgbm_base.fit(X_train, y_train)
y_pred_proba_lgbm = lgbm_base.predict_proba(X_test)[:, 1]  # positive-class column
y_pred_lgbm = lgbm_base.predict(X_test)
lgbm_base_metrics = evaluate_model('LightGBM', y_test, y_pred_lgbm, y_pred_proba_lgbm)
baseline_results.append(lgbm_base_metrics)

In [None]:
# Random Forest baseline: trained on the SMOTE-resampled training data and
# evaluated on the original (non-resampled) test partition.
print("Training Random Forest...")
rf_base = RandomForestClassifier(
    class_weight='balanced',
    random_state=42,
    n_jobs=-1,
)
rf_base.fit(X_train_smote, y_train_smote)
y_pred_proba_rf = rf_base.predict_proba(X_test)[:, 1]  # positive-class column
y_pred_rf = rf_base.predict(X_test)
rf_base_metrics = evaluate_model('RandomForest', y_test, y_pred_rf, y_pred_proba_rf)
baseline_results.append(rf_base_metrics)

In [None]:
# Logistic Regression baseline: trained and evaluated on the standardised
# feature copies, with class_weight='balanced'.
print("Training Logistic Regression...")
lr_base = LogisticRegression(
    class_weight='balanced',
    max_iter=1000,
    random_state=42,
)
lr_base.fit(X_train_scaled, y_train)
y_pred_proba_lr = lr_base.predict_proba(X_test_scaled)[:, 1]  # positive-class column
y_pred_lr = lr_base.predict(X_test_scaled)
lr_base_metrics = evaluate_model('LogisticRegression', y_test, y_pred_lr, y_pred_proba_lr)
baseline_results.append(lr_base_metrics)

In [None]:
# Tabulate the baseline metrics, one row per model, and print without the index.
baseline_df = pd.DataFrame(baseline_results)
baseline_table = baseline_df.to_string(index=False)
print("\nBaseline Results:", baseline_table, sep="\n")

## Phase 3: Hyperparameter Tuning

In [None]:
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
print("Search libraries imported")

In [None]:
# Randomised search for XGBoost: 20 sampled configurations, 3-fold CV, F1 scoring.
print("Tuning XGBoost...")
xgb_params = {
    'max_depth': [4, 6, 8],
    'learning_rate': [0.01, 0.1, 0.2],
    'n_estimators': [200, 400, 600],
    'subsample': [0.6, 0.8, 1.0],
    'colsample_bytree': [0.6, 0.8, 1.0],
}

# Same fixed settings as the XGBoost baseline; only xgb_params vary.
xgb_search_estimator = XGBClassifier(
    scale_pos_weight=scale_pos_weight,
    random_state=42,
    eval_metric='logloss',
)
xgb_search = RandomizedSearchCV(
    estimator=xgb_search_estimator,
    param_distributions=xgb_params,
    n_iter=20,
    scoring='f1',
    cv=3,
    random_state=42,
    n_jobs=-1,
)
xgb_search.fit(X_train, y_train)

# best_estimator_ is the winning configuration refitted on the full training data.
xgb_tuned = xgb_search.best_estimator_
print(f"Best XGBoost params: {xgb_search.best_params_}")

In [None]:
# Randomised search for CatBoost: 15 sampled configurations, 3-fold CV, F1 scoring.
print("Tuning CatBoost...")
cat_params = {
    'depth': [4, 6, 8, 10],
    'iterations': [500, 1000],
    'learning_rate': [0.03, 0.1],
    'l2_leaf_reg': [1, 3, 5],
}

# Same fixed settings as the CatBoost baseline; only cat_params vary.
cat_search_estimator = CatBoostClassifier(
    auto_class_weights='Balanced',
    random_state=42,
    verbose=False,
)
cat_search = RandomizedSearchCV(
    estimator=cat_search_estimator,
    param_distributions=cat_params,
    n_iter=15,
    scoring='f1',
    cv=3,
    random_state=42,
    n_jobs=-1,
)
cat_search.fit(X_train, y_train)

# best_estimator_ is the winning configuration refitted on the full training data.
cat_tuned = cat_search.best_estimator_
print(f"Best CatBoost params: {cat_search.best_params_}")

In [None]:
# LightGBM tuning: randomised search, 15 sampled configurations, 3-fold CV,
# F1 scoring.
print("Tuning LightGBM...")
lgbm_params = {
    'num_leaves': [31, 63],
    'learning_rate': [0.05, 0.1],
    'n_estimators': [200, 400],
    'subsample': [0.7, 0.9],
    'colsample_bytree': [0.7, 0.9]
}

lgbm_search = RandomizedSearchCV(
    LGBMClassifier(
        class_weight='balanced',
        # LightGBM only applies row bagging when subsample_freq > 0 (its
        # default is 0). Without this the 'subsample' values searched above
        # have no effect and half of the search space is duplicated.
        subsample_freq=1,
        random_state=42,
        verbose=-1,
    ),
    lgbm_params,
    n_iter=15,
    scoring='f1',
    cv=3,
    random_state=42,
    n_jobs=-1
)
lgbm_search.fit(X_train, y_train)
print(f"Best LightGBM params: {lgbm_search.best_params_}")
# best_estimator_ is the winning configuration refitted on the full training data.
lgbm_tuned = lgbm_search.best_estimator_

In [None]:
# Random Forest tuning: randomised search, 15 sampled configurations, 3-fold
# CV, F1 scoring, with SMOTE applied inside each CV split.
from imblearn.pipeline import Pipeline as ImbPipeline

print("Tuning Random Forest...")
rf_params = {
    'n_estimators': [100, 300, 500],
    'max_depth': [10, 20, 30],
    'min_samples_split': [2, 5, 10]
}

# SMOTE is a pipeline step so that every CV split oversamples only its own
# training folds and is scored on untouched validation rows. Running the
# search on the pre-resampled X_train_smote would place synthetic rows,
# interpolated from neighbours in other folds, into the validation folds and
# inflate the cross-validated F1 used to pick the hyperparameters.
rf_pipeline = ImbPipeline([
    ('smote', SMOTE(random_state=42)),
    ('rf', RandomForestClassifier(class_weight='balanced', random_state=42, n_jobs=-1)),
])

rf_search = RandomizedSearchCV(
    rf_pipeline,
    # Pipeline parameters are addressed as '<step>__<param>'.
    {f'rf__{param}': values for param, values in rf_params.items()},
    n_iter=15,
    scoring='f1',
    cv=3,
    random_state=42,
    n_jobs=-1
)
rf_search.fit(X_train, y_train)
# NOTE: the keys of best_params_ now carry the 'rf__' step prefix.
print(f"Best RF params: {rf_search.best_params_}")
# The refit pipeline oversamples the full training partition with the same
# SMOTE settings before fitting. Expose the fitted forest itself so rf_tuned
# remains a plain RandomForestClassifier for prediction and pickling.
rf_tuned = rf_search.best_estimator_.named_steps['rf']

In [None]:
# Exhaustive grid over the regularisation strength C for Logistic Regression,
# 3-fold CV, F1 scoring. Penalty and solver are single-valued grid entries.
print("Tuning Logistic Regression...")
lr_params = {
    'C': [0.001, 0.01, 0.1, 1, 10],
    'penalty': ['l2'],
    'solver': ['lbfgs'],
}

# Same fixed settings as the Logistic Regression baseline.
lr_search_estimator = LogisticRegression(
    class_weight='balanced',
    max_iter=1000,
    random_state=42,
)
lr_search = GridSearchCV(
    estimator=lr_search_estimator,
    param_grid=lr_params,
    scoring='f1',
    cv=3,
    n_jobs=-1,
)
# Fitted on the standardised training features.
lr_search.fit(X_train_scaled, y_train)

# best_estimator_ is the winning configuration refitted on the full training data.
lr_tuned = lr_search.best_estimator_
print(f"Best LR params: {lr_search.best_params_}")

## Phase 4: Post-Tuning Evaluation

In [None]:
# Hard predictions and positive-class probabilities of every tuned model on
# the test partition. Logistic Regression uses the standardised test features.
y_pred_xgb_t = xgb_tuned.predict(X_test)
y_pred_proba_xgb_t = xgb_tuned.predict_proba(X_test)[:, 1]

y_pred_cat_t = cat_tuned.predict(X_test)
y_pred_proba_cat_t = cat_tuned.predict_proba(X_test)[:, 1]

y_pred_lgbm_t = lgbm_tuned.predict(X_test)
y_pred_proba_lgbm_t = lgbm_tuned.predict_proba(X_test)[:, 1]

y_pred_rf_t = rf_tuned.predict(X_test)
y_pred_proba_rf_t = rf_tuned.predict_proba(X_test)[:, 1]

y_pred_lr_t = lr_tuned.predict(X_test_scaled)
y_pred_proba_lr_t = lr_tuned.predict_proba(X_test_scaled)[:, 1]

# Score (and print) the models in a fixed order; one metrics dict per model.
tuned_results = [
    evaluate_model(label, y_test, hard_pred, positive_proba)
    for label, hard_pred, positive_proba in (
        ('XGBoost_Tuned', y_pred_xgb_t, y_pred_proba_xgb_t),
        ('CatBoost_Tuned', y_pred_cat_t, y_pred_proba_cat_t),
        ('LightGBM_Tuned', y_pred_lgbm_t, y_pred_proba_lgbm_t),
        ('RandomForest_Tuned', y_pred_rf_t, y_pred_proba_rf_t),
        ('LogisticRegression_Tuned', y_pred_lr_t, y_pred_proba_lr_t),
    )
]

In [None]:
# Tabulate the tuned-model metrics, one row per model, and print without the index.
tuned_df = pd.DataFrame(tuned_results)
tuned_table = tuned_df.to_string(index=False)
print("\nTuned Results:", tuned_table, sep="\n")

In [None]:
# Compare baseline vs tuned: stack both metric tables and write them to CSV.
import os

comparison = pd.concat([baseline_df, tuned_df], ignore_index=True)
# to_csv does not create missing parent directories; make sure 'results/'
# exists so a fresh checkout does not fail after all models have been trained.
os.makedirs('results', exist_ok=True)
comparison.to_csv('results/baseline_vs_tuned.csv', index=False)
print("\nComparison saved to results/baseline_vs_tuned.csv")

## Phase 5: Threshold Optimization

In [None]:
from sklearn.metrics import precision_recall_curve

def find_best_threshold(y_true, y_pred_proba):
    """Return the decision threshold that maximises F1, and that F1 value.

    Args:
        y_true: Ground-truth binary labels.
        y_pred_proba: Positive-class scores for the same samples.

    Returns:
        Tuple ``(best_threshold, best_f1)`` taken from the precision-recall
        curve. On ties the first maximal point of the curve is returned.
    """
    precision, recall, thresholds = precision_recall_curve(y_true, y_pred_proba)

    # precision and recall have one more entry than thresholds: the final
    # (precision=1, recall=0) point has no threshold and an F1 of 0. Dropping
    # it keeps the arrays aligned, so the argmax below is always a valid index
    # into thresholds and no fallback value is needed.
    precision = precision[:-1]
    recall = recall[:-1]

    # Exact F1, defined as 0 where precision + recall is 0, instead of adding
    # a small constant to the denominator.
    denominator = precision + recall
    f1_scores = np.divide(
        2 * precision * recall,
        denominator,
        out=np.zeros_like(denominator, dtype=float),
        where=denominator > 0,
    )

    best_idx = np.argmax(f1_scores)
    return thresholds[best_idx], f1_scores[best_idx]

In [None]:
# Positive-class test probabilities of each tuned model, keyed by display name.
models_proba = [
    ('XGBoost', y_pred_proba_xgb_t),
    ('CatBoost', y_pred_proba_cat_t),
    ('LightGBM', y_pred_proba_lgbm_t),
    ('RandomForest', y_pred_proba_rf_t),
    ('LogisticRegression', y_pred_proba_lr_t),
]

# Best-F1 threshold per model, stored as plain floats.
# NOTE(review): the thresholds are selected against y_test, so the F1 values
# printed here are computed on the same labels that were used to choose them.
# Consider selecting thresholds on a separate validation split.
thresholds = {}
for model_name, test_proba in models_proba:
    threshold_value, f1_value = find_best_threshold(y_test, test_proba)
    thresholds[model_name] = {
        'threshold': float(threshold_value),
        'f1_score': float(f1_value),
    }
    print(f"{model_name}: Threshold={threshold_value:.3f}, F1={f1_value:.4f}")

## Phase 6: Save Final Models

In [None]:
import os
import pickle
import json
from datetime import datetime

# open(..., 'wb') does not create missing parent directories; make sure the
# output directory exists before anything is written into it.
os.makedirs('new_models', exist_ok=True)

# Save models: one pickle per tuned estimator, named after its dict key.
models = {
    'xgboost': xgb_tuned,
    'catboost': cat_tuned,
    'lightgbm': lgbm_tuned,
    'randomforest': rf_tuned,
    'logistic_regression': lr_tuned
}

for name, model in models.items():
    with open(f'new_models/{name}_model.pkl', 'wb') as f:
        pickle.dump(model, f)
    print(f"Saved {name}_model.pkl")

# Save scaler: the fitted StandardScaler used for the Logistic Regression inputs.
with open('new_models/scaler.pkl', 'wb') as f:
    pickle.dump(scaler, f)
print("Saved scaler.pkl")

In [None]:
# Winning hyperparameters of each search, keyed like the saved model files.
search_by_model = {
    'xgboost': xgb_search,
    'catboost': cat_search,
    'lightgbm': lgbm_search,
    'randomforest': rf_search,
    'logistic_regression': lr_search,
}
best_params_by_model = {
    model_key: search.best_params_ for model_key, search in search_by_model.items()
}

# Run summary: timestamp, dataset sizes, overall fraud rate, the per-model
# thresholds and the tuned hyperparameters.
metadata = {
    'date': datetime.now().isoformat(),
    'n_features': X.shape[1],
    'n_train_samples': X_train.shape[0],
    'n_test_samples': X_test.shape[0],
    'fraud_rate': float(y.mean()),
    'thresholds': thresholds,
    'best_params': best_params_by_model,
}

with open('new_models/metadata.json', 'w') as metadata_file:
    json.dump(metadata, metadata_file, indent=2)
print("Saved metadata.json")

In [None]:
# Persist the feature column names, in the column order of X.
feature_list = X.columns.tolist()
feature_payload = {'features': feature_list}
with open('new_models/features.json', 'w') as features_file:
    json.dump(feature_payload, features_file, indent=2)
feature_total = len(feature_list)
print(f"Saved features.json ({feature_total} features)")

## Phase 7: Visualization and Reporting

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import roc_curve
sns.set_style('whitegrid')

In [None]:
# Feature importance for XGBoost: the 20 largest importances of the tuned
# model, plotted as a horizontal bar chart.
import os

feature_importance = pd.DataFrame({
    'feature': X.columns,
    'importance': xgb_tuned.feature_importances_
}).sort_values('importance', ascending=False).head(20)

# savefig does not create missing parent directories; make sure the output
# directory exists before the first figure is written.
os.makedirs('visualizations', exist_ok=True)

plt.figure(figsize=(10, 6))
plt.barh(feature_importance['feature'], feature_importance['importance'])
# barh draws the first row at the bottom; invert the axis so the most
# important feature is at the top of the chart.
plt.gca().invert_yaxis()
plt.xlabel('Importance')
plt.title('Top 20 Features - XGBoost')
plt.tight_layout()
plt.savefig('visualizations/feature_importance.png', dpi=300)
plt.close()
print("Saved feature_importance.png")

In [None]:
# Confusion matrices of the tuned models' hard test predictions, drawn on a
# 2x3 grid (five panels used, the sixth hidden).
models_pred = [
    ('XGBoost', y_pred_xgb_t),
    ('CatBoost', y_pred_cat_t),
    ('LightGBM', y_pred_lgbm_t),
    ('RandomForest', y_pred_rf_t),
    ('LogisticRegression', y_pred_lr_t),
]

fig, axes = plt.subplots(2, 3, figsize=(15, 10))

# axes.flat walks the grid row by row; zip stops after the five models.
for panel, (model_name, hard_pred) in zip(axes.flat, models_pred):
    matrix = confusion_matrix(y_test, hard_pred)
    sns.heatmap(matrix, annot=True, fmt='d', cmap='Blues', ax=panel)
    panel.set_title(model_name)
    panel.set_xlabel('Predicted')
    panel.set_ylabel('Actual')

# Hide the unused bottom-right panel.
axes[1, 2].axis('off')
plt.tight_layout()
plt.savefig('visualizations/confusion_matrices.png', dpi=300)
plt.close()
print("Saved confusion_matrices.png")

In [None]:
# Side-by-side bar charts of three tuned-model metrics, one panel per metric.
comparison_plot = tuned_df.set_index('Model')
fig, axes = plt.subplots(1, 3, figsize=(15, 4))

# (metric column, bar colour) for each panel, left to right.
panel_specs = (
    ('F1-Score', 'skyblue'),
    ('Recall', 'lightgreen'),
    ('ROC-AUC', 'coral'),
)
for panel, (metric, bar_color) in zip(axes, panel_specs):
    comparison_plot[metric].plot(kind='bar', ax=panel, color=bar_color)
    panel.set_title(f'{metric} Comparison')
    panel.set_ylabel(metric)
    # Slant the model names so they do not overlap.
    panel.set_xticklabels(panel.get_xticklabels(), rotation=45, ha='right')

plt.tight_layout()
plt.savefig('visualizations/model_comparison.png', dpi=300)
plt.close()
print("Saved model_comparison.png")

In [None]:
# ROC curve of every tuned model on the test partition, on shared axes, with
# each model's AUC in the legend.
roc_fig, roc_ax = plt.subplots(figsize=(10, 8))

for model_name, test_proba in models_proba:
    false_pos_rate, true_pos_rate, _ = roc_curve(y_test, test_proba)
    model_auc = roc_auc_score(y_test, test_proba)
    roc_ax.plot(false_pos_rate, true_pos_rate, label=f'{model_name} (AUC={model_auc:.3f})')

# Diagonal reference line, labelled 'Random' in the legend.
roc_ax.plot([0, 1], [0, 1], 'k--', label='Random')
roc_ax.set_xlabel('False Positive Rate')
roc_ax.set_ylabel('True Positive Rate')
roc_ax.set_title('ROC Curves - All Models')
roc_ax.legend()
roc_ax.grid(True, alpha=0.3)
roc_fig.tight_layout()
roc_fig.savefig('visualizations/roc_curves.png', dpi=300)
plt.close(roc_fig)
print("Saved roc_curves.png")

In [None]:
# Closing banner listing the artefacts written by the notebook.
banner = "="*50
summary_lines = [
    "\n" + banner,
    "PIPELINE COMPLETE",
    banner,
    "\nSaved:",
    "- 5 trained models (new_models/)",
    "- Scaler (new_models/)",
    "- Metadata with thresholds (new_models/)",
    "- Feature list (new_models/)",
    "- Results CSV (results/)",
    "- 4 visualizations (visualizations/)",
    "\nReady for deployment!",
]
print("\n".join(summary_lines))