In [1]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import RandomizedSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from xgboost import XGBClassifier
import numpy as np
import pandas as pd

# Positive-class weight handed to XGBoost to counter class imbalance.
# Hard-coded here; per the original note it was calculated in an earlier step
# ("Step 14") that is not part of this section.
scale_pos_weight = 777.64

# --- 1. Define Preprocessing Pipeline (same as before) ---
# Numeric columns are standardised; the single categorical column is one-hot encoded.
numeric_features = [
    "amount", "oldbalanceOrg", "newbalanceOrig", "oldbalanceDest", "newbalanceDest",
    "balanceDiffOrig", "balanceDiffDest", "is_merchant", "Orig_Count_1step"
]
categorical_features = ["type"]

preprocessor = ColumnTransformer(
    transformers=[
        ("num", StandardScaler(), numeric_features),
        ("cat", OneHotEncoder(handle_unknown='ignore', drop='first'), categorical_features)
    ],
    remainder="drop"  # any column not listed above is discarded
)

# --- 2. Define Base Model Components ---

# A. XGBoost (Gradient Boosting - Non-linear)
# `use_label_encoder` is intentionally not passed: the flag is deprecated in
# XGBoost and only produces an "unused parameter" warning on current releases.
# The tuning pipeline defined later in this notebook omits it as well.
xgb_clf = XGBClassifier(
    objective='binary:logistic',
    random_state=42,
    eval_metric='logloss',
    scale_pos_weight=scale_pos_weight
)

# B. Random Forest (Bagging Ensemble - Non-linear, lower variance)
# class_weight='balanced' plays the role that scale_pos_weight plays for XGBoost.
rf_clf = RandomForestClassifier(
    random_state=42,
    class_weight='balanced'
)

# C. Logistic Regression (Linear Model - Fast, gives a linear view)
lr_clf = LogisticRegression(
    random_state=42,
    class_weight='balanced',
    solver='saga',
    max_iter=500
)


print("✅ Ensemble model components defined and configured with imbalance weights.")

✅ Ensemble model components defined and configured with imbalance weights.


### Hyperparameter Tuning (XGBoost)

In [4]:
import pandas as pd
from sklearn.model_selection import train_test_split

# --- 1. Load the Sampled Data and Consolidate Feature Engineering ---

# Raw string: the Windows path is full of backslashes, and in a normal string
# sequences such as "\F" or "\D" are invalid escapes (a warning today, and a
# silent corruption the day a folder name starts with "n" or "t").
data_path = r"E:\FraudPulse\Data\AIML_Sample_10Pct.csv"
try:
    df = pd.read_csv(data_path)
except FileNotFoundError as exc:
    # Stop the cell with a clear error instead of calling exit(), so none of the
    # statements below ever run against a missing `df`.
    raise FileNotFoundError(
        f"AIML_Sample_10Pct.csv not found at {data_path!r}. Please check file path."
    ) from exc

# 2. Calculate Core Engineered Features (Balance Differences)
df["balanceDiffOrig"] = df["oldbalanceOrg"] - df["newbalanceOrig"]
df["balanceDiffDest"] = df["newbalanceDest"] - df["oldbalanceDest"]

# 3. Create the 'is_merchant' Behavioral Feature
# 1 when the destination account name starts with 'M', else 0.
df["is_merchant"] = df["nameDest"].str.startswith('M').astype(int)

# 4. Create the simplified velocity feature (Orig_Count_1step)
# For each row: how many OTHER rows share the same (nameOrig, step) pair.
# transform("count") broadcasts the per-group count back onto every row, which
# replaces the former groupby -> rename -> merge round trip and its temp column.
df["Orig_Count_1step"] = (
    df.groupby(["nameOrig", "step"])["amount"].transform("count") - 1
)

# 5. Final Data Cleanup
# Identifier / helper columns are not model inputs.
df = df.drop(columns=['nameOrig', 'nameDest', 'isFlaggedFraud', 'step'])

# --- 6. Define Data for Training ---
X = df.drop("isFraud", axis=1)
y = df["isFraud"]

# 7. Define Data Splits
# 70/30 split, stratified on the label so both parts keep the same fraud rate.
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.3,
    stratify=y,
    random_state=42
)
print("✅ Data loading, feature engineering, and splitting complete.")

✅ Data loading, feature engineering, and splitting complete.


### Execute Randomized Search

Next, we run the code cells that define the XGBoost pipeline, the parameter distributions, and the scoring metric, and then execute the Randomized Search.

In [6]:
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from xgboost import XGBClassifier
import numpy as np

# --- 1. Imbalance weight for the positive (fraud) class ---
# Fixed value carried over from an earlier calculation.
scale_pos_weight = 777.64

# --- 2. Column-wise preprocessing ---
numeric_features = [
    "amount",
    "oldbalanceOrg",
    "newbalanceOrig",
    "oldbalanceDest",
    "newbalanceDest",
    "balanceDiffOrig",
    "balanceDiffDest",
    "is_merchant",
    "Orig_Count_1step",
]
categorical_features = ["type"]

# Scale the numeric columns, one-hot encode the categorical one, and discard
# every column that is not explicitly listed.
preprocessor = ColumnTransformer(
    remainder="drop",
    transformers=[
        ("num", StandardScaler(), numeric_features),
        (
            "cat",
            OneHotEncoder(drop="first", handle_unknown="ignore"),
            categorical_features,
        ),
    ],
)

# --- 3. Preprocessing + XGBoost in a single estimator ---
# The step names ("preprocessor", "classifier") matter: the hyperparameter
# search addresses the model through the "classifier__" prefix.
xgb_pipeline = Pipeline(
    steps=[
        ("preprocessor", preprocessor),
        (
            "classifier",
            XGBClassifier(
                n_estimators=100,
                max_depth=5,
                learning_rate=0.1,
                scale_pos_weight=scale_pos_weight,  # counteract class imbalance
                objective="binary:logistic",
                eval_metric="logloss",
                random_state=42,
            ),
        ),
    ]
)

print("✅ Preprocessor and XGBoost Pipeline objects defined.")

✅ Preprocessor and XGBoost Pipeline objects defined.


In [8]:
from scipy.stats import uniform, randint
from sklearn.metrics import recall_score, make_scorer

# --- 1. Search space for the XGBoost step of the pipeline ---
# scipy conventions: randint(low, high) never returns `high`, and
# uniform(loc, scale) spans [loc, loc + scale]. Each name is prefixed with
# "classifier__" so it is routed to the pipeline step called "classifier".
param_dist = {
    f"classifier__{hyperparameter}": distribution
    for hyperparameter, distribution in (
        ("n_estimators", randint(100, 500)),      # number of trees: 100..499
        ("learning_rate", uniform(0.01, 0.19)),   # learning rate: 0.01..0.20
        ("max_depth", randint(3, 8)),             # tree depth: 3..7
        ("colsample_bytree", uniform(0.6, 0.4)),  # column fraction per tree: 0.6..1.0
    )
}

# --- 2. Scoring metric used to rank candidates ---
# Recall on the positive class: candidates are ranked by how much fraud they catch.
scorer = make_scorer(recall_score)

print("✅ Parameter grid (param_dist) and scoring metric (scorer) defined.")

✅ Parameter grid (param_dist) and scoring metric (scorer) defined.


In [9]:
# Relies on names created in earlier cells: RandomizedSearchCV (imports),
# xgb_pipeline, param_dist, scorer, X_train and y_train.

# 3. Configure the search: 10 randomly drawn candidates, each scored with
# 3-fold cross-validation, using every available core.
random_search = RandomizedSearchCV(
    estimator=xgb_pipeline,
    param_distributions=param_dist,
    scoring=scorer,
    n_iter=10,
    cv=3,
    n_jobs=-1,
    random_state=42,
    verbose=1,
)

# 4. Fit every candidate (this is the slow part).
print("\n--- Starting Randomized Search (Tuning XGBoost) ---")
random_search.fit(X_train, y_train)

# --- 5. Report the winning configuration ---
print("\n--- Best Parameters Found ---", random_search.best_params_, sep="\n")


--- Starting Randomized Search (Tuning XGBoost) ---
Fitting 3 folds for each of 10 candidates, totalling 30 fits

--- Best Parameters Found ---
{'classifier__colsample_bytree': np.float64(0.8832290311184181), 'classifier__learning_rate': np.float64(0.013911053916202464), 'classifier__max_depth': 4, 'classifier__n_estimators': 443}


### Define Final Optimized Ensemble Models

We redefine the models using the best parameters found and then define the Stacking Classifier pipeline.

In [10]:
from sklearn.ensemble import StackingClassifier, RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.metrics import recall_score, make_scorer

# Positive-class weight. Only XGBoost consumes it; the Random Forest and the
# Logistic Regression handle imbalance through class_weight='balanced'.
scale_pos_weight = 777.64

# --- 1. Define Optimized Base Models ---

# A. XGBoost (Optimized)
# Hyperparameters are the Randomized Search winners recorded above, rounded.
# `use_label_encoder` is intentionally not passed: the flag is deprecated in
# XGBoost and only produces an "unused parameter" warning on current releases.
xgb_optimized = XGBClassifier(
    objective='binary:logistic',
    random_state=42,
    eval_metric='logloss',
    scale_pos_weight=scale_pos_weight,
    colsample_bytree=0.883,
    learning_rate=0.014,
    max_depth=4,
    n_estimators=443
)

# B. Random Forest (RF - Non-linear view)
rf_clf = RandomForestClassifier(
    random_state=42,
    class_weight='balanced',
    n_estimators=100, # Using default for speed
    max_depth=5
)

# C. Logistic Regression (LR - Linear view)
lr_clf = LogisticRegression(
    random_state=42,
    class_weight='balanced',
    solver='saga',
    max_iter=500
)

# Define the base models for stacking
estimators = [
    ('xgb', xgb_optimized),
    ('rf', rf_clf),
    ('lr', lr_clf)
]

# --- 2. Define Final Stacking Pipeline ---
# Use Logistic Regression as the Meta-Model to combine the outputs.
# The meta-model is seeded like every other estimator here: 'saga' is a
# stochastic solver, so without random_state repeated runs can differ.
stack_clf = StackingClassifier(
    estimators=estimators,
    final_estimator=LogisticRegression(solver='saga', max_iter=500, random_state=42),
    cv=3, # Use 3-fold cross-validation in the stacking process
    n_jobs=-1
)

# The full pipeline containing the preprocessor and the Stacking Classifier.
# `preprocessor` is the ColumnTransformer defined in an earlier cell.
final_ensemble_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', stack_clf)
])

print("✅ Final Ensemble Pipeline (Stacking) defined.")

✅ Final Ensemble Pipeline (Stacking) defined.


### Train and Evaluate the Ensemble

* Run the following code block, which trains the stacking ensemble and reports its final performance metrics (classification report and confusion matrix) on the test set.

* This step involves training three powerful base models (XGBoost, Random Forest, Logistic Regression) and a meta-model, which will take longer than the single XGBoost model did.

In [11]:
# Fit the complete stacking pipeline on the training split. All base models
# and the meta-model are trained, so this is slower than a single XGBoost fit.
print("\n--- Starting Final Ensemble Training ---")
final_ensemble_pipeline.fit(X_train, y_train)

# --- Hold-out evaluation ---
# Hard class predictions for the test split, then the two summary views.
y_pred_ensemble = final_ensemble_pipeline.predict(X_test)

print(
    "\n--- Stacking Ensemble Classification Report ---",
    classification_report(y_test, y_pred_ensemble),
    sep="\n",
)

print(
    "\n--- Stacking Ensemble Confusion Matrix ---",
    confusion_matrix(y_test, y_pred_ensemble),
    sep="\n",
)


--- Starting Final Ensemble Training ---

--- Stacking Ensemble Classification Report ---
              precision    recall  f1-score   support

           0       1.00      1.00      1.00    190634
           1       0.89      0.80      0.84       245

    accuracy                           1.00    190879
   macro avg       0.95      0.90      0.92    190879
weighted avg       1.00      1.00      1.00    190879


--- Stacking Ensemble Confusion Matrix ---
[[190611     23]
 [    50    195]]
