In [1]:
# MLOps Stage: Training the Robust Model
# This code integrates all feature engineering, scaling, and XGBoost training.

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from xgboost import XGBClassifier
from sklearn.metrics import classification_report, confusion_matrix
import numpy as np

# --- I. Data Loading & Feature Engineering (The Robust Recipe) ---

# Load the Sampled Data (Ensure this file is in the same directory as your notebook)
df = pd.read_csv("E:\FraudPulse\Data\AIML_Sample_10Pct.csv") 

# 1. Calculate Core Engineered Features (Balance Differences)
df["balanceDiffOrig"] = df["oldbalanceOrg"] - df["newbalanceOrig"]
df["balanceDiffDest"] = df["newbalanceDest"] - df["oldbalanceDest"]

# 2. Create the 'is_merchant' Behavioral Feature
df["is_merchant"] = df["nameDest"].str.startswith('M').astype(int)

# 3. Create the simplified velocity feature (Orig_Count_1step)
# This feature captures the number of transactions by the same user in the same hour (step - 1)
count_by_user_step = df.groupby(['nameOrig', 'step'])['amount'].count().reset_index()
count_by_user_step.rename(columns={'amount': 'Orig_Count_1step_Total'}, inplace=True)

df = df.merge(count_by_user_step, on=['nameOrig', 'step'], how='left')
df['Orig_Count_1step'] = df['Orig_Count_1step_Total'] - 1

# 4. Final Data Cleanup (Remove redundant/non-numeric columns)
df = df.drop(columns=['nameOrig', 'nameDest', 'isFlaggedFraud', 'step', 'Orig_Count_1step_Total']) 

# --- II. Model Training Setup ---

X = df.drop("isFraud", axis=1)
y = df["isFraud"]

# 5. Data Splitting (Stratify handles the severe class imbalance)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, 
    test_size=0.3, 
    stratify=y, 
    random_state=42
)

# 6. Calculate Imbalance Weight (The key to fixing 0.02 Precision)
fraud_count = y_train.value_counts()[1]
non_fraud_count = y_train.value_counts()[0]
scale_pos_weight = non_fraud_count / fraud_count
print(f"Calculated scale_pos_weight for XGBoost: {scale_pos_weight:.2f}")

# 7. Define Preprocessing Pipeline (Imputation/Scaling/Encoding)
numeric_features = [
    "amount", "oldbalanceOrg", "newbalanceOrig", "oldbalanceDest", "newbalanceDest",
    "balanceDiffOrig", "balanceDiffDest", "is_merchant", "Orig_Count_1step"
]
categorical_features = ["type"]

preprocessor = ColumnTransformer(
    transformers=[
        ("num", StandardScaler(), numeric_features),
        ("cat", OneHotEncoder(handle_unknown='ignore', drop='first'), categorical_features)
    ],
    remainder="drop"
)

# 8. Build and Train the XGBoost Pipeline
xgb_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', XGBClassifier(
        objective='binary:logistic',
        eval_metric='logloss',
        random_state=42,
        scale_pos_weight=scale_pos_weight, # Apply the imbalance weight
        n_estimators=100,
        learning_rate=0.1,
        max_depth=5 
    ))
])

# Train the model (This step takes time)
xgb_pipeline.fit(X_train, y_train)

# --- III. Evaluation ---
y_pred = xgb_pipeline.predict(X_test)

print("\n--- XGBoost Classification Report (New Features) ---")
print(classification_report(y_test, y_pred))

print("\n--- XGBoost Confusion Matrix (New Features) ---")
print(confusion_matrix(y_test, y_pred))

Calculated scale_pos_weight for XGBoost: 777.64

--- XGBoost Classification Report (New Features) ---
              precision    recall  f1-score   support

           0       1.00      1.00      1.00    190634
           1       0.22      0.96      0.36       245

    accuracy                           1.00    190879
   macro avg       0.61      0.98      0.68    190879
weighted avg       1.00      1.00      1.00    190879


--- XGBoost Confusion Matrix (New Features) ---
[[189787    847]
 [     9    236]]


In [2]:
import joblib
import os

# Define the full path, ensuring the directory exists
model_path = r"E:\FraudPulse\models"
file_name = "fraud_detection_deployment_pipeline.pkl"
full_path = os.path.join(model_path, file_name)

# Create the directory if it does not exist
os.makedirs(model_path, exist_ok=True)

# Save the entire pipeline object to the specified location
joblib.dump(xgb_pipeline, full_path)

print(f"✅ Pipeline successfully saved to: {full_path}")

✅ Pipeline successfully saved to: E:\FraudPulse\models\fraud_detection_deployment_pipeline.pkl


In [3]:
import joblib
import pandas as pd
import numpy as np

# Define the exact path where you saved the model
MODEL_PATH = r"E:\FraudPulse\models\fraud_detection_deployment_pipeline.pkl"

# --- A. Load the Saved Pipeline ---
# This loads the preprocessor (scaling/encoding/imputation) AND the trained XGBoost model.
try:
    loaded_pipeline = joblib.load(MODEL_PATH)
    print("✅ Pipeline loaded successfully for testing.")
except FileNotFoundError:
    print(f"❌ Error: Model file not found at {MODEL_PATH}")
    exit()

# --- B. Simulate a New Transaction (Raw Data Input) ---
# A new transaction comes in from the bank's system.
# Note: It MUST include all the raw features our pipeline expects.
new_transaction_data = pd.DataFrame([{
    "type": "TRANSFER",
    "amount": 250000.00,
    "oldbalanceOrg": 300000.00,
    "newbalanceOrig": 50000.00,
    "oldbalanceDest": 1000.00,
    "newbalanceDest": 251000.00,
    "isFlaggedFraud": 0, # Not used in prediction, but included in raw input
    "step": 300,
    "nameOrig": "C_TEST_SENDER",
    "nameDest": "C_TEST_RECEIVER"
}])

# --- C. Feature Engineering on the New Transaction (In Real-Time) ---
# Your real-time API must calculate these engineered features instantly.
new_transaction_data["balanceDiffOrig"] = new_transaction_data["oldbalanceOrg"] - new_transaction_data["newbalanceOrig"]
new_transaction_data["balanceDiffDest"] = new_transaction_data["newbalanceDest"] - new_transaction_data["oldbalanceDest"]
new_transaction_data["is_merchant"] = new_transaction_data["nameDest"].str.startswith('M').astype(int)

# Create the velocity feature (In a real API, this would require querying the database for past transactions)
# For this test, we'll assume no recent activity (Count = 0).
new_transaction_data['Orig_Count_1step'] = 0

# Drop columns not used by the pipeline's ColumnTransformer
X_new = new_transaction_data.drop(columns=['nameOrig', 'nameDest', 'isFlaggedFraud'])
X_new = X_new.drop(columns=['step']) # Drop raw time

# --- D. Predict ---
prediction = loaded_pipeline.predict(X_new)[0]
risk_score = loaded_pipeline.predict_proba(X_new)[0][1]

print("\n--- Prediction Output ---")
print(f"Predicted Class: {int(prediction)} (0=Safe, 1=Fraud)")
print(f"Confidence Score (Probability of Fraud): {risk_score:.4f}")

✅ Pipeline loaded successfully for testing.

--- Prediction Output ---
Predicted Class: 0 (0=Safe, 1=Fraud)
Confidence Score (Probability of Fraud): 0.0001
