In [None]:
# MLOps Stage: Training the Robust Model
# This code integrates all feature engineering, scaling, and XGBoost training.

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from xgboost import XGBClassifier
from sklearn.metrics import classification_report, confusion_matrix
import numpy as np

# --- I. Data Loading & Feature Engineering (The Robust Recipe) ---

# Load the sampled data. The path is relative, so the CSV must be in the
# notebook's working directory.
df = pd.read_csv("AIML_Sample_10Pct.csv")

# 1. Core engineered features: balance deltas.
#    balanceDiffOrig is positive when the origin balance decreased;
#    balanceDiffDest is positive when the destination balance increased.
df["balanceDiffOrig"] = df["oldbalanceOrg"] - df["newbalanceOrig"]
df["balanceDiffDest"] = df["newbalanceDest"] - df["oldbalanceDest"]

# 2. Behavioral flag: 1 when the destination ID starts with 'M', else 0.
#    (Presumably 'M' marks merchant accounts -- confirm against the dataset docs.)
df["is_merchant"] = df["nameDest"].str.startswith('M').astype(int)

# 3. Simplified velocity feature (Orig_Count_1step): how many OTHER rows share
#    this row's (nameOrig, step) pair, i.e. the group size minus the row itself.
#    `transform("count")` broadcasts each group's non-null `amount` count back
#    onto its rows, so no intermediate frame or merge is needed.
#    NOTE(review): this is computed on the full frame before the train/test
#    split, so test rows contribute to the counts seen in training rows.
df["Orig_Count_1step"] = (
    df.groupby(["nameOrig", "step"])["amount"].transform("count") - 1
)

# 4. Final cleanup: drop identifier columns and columns that are not used as
#    model inputs. `step` is only needed for the velocity feature above.
df = df.drop(columns=["nameOrig", "nameDest", "isFlaggedFraud", "step"])

# --- II. Model Training Setup ---

# Separate the features from the binary target column.
X = df.drop(columns="isFraud")
y = df["isFraud"]

# 5. Data splitting: 70% train / 30% test.
#    `stratify=y` keeps the fraud/non-fraud ratio the same in both splits; it
#    does not correct the imbalance itself (that is done via scale_pos_weight).
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.3,
    stratify=y,
    random_state=42
)

# 6. Imbalance weight for XGBoost: (#negatives / #positives), measured on the
#    training split only so the test split does not influence training.
fraud_count = int((y_train == 1).sum())
non_fraud_count = int((y_train == 0).sum())
if fraud_count == 0:
    # Fail with a clear message instead of an opaque lookup/division error.
    raise ValueError(
        "No positive (isFraud == 1) samples in the training split; "
        "cannot compute scale_pos_weight."
    )
scale_pos_weight = non_fraud_count / fraud_count
print(f"Calculated scale_pos_weight for XGBoost: {scale_pos_weight:.2f}")

# 7. Define Preprocessing (Scaling + One-Hot Encoding; no imputation is done here)
# Columns standardized by StandardScaler. Note this list includes the 0/1
# `is_merchant` flag and the integer velocity count.
numeric_features = [
    "amount", "oldbalanceOrg", "newbalanceOrig", "oldbalanceDest", "newbalanceDest",
    "balanceDiffOrig", "balanceDiffDest", "is_merchant", "Orig_Count_1step"
]
# Columns expanded into one indicator column per category.
categorical_features = ["type"]

preprocessor = ColumnTransformer(
    transformers=[
        ("num", StandardScaler(), numeric_features),
        # No `drop='first'`: with handle_unknown='ignore' an unseen category is
        # encoded as all zeros, which would be indistinguishable from the
        # dropped first category (and older scikit-learn releases reject the
        # combination). Keeping every level is also fine for a tree model.
        ("cat", OneHotEncoder(handle_unknown="ignore"), categorical_features),
    ],
    # Any column not listed above is discarded rather than passed through.
    remainder="drop"
)

# 8. Assemble the preprocessing + XGBoost pipeline, then fit it
# Hyperparameters are gathered in one mapping so they can be read at a glance.
_xgb_params = {
    "objective": "binary:logistic",
    "eval_metric": "logloss",
    "random_state": 42,
    "scale_pos_weight": scale_pos_weight,  # up-weights the positive (fraud) class
    "n_estimators": 100,
    "learning_rate": 0.1,
    "max_depth": 5,
}

# Step names ('preprocessor', 'classifier') are the keys used to address the
# stages later, so they are kept exactly as-is.
xgb_pipeline = Pipeline(
    [
        ("preprocessor", preprocessor),
        ("classifier", XGBClassifier(**_xgb_params)),
    ]
)

# Fit the preprocessor and the classifier on the training split
# (this is the slow step).
xgb_pipeline.fit(X_train, y_train)

# --- III. Evaluation ---
# Predict hard class labels for the held-out test split.
y_pred = xgb_pipeline.predict(X_test)

# Per-class precision / recall / F1 summary.
print(
    "\n--- XGBoost Classification Report (New Features) ---",
    classification_report(y_test, y_pred),
    sep="\n",
)

# Raw counts of true labels (rows) versus predicted labels (columns).
print(
    "\n--- XGBoost Confusion Matrix (New Features) ---",
    confusion_matrix(y_test, y_pred),
    sep="\n",
)