In [None]:
# -------------------- Standard Library --------------------
import os
import json
import warnings

# Suppress warnings
warnings.filterwarnings('ignore')

# -------------------- Core Scientific Libraries --------------------
import numpy as np
import pandas as pd

# -------------------- Visualization --------------------
import matplotlib.pyplot as plt
import seaborn as sns

# -------------------- Scikit-learn Core --------------------
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler, MinMaxScaler
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report, confusion_matrix, precision_recall_curve, average_precision_score

# -------------------- Gradient Boosting --------------------
import xgboost as xgb

# -------------------- Imbalanced Learning --------------------
from imblearn.pipeline import Pipeline as ImbPipeline
from imblearn.over_sampling import SMOTE, ADASYN

# -------------------- Experiment Tracking (MLflow) --------------------
import mlflow
import mlflow.sklearn
from mlflow.models.signature import infer_signature

# -------------------- Model Persistence --------------------
import joblib

In [2]:
df = pd.read_csv(r'C:\Users\Asus\Downloads\Fraud_MLOps_Project\Data\payment_fraud.csv')
df.head()

Unnamed: 0,accountAgeDays,numItems,localTime,paymentMethod,paymentMethodAgeDays,Category,isWeekend,label
0,29,1,4.745402,paypal,28.204861,shopping,0.0,0
1,725,1,4.742303,storecredit,0.0,electronics,0.0,0
2,845,1,4.921318,creditcard,0.0,food,1.0,0
3,503,1,4.886641,creditcard,0.0,electronics,1.0,0
4,2000,1,5.040929,creditcard,0.0,shopping,0.0,0


In [3]:
mlflow.set_tracking_uri("file:./mlruns")

# Model Sub Modules

In [4]:
class FeatureEngineering(BaseEstimator, TransformerMixin):
    """
    Custom feature engineering transformer.
    Controlled via steps_to_apply list:
    - 'feature_engineering': enable feature engineering
    - 'interaction': Category x PaymentMethod
    - 'ratio': paymentMethodAgeDays / accountAgeDays
    - 'binning': bins for accountAgeDays
    - 'time_feature': bins for localTime
    """
    def __init__(self, steps_to_apply=None):
        self.steps_to_apply = steps_to_apply or []

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        X = X.copy()

        # Skip entire feature engineering if 'feature_engineering' not in steps
        if 'feature_engineering' not in self.steps_to_apply:
            return X

        # Interaction Feature
        if 'interaction' in self.steps_to_apply:
            if 'Category' in X.columns and 'paymentMethod' in X.columns:
                X['Category_Payment'] = X['Category'] + '_' + X['paymentMethod']

        # Ratio Feature
        if 'ratio' in self.steps_to_apply:
            if 'paymentMethodAgeDays' in X.columns and 'accountAgeDays' in X.columns:
                X['payment_account_ratio'] = X['paymentMethodAgeDays'] / (X['accountAgeDays'] + 1)

        # Binning Feature
        if 'binning' in self.steps_to_apply:
            if 'accountAgeDays' in X.columns:
                X['account_age_bin'] = pd.cut(
                    X['accountAgeDays'],
                    bins=[0, 90, 730, 2000],
                    labels=['new', 'medium', 'old']
                )

        # Time-of-day Feature
        if 'time_feature' in self.steps_to_apply:
            if 'localTime' in X.columns:
                bins = [0.5, 1.5, 2.5, 3.5, 4.5, 5.5]
                labels = ['early_morning', 'morning', 'afternoon', 'evening', 'night']
                X['time_of_day'] = pd.cut(X['localTime'], bins=bins, labels=labels)

        return X

In [5]:
# ----------- Log Transformer -----------
class LogTransformer(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self

    def transform(self, X):
        return np.log1p(np.array(X, dtype=float))

    def get_feature_names_out(self, input_features=None):
        return input_features


# ----------- Preprocessing Class -----------
class Preprocessing(BaseEstimator, TransformerMixin):
    def __init__(self, categorical_features, skewed_features, symmetric_features,
                 steps_to_apply=None, random_state=42):
        """
        Preprocessing pipeline (impute, encoding, log, scaling).
        SMOTE/ADASYN will be applied separately after this step.
        """
        self.categorical_features = categorical_features
        self.skewed_features = skewed_features
        self.symmetric_features = symmetric_features
        self.steps_to_apply = steps_to_apply or []
        self.random_state = random_state

        self.preprocessor = None  # ColumnTransformer will be built dynamically

    def _build_pipeline(self):
        """Build column transformer dynamically based on steps_to_apply."""

        # ----- Categorical pipeline -----
        cat_steps = []
        if 'impute' in self.steps_to_apply or 'preprocessing' in self.steps_to_apply:
            cat_steps.append(('imputer', SimpleImputer(strategy='most_frequent')))
        if 'encoding' in self.steps_to_apply or 'preprocessing' in self.steps_to_apply:
            cat_steps.append(('encoder', OneHotEncoder(handle_unknown='ignore', drop='first')))
        cat_pipeline = Pipeline(cat_steps) if cat_steps else 'passthrough'

        # ----- Skewed numerical pipeline -----
        skewed_steps = []
        if 'impute' in self.steps_to_apply or 'preprocessing' in self.steps_to_apply:
            skewed_steps.append(('imputer', SimpleImputer(strategy='median')))
        if 'log_transform' in self.steps_to_apply or 'preprocessing' in self.steps_to_apply:
            skewed_steps.append(('log', LogTransformer()))
        if 'encoding' in self.steps_to_apply or 'preprocessing' in self.steps_to_apply:  # scaling numeric
            skewed_steps.append(('scaler', StandardScaler()))
        skewed_pipeline = Pipeline(skewed_steps) if skewed_steps else 'passthrough'

        # ----- Symmetric numerical pipeline -----
        sym_steps = []
        if 'impute' in self.steps_to_apply or 'preprocessing' in self.steps_to_apply:
            sym_steps.append(('imputer', SimpleImputer(strategy='median')))
        if 'encoding' in self.steps_to_apply or 'preprocessing' in self.steps_to_apply:
            sym_steps.append(('scaler', MinMaxScaler()))
        sym_pipeline = Pipeline(sym_steps) if sym_steps else 'passthrough'

        # ----- Combine all pipelines -----
        self.preprocessor = ColumnTransformer(
            transformers=[
                ('cat', cat_pipeline, self.categorical_features),
                ('skew', skewed_pipeline, self.skewed_features),
                ('sym', sym_pipeline, self.symmetric_features)
            ],
            remainder='drop'
        )

    def fit(self, X, y=None):
        self._build_pipeline()
        self.preprocessor.fit(X)
        return self

    def transform(self, X):
        """Transform WITHOUT resampling (pure preprocessing)."""
        return self.preprocessor.transform(X)

    def fit_transform(self, X, y=None):
        """Fit and transform WITHOUT resampling (SMOTE will be handled outside)."""
        self._build_pipeline()
        return self.preprocessor.fit_transform(X)

# Integrated Model Pipeline

In [6]:
# ------------------------- FRAUD PIPELINE CLASS -------------------------
class FraudPipeline:
    # ----- Step mappings -----
    FEATURE_ENG_SUBSTEPS = ['interaction', 'ratio', 'binning', 'time_feature']
    PREPROCESS_SUBSTEPS = ['encoding', 'impute', 'log_transform', 'smote']
    
    def __init__(self, steps_to_apply=None, model=None, test_size=0.2,
                 random_state=42, resample_method="smote", experiment_name="FraudDetection"):
        self.steps_to_apply = self.expand_steps(steps_to_apply)
        self.model = model or RandomForestClassifier(class_weight='balanced', random_state=random_state)
        self.test_size = test_size
        self.random_state = random_state
        self.resample_method = resample_method
        self.experiment_name = experiment_name

        # Pipeline placeholder
        self.pipeline = None
        self.best_threshold = 0.5  # Default threshold (updated after training)

        # Feature groups
        self.categorical = ['Category', 'paymentMethod', 'isWeekend']
        self.skewed = ['numItems', 'localTime', 'paymentMethodAgeDays']
        self.symmetric = ['accountAgeDays']
        self.target = 'label'

        # Initialize MLflow experiment
        mlflow.set_experiment(self.experiment_name)

    # ------------------------- STEP EXPANSION -------------------------
    def expand_steps(self, steps_to_apply):
        """Strict parent-substep logic with full-step auto expansion."""
        steps = set(steps_to_apply or [])

        # ---------- Feature Engineering ----------
        fe_substeps = steps.intersection(self.FEATURE_ENG_SUBSTEPS)

        if fe_substeps and 'feature_engineering' not in steps:
            raise ValueError(
                f"Feature engineering sub-steps {fe_substeps} provided without 'feature_engineering' parent step."
            )

        if 'feature_engineering' in steps:
            if fe_substeps:
                steps = steps  # Only chosen sub-steps
            else:
                steps.update(self.FEATURE_ENG_SUBSTEPS)  # Apply all sub-steps

        # ---------- Preprocessing ----------
        pre_substeps = steps.intersection(self.PREPROCESS_SUBSTEPS)

        if pre_substeps and 'preprocessing' not in steps:
            raise ValueError(
                f"Preprocessing sub-steps {pre_substeps} provided without 'preprocessing' parent step."
            )

        if 'preprocessing' in steps:
            if pre_substeps:
                steps = steps
            else:
                steps.update(self.PREPROCESS_SUBSTEPS)  # Apply all preprocessing steps

        return list(steps)

    # ------------------------- TRAIN -------------------------
    def train(self, df):
        """Fit full pipeline with no data leakage and log experiment metadata."""

        # --- Split ---
        X = df.drop(columns=[self.target])
        y = df[self.target]
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, stratify=y, test_size=self.test_size, random_state=self.random_state, shuffle=True
        )

        # --- Feature Engineering Step ---
        feature_engineer = FeatureEngineering(steps_to_apply=self.steps_to_apply) \
            if any(s in self.steps_to_apply for s in self.FEATURE_ENG_SUBSTEPS) else 'passthrough'

        # --- Preprocessing Step ---
        preprocessor = Preprocessing(
            self.categorical, self.skewed, self.symmetric,
            steps_to_apply=self.steps_to_apply,
            random_state=self.random_state
        ) if any(s in self.steps_to_apply for s in self.PREPROCESS_SUBSTEPS) else 'passthrough'

        # --- Build pipeline (no resampling in pipeline) ---
        self.pipeline = ImbPipeline([
            ('feature_engineering', feature_engineer),
            ('preprocessing', preprocessor),
            ('model', self.model)
        ])

        # --- Transform Train/Test separately (no leakage) ---
        X_train_transformed = self.pipeline[:-1].fit_transform(X_train, y_train)
        X_test_transformed = self.pipeline[:-1].transform(X_test)

        # --- Apply SMOTE/ADASYN only on Train ---
        if 'smote' in self.steps_to_apply:
            if self.resample_method == "smote":
                sampler = SMOTE(random_state=self.random_state, sampling_strategy='minority', k_neighbors=5)
            elif self.resample_method == "adasyn":
                sampler = ADASYN(random_state=self.random_state, sampling_strategy='minority', n_neighbors=5)
            else:
                raise ValueError("resample_method must be 'smote' or 'adasyn'")

            X_train_transformed, y_train = sampler.fit_resample(X_train_transformed, y_train)

        # --- Train Model ---
        if 'model_training' in self.steps_to_apply:
            with mlflow.start_run(run_name=f"{type(self.model).__name__}_run"):
                # -------- Log Parameters --------
                mlflow.log_param("steps_to_apply", self.steps_to_apply)
                mlflow.log_param("resample_method", self.resample_method)
                mlflow.log_param("test_size", self.test_size)
                mlflow.log_param("model", type(self.model).__name__)
                mlflow.log_param("categorical_features", self.categorical)
                mlflow.log_param("skewed_features", self.skewed)
                mlflow.log_param("symmetric_features", self.symmetric)

                # -------- Train and Evaluate --------
                self.model.fit(X_train_transformed, y_train)

                # Threshold tuning using TRAIN probabilities
                y_train_proba = self.model.predict_proba(X_train_transformed)[:, 1]
                precision, recall, thresholds = precision_recall_curve(y_train, y_train_proba)
                f1_scores = 2 * (precision * recall) / (precision + recall + 1e-6)
                best_idx = f1_scores.argmax()
                self.best_threshold = thresholds[best_idx] if best_idx < len(thresholds) else 0.5
                mlflow.log_param("optimal_threshold", self.best_threshold)

                print(f"Optimal Threshold (from train): {self.best_threshold:.3f} "
                      f"(Precision={precision[best_idx]:.3f}, Recall={recall[best_idx]:.3f})")

                # Evaluate train/test metrics (default 0.5 threshold)
                y_train_pred = self.model.predict(X_train_transformed)
                y_test_pred = self.model.predict(X_test_transformed)

                train_metrics = self._calculate_metrics(y_train, y_train_pred, prefix="train")
                test_metrics = self._calculate_metrics(y_test, y_test_pred, prefix="test")
                self._log_metrics(train_metrics)
                self._log_metrics(test_metrics)

                # Log PR-AUC on TEST set
                y_test_proba = self.model.predict_proba(X_test_transformed)[:, 1]
                pr_auc = average_precision_score(y_test, y_test_proba)
                mlflow.log_metric("test_pr_auc", pr_auc)

                # --- Save PR curve ---
                precision_test, recall_test, _ = precision_recall_curve(y_test, y_test_proba)
                plt.figure(figsize=(6, 4))
                plt.plot(recall_test, precision_test, label=f'PR curve (AUC={pr_auc:.3f})')
                plt.xlabel('Recall')
                plt.ylabel('Precision')
                plt.title('Precision-Recall Curve (Test)')
                plt.legend()
                plt.grid(True)
                pr_curve_path = "artifacts/pr_curve.png"
                os.makedirs("artifacts", exist_ok=True)
                plt.savefig(pr_curve_path, bbox_inches='tight')
                plt.close()
                mlflow.log_artifact(pr_curve_path, "precision_recall_curve")

                # --- Confusion matrix ---
                cm = confusion_matrix(y_test, y_test_pred)
                fig, ax = plt.subplots(figsize=(6, 4))
                sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=ax)
                ax.set_title('Confusion Matrix')
                ax.set_xlabel('Predicted')
                ax.set_ylabel('Actual')
                cm_png_path = "artifacts/confusion_matrix.png"
                plt.savefig(cm_png_path, bbox_inches='tight')
                plt.close(fig)
                mlflow.log_artifact(cm_png_path, "confusion_matrix")

                # --- Log data distribution ---
                class_dist = y.value_counts(normalize=True).to_dict()
                mlflow.log_param("class_distribution", json.dumps(class_dist))

                # --- Log model to MLflow ---
                signature = infer_signature(X_train_transformed, self.model.predict(X_train_transformed))
                mlflow.sklearn.log_model(self.pipeline, name="fraud_pipeline", signature=signature)

                # Save pipeline locally
                # joblib.dump(self.pipeline, "artifacts/fraud_pipeline.pkl")
                joblib.dump(self, "artifacts/fraud_pipeline.pkl")
                mlflow.log_artifacts("artifacts")

        return self.pipeline, X_train_transformed, y_train, X_test_transformed, y_test

    # ------------------------- PREPROCESS -------------------------
    def preprocess(self, df):
        if not self.pipeline:
            raise ValueError("Pipeline not trained. Train first or load fitted pipeline.")
        return self.pipeline[:-1].transform(df)

    # ------------------------- PREDICT -------------------------
    def predict(self, df, use_optimal_threshold=False):
        if not self.pipeline:
            raise ValueError("Pipeline not trained. Train first or load fitted pipeline.")
        
        # Transform if DataFrame
        transformed = self.pipeline[:-1].transform(df) if isinstance(df, pd.DataFrame) else df

        if use_optimal_threshold:
            probs = self.pipeline[-1].predict_proba(transformed)[:, 1]
            return (probs >= self.best_threshold).astype(int)
        else:
            return self.pipeline[-1].predict(transformed)

    # ------------------------- PREDICT PROBA -------------------------
    def predict_proba(self, df):
        """Return fraud probability predictions (class 1)."""
        if not self.pipeline:
            raise ValueError("Pipeline not trained. Train first or load fitted pipeline.")
        
        transformed = self.pipeline[:-1].transform(df) if isinstance(df, pd.DataFrame) else df
        return self.pipeline[-1].predict_proba(transformed)[:, 1]

    # ------------------------- METRICS UTILS -------------------------
    def _calculate_metrics(self, y_true, y_pred, prefix=""):
        return {
            f"{prefix}_accuracy": accuracy_score(y_true, y_pred),
            f"{prefix}_precision": precision_score(y_true, y_pred),
            f"{prefix}_recall": recall_score(y_true, y_pred),
            f"{prefix}_f1": f1_score(y_true, y_pred)
        }

    def _log_metrics(self, metrics):
        for k, v in metrics.items():
            mlflow.log_metric(k, v)


# Training & Validation

In [7]:
def evaluate_model(y_true, y_pred, dataset_name="Evaluation", is_proba=False):
    """
    Prints and returns classification metrics with optional PR-AUC and threshold tuning.
    
    Parameters
    ----------
    y_true : array-like
        True labels.
    y_pred : array-like
        Predicted labels or probabilities.
    dataset_name : str
        Name for the dataset (used in print titles and keys).
    is_proba : bool
        If True, `y_pred` is treated as probabilities for PR-AUC and threshold tuning.
    """
    best_threshold = 0.5  # Default threshold

    if is_proba:
        # --- Compute PR-AUC ---
        pr_auc = average_precision_score(y_true, y_pred)
        precision, recall, thresholds = precision_recall_curve(y_true, y_pred)

        # --- Find best threshold (maximize F1) ---
        f1_scores = 2 * (precision * recall) / (precision + recall + 1e-6)
        best_idx = f1_scores.argmax()
        best_threshold = thresholds[best_idx] if best_idx < len(thresholds) else 0.5

        print(f"\n--- {dataset_name} PR-AUC: {pr_auc:.4f} ---")
        print(f"Optimal Threshold: {best_threshold:.3f} "
              f"(Precision={precision[best_idx]:.3f}, Recall={recall[best_idx]:.3f})")

        # Convert probabilities to binary using best threshold
        y_pred = (y_pred >= best_threshold).astype(int)

    # --- Classification report ---
    print(f"\n--- {dataset_name} Classification Report ---")
    print(classification_report(y_true, y_pred))

    # --- Compute summary metrics ---
    accuracy = accuracy_score(y_true, y_pred)
    precision_val = precision_score(y_true, y_pred)
    recall_val = recall_score(y_true, y_pred)
    f1_val = f1_score(y_true, y_pred)

    print(f"{dataset_name} Accuracy:  {accuracy:.4f}")
    print(f"{dataset_name} Precision: {precision_val:.4f}")
    print(f"{dataset_name} Recall:    {recall_val:.4f}")
    print(f"{dataset_name} F1 Score:   {f1_val:.4f}")

    # --- Return results ---
    result = {
        f"{dataset_name}_accuracy": accuracy,
        f"{dataset_name}_precision": precision_val,
        f"{dataset_name}_recall": recall_val,
        f"{dataset_name}_f1": f1_val,
    }
    if is_proba:
        result[f"{dataset_name}_pr_auc"] = pr_auc
        result[f"{dataset_name}_best_threshold"] = best_threshold

    return result


# ----------------------- Step 1: Create Hold-out Set -----------------------
df_clean = df.drop_duplicates(keep='first')

n1, n2 = 99, 1
n = 50

holdout_class_0_A = df_clean[df_clean['label'] == 0].sample(n1, random_state=42)
holdout_class_1_A = df_clean[df_clean['label'] == 1].sample(n2, random_state=42)
holdout_df_A = pd.concat([holdout_class_0_A, holdout_class_1_A])

train_df = df_clean.drop(holdout_df_A.index)

X_holdout_A = holdout_df_A.drop(columns=['label'])
y_holdout_A = holdout_df_A['label']

holdout_class_0_B = train_df[train_df['label'] == 0].sample(n1, random_state=42)
holdout_class_1_B = train_df[train_df['label'] == 1].sample(n2, random_state=42)
holdout_df_B = pd.concat([holdout_class_0_B, holdout_class_1_B])

train_df = train_df.drop(holdout_df_B.index)

X_holdout_B = holdout_df_B.drop(columns=['label'])
y_holdout_B = holdout_df_B['label']

holdout_class_0_C = train_df[train_df['label'] == 0].sample(n, random_state=42)
holdout_class_1_C = train_df[train_df['label'] == 1].sample(n, random_state=42)
holdout_df_C = pd.concat([holdout_class_0_C, holdout_class_1_C])

train_df = train_df.drop(holdout_df_C.index)

X_holdout_C = holdout_df_C.drop(columns=['label'])
y_holdout_C = holdout_df_C['label']

print(f'Orignal data size: {df.shape}')
print(f'Cleaned (without duplicates) data size: {df_clean.shape}')
print(f"Training data size: {train_df.shape}")
print(f"Hold-out data A size: {holdout_df_A.shape}")
print(f"Hold-out data B size: {holdout_df_B.shape}")
print(f"Hold-out data C size: {holdout_df_B.shape}")

# ----------------------- Step 2: Initialize FraudPipeline -----------------------
fp = FraudPipeline(
    steps_to_apply=[
        'feature_engineering', 
        # 'interaction', 
        # 'ratio', 
        # 'binning', 
        # 'time_feature',
        
        'preprocessing', 
        # 'encoding', 
        # 'impute', 
        # 'log_transform', 
        # 'smote', 
        
        'model_training',
    ],
    resample_method='smote',
    # model=xgb.XGBClassifier(),
    model=LogisticRegression(),
)

# ----------------------- Step 3: Train Pipeline -----------------------
pipeline, X_train, y_train, X_test, y_test = fp.train(train_df)
print('Best Threshold: ', fp.best_threshold)
# ----------------------- Step 4: Evaluate on Internal Test Split -----------------------
# Get probabilities and apply stored threshold
test_probabilities = fp.predict_proba(X_test)
test_preds = (test_probabilities >= fp.best_threshold).astype(int)
test_metrics = evaluate_model(y_test, test_preds, dataset_name="Internal Test Split")

# ----------------------- Step 5: Evaluate on Hold-out Sets -----------------------
# Hold-out Set A
holdout_prob_A = fp.predict_proba(X_holdout_A)
holdout_preds_A = (holdout_prob_A >= fp.best_threshold).astype(int)
holdout_metrics_A = evaluate_model(y_holdout_A, holdout_preds_A, dataset_name="Hold-out Set A")

# Hold-out Set B
holdout_prob_B = fp.predict_proba(X_holdout_B)
holdout_preds_B = (holdout_prob_B >= fp.best_threshold).astype(int)
holdout_metrics_B = evaluate_model(y_holdout_B, holdout_preds_B, dataset_name="Hold-out Set B")

# Hold-out Set C
holdout_prob_C = fp.predict_proba(X_holdout_C)
holdout_preds_C = (holdout_prob_C >= fp.best_threshold).astype(int)
holdout_metrics_C = evaluate_model(y_holdout_C, holdout_preds_C, dataset_name="Hold-out Set C")

# ----------------------- Step 6: Save Pipeline -----------------------
joblib.dump(fp.pipeline, "fraud_pipeline_deployed.pkl")
print("\nPipeline saved as 'fraud_pipeline_deployed.pkl'")

# ----------------------- Step 7: Load & Predict (Example) -----------------------
loaded_pipeline = joblib.load("fraud_pipeline_deployed.pkl")
sample_preds_A = loaded_pipeline.predict(X_holdout_A)
sample_preds_B = loaded_pipeline.predict(X_holdout_B)
sample_preds_C = loaded_pipeline.predict(X_holdout_C)

display(pd.DataFrame({'Predictions': sample_preds_A, 'True_Value': y_holdout_A}).head(100))
display(pd.DataFrame({'Predictions': sample_preds_B, 'True_Value': y_holdout_B}).head(100))
display(pd.DataFrame({'Predictions': sample_preds_C, 'True_Value': y_holdout_C}).head(100))

Orignal data size: (39221, 8)
Cleaned (without duplicates) data size: (36188, 8)
Training data size: (35888, 8)
Hold-out data A size: (100, 8)
Hold-out data B size: (100, 8)
Hold-out data C size: (100, 8)
Optimal Threshold (from train): 0.837 (Precision=0.961, Recall=0.986)
Best Threshold:  0.8367815357592582

--- Internal Test Split Classification Report ---
              precision    recall  f1-score   support

           0       1.00      0.96      0.98      7109
           1       0.19      0.99      0.33        69

    accuracy                           0.96      7178
   macro avg       0.60      0.97      0.65      7178
weighted avg       0.99      0.96      0.97      7178

Internal Test Split Accuracy:  0.9607
Internal Test Split Precision: 0.1948
Internal Test Split Recall:    0.9855
Internal Test Split F1 Score:   0.3254

--- Hold-out Set A Classification Report ---
              precision    recall  f1-score   support

           0       1.00      0.97      0.98        99
   

Unnamed: 0,Predictions,True_Value
35999,0,0
18192,1,0
5589,0,0
18168,0,0
24727,0,0
...,...,...
25217,1,0
8568,0,0
7560,0,0
28968,0,0


Unnamed: 0,Predictions,True_Value
6153,0,0
660,0,0
26114,0,0
35825,0,0
13314,0,0
...,...,...
19651,0,0
13654,0,0
9782,0,0
13045,0,0


Unnamed: 0,Predictions,True_Value
36787,0,0
17068,0,0
32649,0,0
34687,0,0
20942,0,0
...,...,...
6987,1,1
3375,1,1
30752,1,1
8246,1,1


In [8]:
print('HOLD OUT A DATA')
print(pd.DataFrame({'Predictions': sample_preds_A, 'True_Value': y_holdout_A})['True_Value'].value_counts())
print(pd.DataFrame({'Predictions': sample_preds_A, 'True_Value': y_holdout_A})['Predictions'].value_counts())
print('HOLD OUT B DATA')
print(pd.DataFrame({'Predictions': sample_preds_B, 'True_Value': y_holdout_B})['True_Value'].value_counts())
print(pd.DataFrame({'Predictions': sample_preds_B, 'True_Value': y_holdout_B})['Predictions'].value_counts())
print('HOLD OUT C DATA')
print(pd.DataFrame({'Predictions': sample_preds_C, 'True_Value': y_holdout_C})['True_Value'].value_counts())
print(pd.DataFrame({'Predictions': sample_preds_C, 'True_Value': y_holdout_C})['Predictions'].value_counts())

HOLD OUT A DATA
True_Value
0    99
1     1
Name: count, dtype: int64
Predictions
0    89
1    11
Name: count, dtype: int64
HOLD OUT B DATA
True_Value
0    99
1     1
Name: count, dtype: int64
Predictions
0    94
1     6
Name: count, dtype: int64
HOLD OUT C DATA
True_Value
0    50
1    50
Name: count, dtype: int64
Predictions
1    51
0    49
Name: count, dtype: int64


# Loading and Testing

In [9]:
# Load the full FraudPipeline object
fp_loaded = joblib.load("artifacts/fraud_pipeline.pkl")

# fp_loaded.best_threshold = 0.9

# Use predict with stored best_threshold
preds_A = fp_loaded.predict(X_holdout_A, use_optimal_threshold=True)
preds_B = fp_loaded.predict(X_holdout_B, use_optimal_threshold=True)
preds_C = fp_loaded.predict(X_holdout_C, use_optimal_threshold=True)

# Evaluate
evaluate_model(y_holdout_A, preds_A, dataset_name="Hold-out A")
evaluate_model(y_holdout_B, preds_B, dataset_name="Hold-out B")
evaluate_model(y_holdout_C, preds_C, dataset_name="Hold-out C")

print('HOLD OUT A DATA')
print(pd.DataFrame({'Predictions': preds_A, 'True_Value': y_holdout_A})['True_Value'].value_counts())
print(pd.DataFrame({'Predictions': preds_A, 'True_Value': y_holdout_A})['Predictions'].value_counts())
print('HOLD OUT B DATA')
print(pd.DataFrame({'Predictions': preds_B, 'True_Value': y_holdout_B})['True_Value'].value_counts())
print(pd.DataFrame({'Predictions': preds_B, 'True_Value': y_holdout_B})['Predictions'].value_counts())
print('HOLD OUT C DATA')
print(pd.DataFrame({'Predictions': preds_C, 'True_Value': y_holdout_C})['True_Value'].value_counts())
print(pd.DataFrame({'Predictions': preds_C, 'True_Value': y_holdout_C})['Predictions'].value_counts())

print('Evaluation DONE')


--- Hold-out A Classification Report ---
              precision    recall  f1-score   support

           0       1.00      0.97      0.98        99
           1       0.25      1.00      0.40         1

    accuracy                           0.97       100
   macro avg       0.62      0.98      0.69       100
weighted avg       0.99      0.97      0.98       100

Hold-out A Accuracy:  0.9700
Hold-out A Precision: 0.2500
Hold-out A Recall:    1.0000
Hold-out A F1 Score:   0.4000

--- Hold-out B Classification Report ---
              precision    recall  f1-score   support

           0       1.00      0.99      0.99        99
           1       0.50      1.00      0.67         1

    accuracy                           0.99       100
   macro avg       0.75      0.99      0.83       100
weighted avg       0.99      0.99      0.99       100

Hold-out B Accuracy:  0.9900
Hold-out B Precision: 0.5000
Hold-out B Recall:    1.0000
Hold-out B F1 Score:   0.6667

--- Hold-out C Classificati

# Overview

In [10]:
# --- Split your dataset ---
X = df.drop(columns=['label'])
y = df['label']

# Instantiate FeatureEngineering
fe = FeatureEngineering(steps_to_apply=['feature_engineering', 'interaction', 'ratio', 'binning', 'time_feature'])

# BEFORE: Inspect original columns
print("Columns BEFORE Feature Engineering:")
print(list(X.columns))

# Apply Feature Engineering
X_transformed = fe.fit_transform(X)

# AFTER: Inspect new columns
print("\nColumns AFTER Feature Engineering:")
print(list(X_transformed.columns))

# Compare column difference
added_columns = set(X_transformed.columns) - set(X.columns)
print("\nNewly added columns:", added_columns)

# Show head of transformed data
print("\nSample transformed data:")
display(X_transformed.head())

Columns BEFORE Feature Engineering:
['accountAgeDays', 'numItems', 'localTime', 'paymentMethod', 'paymentMethodAgeDays', 'Category', 'isWeekend']

Columns AFTER Feature Engineering:
['accountAgeDays', 'numItems', 'localTime', 'paymentMethod', 'paymentMethodAgeDays', 'Category', 'isWeekend', 'Category_Payment', 'payment_account_ratio', 'account_age_bin', 'time_of_day']

Newly added columns: {'account_age_bin', 'time_of_day', 'payment_account_ratio', 'Category_Payment'}

Sample transformed data:


Unnamed: 0,accountAgeDays,numItems,localTime,paymentMethod,paymentMethodAgeDays,Category,isWeekend,Category_Payment,payment_account_ratio,account_age_bin,time_of_day
0,29,1,4.745402,paypal,28.204861,shopping,0.0,shopping_paypal,0.940162,new,night
1,725,1,4.742303,storecredit,0.0,electronics,0.0,electronics_storecredit,0.0,medium,night
2,845,1,4.921318,creditcard,0.0,food,1.0,food_creditcard,0.0,old,night
3,503,1,4.886641,creditcard,0.0,electronics,1.0,electronics_creditcard,0.0,medium,night
4,2000,1,5.040929,creditcard,0.0,shopping,0.0,shopping_creditcard,0.0,old,night


In [11]:
def get_feature_names_from_column_transformer(column_transformer):
    output_features = []
    for name, pipe, features in column_transformer.transformers_:
        if name != 'remainder':
            if hasattr(pipe, 'named_steps'):
                last_step = list(pipe.named_steps.values())[-1]
                if hasattr(last_step, 'get_feature_names_out'):
                    feature_names = last_step.get_feature_names_out(features)
                else:
                    feature_names = features
            else:
                feature_names = features
            output_features.extend(feature_names)
        else:
            output_features.extend(features)  # passthrough features
    return output_features

# Assume you have X (original features) and Preprocessing object
pre = Preprocessing(
    categorical_features=['Category', 'paymentMethod', 'isWeekend'],
    skewed_features=['numItems', 'localTime', 'paymentMethodAgeDays'],
    symmetric_features=['accountAgeDays'],
    steps_to_apply=['preprocessing']  # full preprocessing
)

# BEFORE: original data
print("Original Columns:", X.columns.tolist())
display(X.head())

# Fit + transform
X_transformed = pre.fit_transform(X)

# Get transformed column names
transformed_cols = get_feature_names_from_column_transformer(pre.preprocessor)

# AFTER: transformed data
print("\nTransformed Shape:", X_transformed.shape)
print("Transformed Columns (after encoding, scaling, log):")
print(transformed_cols)

# Convert transformed array to DataFrame for easy inspection
X_transformed_df = pd.DataFrame(X_transformed, columns=transformed_cols)

# Show first few rows
display(X_transformed_df.head())


Original Columns: ['accountAgeDays', 'numItems', 'localTime', 'paymentMethod', 'paymentMethodAgeDays', 'Category', 'isWeekend']


Unnamed: 0,accountAgeDays,numItems,localTime,paymentMethod,paymentMethodAgeDays,Category,isWeekend
0,29,1,4.745402,paypal,28.204861,shopping,0.0
1,725,1,4.742303,storecredit,0.0,electronics,0.0
2,845,1,4.921318,creditcard,0.0,food,1.0
3,503,1,4.886641,creditcard,0.0,electronics,1.0
4,2000,1,5.040929,creditcard,0.0,shopping,0.0



Transformed Shape: (39221, 9)
Transformed Columns (after encoding, scaling, log):
['Category_food', 'Category_shopping', 'paymentMethod_paypal', 'paymentMethod_storecredit', 'isWeekend_1.0', 'numItems', 'localTime', 'paymentMethodAgeDays', 'accountAgeDays']


Unnamed: 0,Category_food,Category_shopping,paymentMethod_paypal,paymentMethod_storecredit,isWeekend_1.0,numItems,localTime,paymentMethodAgeDays,accountAgeDays
0,0.0,1.0,1.0,0.0,0.0,-0.189142,0.028772,0.557839,0.014007
1,0.0,0.0,0.0,1.0,0.0,-0.189142,0.021742,-0.775564,0.362181
2,1.0,0.0,0.0,0.0,1.0,-0.189142,0.421745,-0.775564,0.422211
3,0.0,0.0,0.0,0.0,1.0,-0.189142,0.345213,-0.775564,0.251126
4,0.0,1.0,0.0,0.0,0.0,-0.189142,0.682328,-0.775564,1.0


In [12]:
def get_preprocessed_dataframe(preprocessor_obj, X):
    """
    Returns preprocessed dataframe with column names after transformation.
    """
    # Fit and transform
    X_transformed = preprocessor_obj.fit_transform(X)
    
    # Extract feature names
    feature_names = get_feature_names_from_column_transformer(preprocessor_obj.preprocessor)
    
    # Convert to DataFrame
    preprocessed_df = pd.DataFrame(X_transformed, columns=feature_names, index=X.index)
    
    return preprocessed_df


# ---------- Usage ----------
# Initialize Preprocessing object
pre = Preprocessing(
    categorical_features=['Category', 'paymentMethod', 'isWeekend'],
    skewed_features=['numItems', 'localTime', 'paymentMethodAgeDays'],
    symmetric_features=['accountAgeDays'],
    steps_to_apply=['preprocessing']  # includes impute, encoding, scaling, log
)

# Get preprocessed DataFrame
preprocessed_df = get_preprocessed_dataframe(pre, X)

# Show
print("Preprocessed columns:")
print(preprocessed_df.columns.tolist())
display(preprocessed_df.head())

Preprocessed columns:
['Category_food', 'Category_shopping', 'paymentMethod_paypal', 'paymentMethod_storecredit', 'isWeekend_1.0', 'numItems', 'localTime', 'paymentMethodAgeDays', 'accountAgeDays']


Unnamed: 0,Category_food,Category_shopping,paymentMethod_paypal,paymentMethod_storecredit,isWeekend_1.0,numItems,localTime,paymentMethodAgeDays,accountAgeDays
0,0.0,1.0,1.0,0.0,0.0,-0.189142,0.028772,0.557839,0.014007
1,0.0,0.0,0.0,1.0,0.0,-0.189142,0.021742,-0.775564,0.362181
2,1.0,0.0,0.0,0.0,1.0,-0.189142,0.421745,-0.775564,0.422211
3,0.0,0.0,0.0,0.0,1.0,-0.189142,0.345213,-0.775564,0.251126
4,0.0,1.0,0.0,0.0,0.0,-0.189142,0.682328,-0.775564,1.0


In [13]:
# Initialize FeatureEngineering with chosen steps
fe = FeatureEngineering(
    steps_to_apply=['feature_engineering', 'interaction', 'ratio', 'binning', 'time_feature']
)

# Apply feature engineering
X_fe = fe.fit_transform(X)

print("Feature Engineered Columns:")
print(X_fe.columns.tolist())
display(X_fe.head())

# Initialize Preprocessing with chosen steps
pre = Preprocessing(
    categorical_features=['Category', 'paymentMethod', 'isWeekend', 'Category_Payment', 'account_age_bin', 'time_of_day'],
    skewed_features=['numItems', 'localTime', 'paymentMethodAgeDays'],
    symmetric_features=['accountAgeDays', 'payment_account_ratio'],
    steps_to_apply=['preprocessing']  # includes impute, encoding, log, scaling
)

# Fit & transform feature engineered data
X_preprocessed = pre.fit_transform(X_fe)

# Extract transformed feature names
feature_names = get_feature_names_from_column_transformer(pre.preprocessor)

# Convert to DataFrame
X_preprocessed_df = pd.DataFrame(X_preprocessed, columns=feature_names, index=X_fe.index)

print("\nPreprocessed Columns:")
print(X_preprocessed_df.columns.tolist())
display(X_preprocessed_df.head())

Feature Engineered Columns:
['accountAgeDays', 'numItems', 'localTime', 'paymentMethod', 'paymentMethodAgeDays', 'Category', 'isWeekend', 'Category_Payment', 'payment_account_ratio', 'account_age_bin', 'time_of_day']


Unnamed: 0,accountAgeDays,numItems,localTime,paymentMethod,paymentMethodAgeDays,Category,isWeekend,Category_Payment,payment_account_ratio,account_age_bin,time_of_day
0,29,1,4.745402,paypal,28.204861,shopping,0.0,shopping_paypal,0.940162,new,night
1,725,1,4.742303,storecredit,0.0,electronics,0.0,electronics_storecredit,0.0,medium,night
2,845,1,4.921318,creditcard,0.0,food,1.0,food_creditcard,0.0,old,night
3,503,1,4.886641,creditcard,0.0,electronics,1.0,electronics_creditcard,0.0,medium,night
4,2000,1,5.040929,creditcard,0.0,shopping,0.0,shopping_creditcard,0.0,old,night



Preprocessed Columns:
['Category_food', 'Category_shopping', 'paymentMethod_paypal', 'paymentMethod_storecredit', 'isWeekend_1.0', 'Category_Payment_electronics_paypal', 'Category_Payment_electronics_storecredit', 'Category_Payment_food_creditcard', 'Category_Payment_food_paypal', 'Category_Payment_food_storecredit', 'Category_Payment_shopping_creditcard', 'Category_Payment_shopping_paypal', 'Category_Payment_shopping_storecredit', 'account_age_bin_new', 'account_age_bin_old', 'time_of_day_evening', 'time_of_day_night', 'numItems', 'localTime', 'paymentMethodAgeDays', 'accountAgeDays', 'payment_account_ratio']


Unnamed: 0,Category_food,Category_shopping,paymentMethod_paypal,paymentMethod_storecredit,isWeekend_1.0,Category_Payment_electronics_paypal,Category_Payment_electronics_storecredit,Category_Payment_food_creditcard,Category_Payment_food_paypal,Category_Payment_food_storecredit,...,Category_Payment_shopping_storecredit,account_age_bin_new,account_age_bin_old,time_of_day_evening,time_of_day_night,numItems,localTime,paymentMethodAgeDays,accountAgeDays,payment_account_ratio
0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,1.0,-0.189142,0.028772,0.557839,0.014007,0.940649
1,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,-0.189142,0.021742,-0.775564,0.362181,0.0
2,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,1.0,0.0,1.0,-0.189142,0.421745,-0.775564,0.422211,0.0
3,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,-0.189142,0.345213,-0.775564,0.251126,0.0
4,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,1.0,-0.189142,0.682328,-0.775564,1.0,0.0
