In [1]:
# -------------------- Standard Library --------------------
import os
import glob
import math
import itertools
import json
import warnings

# Suppress warnings
warnings.filterwarnings('ignore')

# -------------------- Core Scientific Libraries --------------------
import numpy as np
import pandas as pd

# -------------------- Visualization --------------------
import matplotlib.pyplot as plt
import seaborn as sns
from PIL import Image

# -------------------- Statistical Analysis --------------------
from scipy.stats import skew, shapiro, kurtosis, chi2_contingency, f_oneway, kruskal

# -------------------- Scikit-learn Core --------------------
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler, MinMaxScaler
from sklearn.impute import SimpleImputer
from sklearn.feature_selection import mutual_info_classif, mutual_info_regression
from sklearn.model_selection import train_test_split
from sklearn.ensemble import IsolationForest, RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report, confusion_matrix

# -------------------- Gradient Boosting --------------------
import xgboost as xgb

# -------------------- Imbalanced Learning --------------------
from imblearn.pipeline import Pipeline as ImbPipeline
from imblearn.over_sampling import SMOTE, ADASYN

# -------------------- Experiment Tracking (MLflow) --------------------
import mlflow
import mlflow.sklearn
from mlflow.models.signature import infer_signature

# -------------------- Model Persistence --------------------
import joblib

In [23]:
df = pd.read_csv(r'C:\Users\Asus\Downloads\Fraud_MLOps_Project\Data\payment_fraud.csv')
df.head()

Unnamed: 0,accountAgeDays,numItems,localTime,paymentMethod,paymentMethodAgeDays,Category,isWeekend,label
0,29,1,4.745402,paypal,28.204861,shopping,0.0,0
1,725,1,4.742303,storecredit,0.0,electronics,0.0,0
2,845,1,4.921318,creditcard,0.0,food,1.0,0
3,503,1,4.886641,creditcard,0.0,electronics,1.0,0
4,2000,1,5.040929,creditcard,0.0,shopping,0.0,0


In [3]:
# d = dtale.show(df, ignore_duplicate=False)
# d.open_browser()
# d

In [4]:
class EDA:
    def __init__(self, df, report_path="eda_results", outlier_method="iqr", threshold=1.5):
        """
        outlier_method: 'iqr' or 'zscore'
        threshold: For IQR -> multiplier (1.5), for Z-Score -> sigma (3)
        """
        self.df = df.copy()  # Read-only
        self.report_path = report_path
        self.outlier_method = outlier_method
        self.threshold = threshold
        self.target_col = None

        # Folder structure
        folders = [
            "missing_data", "cardinality", "numerical_analysis", "categorical_analysis",
            "relationships", "outliers", "distribution_shape", "target_analysis",
            "advanced_relationships"
        ]
        for folder in folders:
            os.makedirs(os.path.join(report_path, folder), exist_ok=True)

    # ----------------- Helper Functions -----------------
    def save_text(self, folder, filename, content):
        with open(os.path.join(self.report_path, folder, filename), "w") as f:
            f.write(content)

    def set_target(self, target_name=None):
        if target_name is None:
            target_name = input("Enter target column name (or leave blank if none): ").strip()
        if target_name and target_name in self.df.columns:
            self.target_col = target_name
            print(f"[Info] Target column set to: {self.target_col}")
        else:
            self.target_col = None
            print("[Info] No target column set")

    # ----------------- 1. Missing Data Analysis -----------------
    def advanced_missing_data(self):
        missing = self.df.isnull().sum()
        missing_percent = (missing / len(self.df)) * 100
        summary = pd.DataFrame({"Missing Count": missing, "Missing %": missing_percent})
        summary = summary[summary["Missing Count"] > 0].sort_values(by="Missing %", ascending=False)
        self.save_text("missing_data", "missing_summary.txt", str(summary))

        # Heatmap
        plt.figure(figsize=(10, 6))
        sns.heatmap(self.df.isnull(), cbar=False)
        plt.title("Missing Data Heatmap")
        plt.savefig(os.path.join(self.report_path, "missing_data", "missing_heatmap.png"))
        plt.close()

        # Missing correlation (binary mask correlation)
        missing_corr = self.df.isnull().astype(int).corr()
        self.save_text("missing_data", "missing_correlation.txt", str(missing_corr))

        print("[Saved] Missing data summary, heatmap & correlation")

    # ----------------- 2. Cardinality Analysis -----------------
    def cardinality_analysis(self):
        cat_cols = self.df.select_dtypes(exclude=np.number).columns
        summary = {}
        high_cardinality = []
        for col in cat_cols:
            count = self.df[col].nunique()
            summary[col] = count
            if count > 50:  # Arbitrary threshold
                high_cardinality.append(col)
        text = f"Cardinality per categorical feature:\n{summary}\n\nHigh cardinality features (>50 unique): {high_cardinality}"
        self.save_text("cardinality", "cardinality_summary.txt", text)
        print("[Saved] Cardinality summary with high-cardinality flags")

    # ----------------- 3. Numerical Feature Relationships -----------------
    def numerical_relationships(self):
        num_cols = self.df.select_dtypes(include=np.number).columns
        if len(num_cols) <= 10 and len(num_cols) > 1:
            sns.pairplot(self.df[num_cols].dropna())
            plt.savefig(os.path.join(self.report_path, "relationships", "pairplot.png"))
            plt.close()
            print("[Saved] Pairplot for numerical features")

        # Boxplot of numeric vs categorical target
        if self.target_col and self.df[self.target_col].dtype == 'object':
            for col in num_cols:
                plt.figure(figsize=(6, 4))
                sns.boxplot(x=self.df[self.target_col], y=self.df[col])
                plt.title(f"{col} vs {self.target_col} (Boxplot)")
                plt.savefig(os.path.join(self.report_path, "relationships", f"{col}_vs_target_box.png"))
                plt.close()

        # Scatterplot vs continuous target
        if self.target_col and self.df[self.target_col].dtype != 'object':
            for col in num_cols:
                if col == self.target_col: continue
                plt.figure(figsize=(6, 4))
                sns.scatterplot(x=self.df[col], y=self.df[self.target_col])
                plt.title(f"{col} vs {self.target_col}")
                plt.savefig(os.path.join(self.report_path, "relationships", f"{col}_vs_target.png"))
                plt.close()

    # ----------------- 4. Categorical Feature Relationships -----------------
    def categorical_relationships(self):
        cat_cols = self.df.select_dtypes(exclude=np.number).columns
        chi_summary = []
        for col1, col2 in itertools.combinations(cat_cols, 2):
            table = pd.crosstab(self.df[col1], self.df[col2])
            chi2, p, _, _ = chi2_contingency(table)
            chi_summary.append(f"{col1} vs {col2} -> p={p:.4f}")
        self.save_text("relationships", "categorical_chi2.txt", "\n".join(chi_summary))

        # Cramér’s V
        cramer_results = []
        for col1, col2 in itertools.combinations(cat_cols, 2):
            table = pd.crosstab(self.df[col1], self.df[col2])
            chi2 = chi2_contingency(table)[0]
            n = table.sum().sum()
            phi2 = chi2 / n
            r, k = table.shape
            phi2corr = max(0, phi2 - ((k-1)*(r-1))/(n-1))
            rcorr = r - ((r-1)**2)/(n-1)
            kcorr = k - ((k-1)**2)/(n-1)
            cramer_v = np.sqrt(phi2corr / min((kcorr-1), (rcorr-1)))
            cramer_results.append(f"{col1} vs {col2} -> Cramér's V={cramer_v:.4f}")
        self.save_text("relationships", "categorical_cramers_v.txt", "\n".join(cramer_results))
        print("[Saved] Chi-square and Cramér's V results for categorical features")

    # ----------------- 5. Feature Importance (Mutual Information) -----------------
    def feature_importance(self):
        if self.target_col:
            X = self.df.drop(columns=[self.target_col]).select_dtypes(include=np.number).fillna(0)
            y = self.df[self.target_col]
            if y.dtype == 'object':
                y = y.astype('category').cat.codes
                scores = mutual_info_classif(X, y)
            else:
                scores = mutual_info_regression(X, y)
            mi_scores = pd.Series(scores, index=X.columns).sort_values(ascending=False)
            self.save_text("relationships", "mutual_information.txt", str(mi_scores))
            print("[Saved] Mutual Information scores")

    # ----------------- 6. Target Advanced Analysis -----------------
    def target_advanced(self):
        if self.target_col:
            # Class imbalance
            if self.df[self.target_col].dtype == 'object':
                counts = self.df[self.target_col].value_counts(normalize=True) * 100
                self.save_text("target_analysis", "class_imbalance.txt", str(counts))
            # Target mean per categorical
            cat_cols = self.df.select_dtypes(exclude=np.number).columns
            for col in cat_cols:
                if col != self.target_col:
                    mean_vals = self.df.groupby(col)[self.target_col].mean(numeric_only=True)
                    self.save_text("target_analysis", f"{col}_target_mean.txt", str(mean_vals))
            print("[Saved] Target distribution & mean analysis per category")

    # ----------------- 7. Correlation Beyond Pearson -----------------
    def advanced_correlations(self):
        num_df = self.df.select_dtypes(include=np.number)
        if num_df.shape[1] > 1:
            # Compute correlations
            spearman_corr = num_df.corr(method='spearman')
            kendall_corr = num_df.corr(method='kendall')

            # Save correlation matrices
            self.save_text("advanced_relationships", "spearman_corr.txt", str(spearman_corr))
            self.save_text("advanced_relationships", "kendall_corr.txt", str(kendall_corr))

            # Clean correlation matrix for clustering (replace NaN with 0)
            corr_matrix = num_df.corr().fillna(0)

            try:
                sns.clustermap(corr_matrix, annot=True, cmap='coolwarm')
                plt.savefig(os.path.join(self.report_path, "advanced_relationships", "clustered_corr_heatmap.png"))
                plt.close()
                print("[Saved] Spearman, Kendall correlations & clustered heatmap")
            except ValueError:
                print("[Warning] Could not generate clustered heatmap (NaNs/Infs in data even after fill).")
        else:
            print("[Info] Not enough numerical columns for advanced correlation analysis.")

    # ----------------- 8. Outlier Profiling -----------------
    def outlier_profiling(self):
        num_cols = self.df.select_dtypes(include=np.number).columns
        results = []
        for col in num_cols:
            Q1, Q3 = self.df[col].quantile([0.25, 0.75])
            IQR = Q3 - Q1
            lower, upper = Q1 - 1.5 * IQR, Q3 + 1.5 * IQR
            outliers = self.df[(self.df[col] < lower) | (self.df[col] > upper)]
            results.append(f"{col}: {len(outliers)} outliers ({(len(outliers)/len(self.df))*100:.2f}%)")
            # Boxplot
            plt.figure(figsize=(6, 4))
            sns.boxplot(x=self.df[col])
            plt.title(f"Boxplot: {col}")
            plt.savefig(os.path.join(self.report_path, "outliers", f"box_{col}.png"))
            plt.close()

        # Multivariate outliers using Isolation Forest
        iso = IsolationForest(contamination=0.05)
        preds = iso.fit_predict(self.df[num_cols].fillna(0))
        outlier_count = np.sum(preds == -1)
        results.append(f"Multivariate Outliers (IsolationForest): {outlier_count}")

        self.save_text("outliers", "outlier_profile.txt", "\n".join(results))
        print("[Saved] Outlier profiles & multivariate outliers")

    # ----------------- 9. Distribution Shape -----------------
    def distribution_shape(self):
        num_cols = self.df.select_dtypes(include=np.number).columns
        results = []
        for col in num_cols:
            data = self.df[col].dropna()
            skew_val = skew(data)
            kurt_val = kurtosis(data)
            stat, p = shapiro(data.sample(min(5000, len(data))))
            results.append(f"{col}: Skew={skew_val:.2f}, Kurtosis={kurt_val:.2f}, Shapiro p={p:.4f}")
            # Histogram
            sns.histplot(data, kde=True)
            plt.title(f"Distribution: {col}")
            plt.savefig(os.path.join(self.report_path, "distribution_shape", f"dist_{col}.png"))
            plt.close()
        self.save_text("distribution_shape", "distribution_stats.txt", "\n".join(results))
        print("[Saved] Skewness, kurtosis & normality plots")

    # ----------------- 10. Dataset Integrity -----------------
    def integrity_checks(self):
        results = []
        # Constant features
        constant_cols = [col for col in self.df.columns if self.df[col].nunique() == 1]
        if constant_cols:
            results.append(f"Constant columns: {constant_cols}")

        # Highly correlated pairs
        num_df = self.df.select_dtypes(include=np.number)
        corr_matrix = num_df.corr().abs()
        high_corr = np.where((corr_matrix > 0.9) & (corr_matrix < 1))
        high_corr_pairs = [(num_df.columns[i], num_df.columns[j]) for i, j in zip(*high_corr)]
        if high_corr_pairs:
            results.append(f"Highly correlated pairs: {high_corr_pairs}")

        # Duplicate features
        duplicate_features = []
        for col1, col2 in itertools.combinations(self.df.columns, 2):
            if self.df[col1].equals(self.df[col2]):
                duplicate_features.append((col1, col2))
        if duplicate_features:
            results.append(f"Duplicate features: {duplicate_features}")

        self.save_text("relationships", "integrity_checks.txt", "\n".join(results) if results else "No issues found")
        print("[Saved] Integrity checks")

    # ----------------- Numerical & Categorical Plots -----------------
    def numerical_analysis(self):
        num_cols = self.df.select_dtypes(include=np.number).columns
        for col in num_cols:
            plt.figure(figsize=(8, 4))
            sns.histplot(self.df[col].dropna(), kde=True)
            plt.title(f"Distribution of {col}")
            plt.savefig(os.path.join(self.report_path, "numerical_analysis", f"hist_{col}.png"))
            plt.close()

            plt.figure(figsize=(6, 4))
            sns.boxplot(x=self.df[col].dropna())
            plt.title(f"Boxplot of {col}")
            plt.savefig(os.path.join(self.report_path, "numerical_analysis", f"box_{col}.png"))
            plt.close()
        print("[Saved] Numerical feature plots")

    def categorical_analysis(self):
        cat_cols = self.df.select_dtypes(exclude=np.number).columns
        for col in cat_cols:
            plt.figure(figsize=(8, 4))
            sns.countplot(y=self.df[col], order=self.df[col].value_counts().index)
            plt.title(f"Count Plot of {col}")
            plt.savefig(os.path.join(self.report_path, "categorical_analysis", f"count_{col}.png"))
            plt.close()
        print("[Saved] Categorical feature plots")
    
    def duplicate_rows_detail(self):
        duplicates = self.df[self.df.duplicated()]
        if not duplicates.empty:
            duplicates.to_csv(os.path.join(self.report_path, "relationships", "duplicate_rows.csv"), index=False)
            print(f"[Saved] Detailed duplicate rows ({len(duplicates)} rows)")
        else:
            print("[Info] No duplicate rows found")
    
    def data_imbalance_heatmap(self):
        if self.target_col and self.df[self.target_col].dtype == 'object':
            counts = self.df[self.target_col].value_counts(normalize=True) * 100
            plt.figure(figsize=(6, 4))
            sns.heatmap(counts.to_frame(), annot=True, fmt=".2f", cmap="coolwarm")
            plt.title(f"Class Imbalance Heatmap for {self.target_col}")
            plt.savefig(os.path.join(self.report_path, "target_analysis", "imbalance_heatmap.png"))
            plt.close()
            print("[Saved] Target imbalance heatmap")
        else:
            print("[Info] Target imbalance heatmap skipped (target not categorical)")

    def scaling_transformation_suggestions(self):
        num_cols = self.df.select_dtypes(include=np.number).columns
        suggestions = []

        for col in num_cols:
            data = self.df[col].dropna()
            if len(data) == 0: 
                continue

            # Skewness check
            skew_val = skew(data)
            suggestion = f"{col}: Skew={skew_val:.2f} -> "
            if abs(skew_val) > 1:
                suggestion += "Highly skewed; consider Log or Box-Cox transform"
            elif abs(skew_val) > 0.5:
                suggestion += "Moderately skewed; consider mild transform"
            else:
                suggestion += "Fairly symmetric"

            # Scaling suggestion
            if (data.max() - data.min()) > 1000:
                suggestion += " | Suggest MinMax Scaling (large range)"
            else:
                suggestion += " | Suggest Standard Scaling (normal range)"

            suggestions.append(suggestion)

        self.save_text("numerical_analysis", "scaling_transformation_suggestions.txt", "\n".join(suggestions))
        print("[Saved] Scaling and transformation suggestions")

    # ----------------- Run All -----------------
    def run_all(self, target_name=None):
        print("===== Starting EDA =====")
        self.set_target(target_name)
        self.advanced_missing_data()
        self.cardinality_analysis()
        self.numerical_analysis()
        self.categorical_analysis()
        self.numerical_relationships()
        self.categorical_relationships()
        self.feature_importance()
        self.target_advanced()
        self.advanced_correlations()
        self.outlier_profiling()
        self.distribution_shape()
        self.integrity_checks()
        self.duplicate_rows_detail()
        self.data_imbalance_heatmap()
        self.scaling_transformation_suggestions()

        summary = "EDA Completed.\nCheck folders for:\n"
        summary += "\n".join([
            "- missing_data", "- cardinality", "- numerical_analysis",
            "- categorical_analysis", "- relationships", "- outliers",
            "- distribution_shape", "- target_analysis", "- advanced_relationships"
        ])
        self.save_text("", "summary.txt", summary)
        print(f"\nEDA completed! Full report at '{self.report_path}'")

In [5]:
eda = EDA(df, report_path="Advanced_EDA_Results", outlier_method="zscore", threshold=3)
eda.run_all(target_name="label") 

===== Starting EDA =====
[Info] Target column set to: label
[Saved] Missing data summary, heatmap & correlation
[Saved] Cardinality summary with high-cardinality flags
[Saved] Numerical feature plots
[Saved] Categorical feature plots
[Saved] Pairplot for numerical features
[Saved] Chi-square and Cramér's V results for categorical features
[Saved] Mutual Information scores
[Saved] Target distribution & mean analysis per category
[Saved] Spearman, Kendall correlations & clustered heatmap
[Saved] Outlier profiles & multivariate outliers
[Saved] Skewness, kurtosis & normality plots
[Saved] Integrity checks
[Saved] Detailed duplicate rows (3033 rows)
[Info] Target imbalance heatmap skipped (target not categorical)
[Saved] Scaling and transformation suggestions

EDA completed! Full report at 'Advanced_EDA_Results'


In [6]:
def create_eda_collage(report_path="eda_results", output_file="eda_summary_collage.png", cols=3, image_size=(600, 400)):
    """
    Combine all .png images from EDA folders into one collage
    """
    # Find all images recursively
    image_paths = glob.glob(os.path.join(report_path, "**", "*.png"), recursive=True)
    
    if not image_paths:
        print("No images found to combine.")
        return

    # Resize all images to uniform size
    images = []
    for path in image_paths:
        img = Image.open(path).convert("RGB")
        img = img.resize(image_size)  # Resize for uniformity
        images.append(img)

    # Determine grid size
    total_images = len(images)
    rows = math.ceil(total_images / cols)
    width = cols * image_size[0]
    height = rows * image_size[1]

    # Create blank canvas
    collage = Image.new("RGB", (width, height), color=(255, 255, 255))

    # Paste images in grid
    for idx, img in enumerate(images):
        x = (idx % cols) * image_size[0]
        y = (idx // cols) * image_size[1]
        collage.paste(img, (x, y))

    # Save collage
    collage.save(os.path.join(report_path, output_file))
    print(f"[Saved] Collage created at: {os.path.join(report_path, output_file)}")


In [7]:
create_eda_collage(report_path="Advanced_EDA_Results", cols=4, image_size=(500, 350))


[Saved] Collage created at: Advanced_EDA_Results\eda_summary_collage.png


In [8]:
def create_text_summary(report_path="eda_results", output_file="eda_text_summary.txt"):
    """
    Combine all EDA text results into one summary file
    """
    text_paths = glob.glob(os.path.join(report_path, "**", "*.txt"), recursive=True)
    summary_content = []

    for path in sorted(text_paths):
        # Skip the final summary itself to avoid duplication
        if output_file in path:
            continue
        relative_path = os.path.relpath(path, report_path)
        summary_content.append(f"\n===== {relative_path} =====\n")
        with open(path, "r") as f:
            summary_content.append(f.read())

    with open(os.path.join(report_path, output_file), "w") as f:
        f.write("\n".join(summary_content))

    print(f"[Saved] Combined text summary at: {os.path.join(report_path, output_file)}")


In [9]:
create_text_summary(report_path="Advanced_EDA_Results", output_file="eda_text_summary.txt")

[Saved] Combined text summary at: Advanced_EDA_Results\eda_text_summary.txt


In [10]:
mlflow.set_tracking_uri("file:./mlruns")

In [31]:
class FeatureEngineering(BaseEstimator, TransformerMixin):
    """
    Custom feature engineering transformer.
    Controlled via steps_to_apply list:
    - 'feature_engineering': enable feature engineering
    - 'interaction': Category x PaymentMethod
    - 'ratio': paymentMethodAgeDays / accountAgeDays
    - 'binning': bins for accountAgeDays
    - 'time_feature': bins for localTime
    """
    def __init__(self, steps_to_apply=None):
        self.steps_to_apply = steps_to_apply or []

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        X = X.copy()

        # Skip entire feature engineering if 'feature_engineering' not in steps
        if 'feature_engineering' not in self.steps_to_apply:
            return X

        # Interaction Feature
        if 'interaction' in self.steps_to_apply:
            if 'Category' in X.columns and 'paymentMethod' in X.columns:
                X['Category_Payment'] = X['Category'] + '_' + X['paymentMethod']

        # Ratio Feature
        if 'ratio' in self.steps_to_apply:
            if 'paymentMethodAgeDays' in X.columns and 'accountAgeDays' in X.columns:
                X['payment_account_ratio'] = X['paymentMethodAgeDays'] / (X['accountAgeDays'] + 1)

        # Binning Feature
        if 'binning' in self.steps_to_apply:
            if 'accountAgeDays' in X.columns:
                X['account_age_bin'] = pd.cut(
                    X['accountAgeDays'],
                    bins=[0, 90, 730, 2000],
                    labels=['new', 'medium', 'old']
                )

        # Time-of-day Feature
        if 'time_feature' in self.steps_to_apply:
            if 'localTime' in X.columns:
                bins = [0.5, 1.5, 2.5, 3.5, 4.5, 5.5]
                labels = ['early_morning', 'morning', 'afternoon', 'evening', 'night']
                X['time_of_day'] = pd.cut(X['localTime'], bins=bins, labels=labels)

        return X

In [32]:
# ----------- Log Transformer -----------
class LogTransformer(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self

    def transform(self, X):
        return np.log1p(np.array(X, dtype=float))

    def get_feature_names_out(self, input_features=None):
        return input_features


# ----------- Preprocessing Class -----------
class Preprocessing(BaseEstimator, TransformerMixin):
    def __init__(self, categorical_features, skewed_features, symmetric_features,
                 steps_to_apply=None, random_state=42):
        """
        Preprocessing pipeline (impute, encoding, log, scaling).
        SMOTE/ADASYN will be applied separately after this step.
        """
        self.categorical_features = categorical_features
        self.skewed_features = skewed_features
        self.symmetric_features = symmetric_features
        self.steps_to_apply = steps_to_apply or []
        self.random_state = random_state

        self.preprocessor = None  # ColumnTransformer will be built dynamically

    def _build_pipeline(self):
        """Build column transformer dynamically based on steps_to_apply."""

        # ----- Categorical pipeline -----
        cat_steps = []
        if 'impute' in self.steps_to_apply or 'preprocessing' in self.steps_to_apply:
            cat_steps.append(('imputer', SimpleImputer(strategy='most_frequent')))
        if 'encoding' in self.steps_to_apply or 'preprocessing' in self.steps_to_apply:
            cat_steps.append(('encoder', OneHotEncoder(handle_unknown='ignore', drop='first')))
        cat_pipeline = Pipeline(cat_steps) if cat_steps else 'passthrough'

        # ----- Skewed numerical pipeline -----
        skewed_steps = []
        if 'impute' in self.steps_to_apply or 'preprocessing' in self.steps_to_apply:
            skewed_steps.append(('imputer', SimpleImputer(strategy='median')))
        if 'log_transform' in self.steps_to_apply or 'preprocessing' in self.steps_to_apply:
            skewed_steps.append(('log', LogTransformer()))
        if 'encoding' in self.steps_to_apply or 'preprocessing' in self.steps_to_apply:  # scaling numeric
            skewed_steps.append(('scaler', StandardScaler()))
        skewed_pipeline = Pipeline(skewed_steps) if skewed_steps else 'passthrough'

        # ----- Symmetric numerical pipeline -----
        sym_steps = []
        if 'impute' in self.steps_to_apply or 'preprocessing' in self.steps_to_apply:
            sym_steps.append(('imputer', SimpleImputer(strategy='median')))
        if 'encoding' in self.steps_to_apply or 'preprocessing' in self.steps_to_apply:
            sym_steps.append(('scaler', MinMaxScaler()))
        sym_pipeline = Pipeline(sym_steps) if sym_steps else 'passthrough'

        # ----- Combine all pipelines -----
        self.preprocessor = ColumnTransformer(
            transformers=[
                ('cat', cat_pipeline, self.categorical_features),
                ('skew', skewed_pipeline, self.skewed_features),
                ('sym', sym_pipeline, self.symmetric_features)
            ],
            remainder='drop'
        )

    def fit(self, X, y=None):
        self._build_pipeline()
        self.preprocessor.fit(X)
        return self

    def transform(self, X):
        """Transform WITHOUT resampling (pure preprocessing)."""
        return self.preprocessor.transform(X)

    def fit_transform(self, X, y=None):
        """Fit and transform WITHOUT resampling (SMOTE will be handled outside)."""
        self._build_pipeline()
        return self.preprocessor.fit_transform(X)

In [58]:
# ------------------------- FRAUD PIPELINE CLASS -------------------------
class FraudPipeline:
    # ----- Step mappings -----
    FEATURE_ENG_SUBSTEPS = ['interaction', 'ratio', 'binning', 'time_feature']
    PREPROCESS_SUBSTEPS = ['encoding', 'impute', 'log_transform', 'smote']
    
    def __init__(self, steps_to_apply=None, model=None, test_size=0.2,
                 random_state=42, resample_method="smote", experiment_name="FraudDetection"):
        self.steps_to_apply = self.expand_steps(steps_to_apply)
        self.model = model or RandomForestClassifier(class_weight='balanced', random_state=random_state)
        self.test_size = test_size
        self.random_state = random_state
        self.resample_method = resample_method
        self.experiment_name = experiment_name

        # Pipeline placeholder
        self.pipeline = None

        # Feature groups
        self.categorical = ['Category', 'paymentMethod', 'isWeekend']
        self.skewed = ['numItems', 'localTime', 'paymentMethodAgeDays']
        self.symmetric = ['accountAgeDays']
        self.target = 'label'

        # Initialize MLflow experiment
        mlflow.set_experiment(self.experiment_name)

    # ------------------------- STEP EXPANSION -------------------------
    def expand_steps(self, steps_to_apply):
        """Strict parent-substep logic with full-step auto expansion."""
        steps = set(steps_to_apply or [])

        # ---------- Feature Engineering ----------
        fe_substeps = steps.intersection(self.FEATURE_ENG_SUBSTEPS)

        if fe_substeps and 'feature_engineering' not in steps:
            raise ValueError(
                f"Feature engineering sub-steps {fe_substeps} provided without 'feature_engineering' parent step."
            )

        if 'feature_engineering' in steps:
            if fe_substeps:
                steps = steps  # Only chosen sub-steps
            else:
                steps.update(self.FEATURE_ENG_SUBSTEPS)  # Apply all sub-steps

        # ---------- Preprocessing ----------
        pre_substeps = steps.intersection(self.PREPROCESS_SUBSTEPS)

        if pre_substeps and 'preprocessing' not in steps:
            raise ValueError(
                f"Preprocessing sub-steps {pre_substeps} provided without 'preprocessing' parent step."
            )

        if 'preprocessing' in steps:
            if pre_substeps:
                steps = steps
            else:
                steps.update(self.PREPROCESS_SUBSTEPS)  # Apply all preprocessing steps

        return list(steps)

    # ------------------------- TRAIN -------------------------
    def train(self, df):
        """Fit full pipeline with no data leakage and log experiment metadata."""

        # --- Split ---
        X = df.drop(columns=[self.target])
        y = df[self.target]
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, stratify=y, test_size=self.test_size, random_state=self.random_state, shuffle=True
        )

        # # --- Remove duplicates from training set only ---
        # train_data = pd.concat([X_train, y_train], axis=1)
        # train_data = train_data.drop_duplicates()
        # X_train = train_data.drop(columns=[self.target])
        # y_train = train_data[self.target]
        # print(len(train_data[train_data.duplicated()]))

        # --- Feature Engineering Step ---
        feature_engineer = FeatureEngineering(steps_to_apply=self.steps_to_apply) \
            if any(s in self.steps_to_apply for s in self.FEATURE_ENG_SUBSTEPS) else 'passthrough'

        # --- Preprocessing Step ---
        preprocessor = Preprocessing(
            self.categorical, self.skewed, self.symmetric,
            steps_to_apply=self.steps_to_apply,
            random_state=self.random_state
        ) if any(s in self.steps_to_apply for s in self.PREPROCESS_SUBSTEPS) else 'passthrough'

        # --- Build pipeline (no resampling in pipeline) ---
        self.pipeline = ImbPipeline([
            ('feature_engineering', feature_engineer),
            ('preprocessing', preprocessor),
            ('model', self.model)
        ])

        # --- Transform Train/Test separately (no leakage) ---
        X_train_transformed = self.pipeline[:-1].fit_transform(X_train, y_train)
        X_test_transformed = self.pipeline[:-1].transform(X_test)

        # --- Apply SMOTE/ADASYN only on Train ---
        if 'smote' in self.steps_to_apply:
            if self.resample_method == "smote":
                sampler = SMOTE(random_state=self.random_state, sampling_strategy='minority', k_neighbors=5)
            elif self.resample_method == "adasyn":
                sampler = ADASYN(random_state=self.random_state, sampling_strategy='minority', n_neighbors=5)
            else:
                raise ValueError("resample_method must be 'smote' or 'adasyn'")

            X_train_transformed, y_train = sampler.fit_resample(X_train_transformed, y_train)

        # --- Train Model ---
        if 'model_training' in self.steps_to_apply:
            with mlflow.start_run(run_name=f"{type(self.model).__name__}_run"):
                # -------- Log Parameters --------
                mlflow.log_param("steps_to_apply", self.steps_to_apply)
                mlflow.log_param("resample_method", self.resample_method)
                mlflow.log_param("test_size", self.test_size)
                mlflow.log_param("model", type(self.model).__name__)
                mlflow.log_param("categorical_features", self.categorical)
                mlflow.log_param("skewed_features", self.skewed)
                mlflow.log_param("symmetric_features", self.symmetric)

                # -------- Train and Evaluate --------
                self.model.fit(X_train_transformed, y_train)
                y_train_pred = self.model.predict(X_train_transformed)
                y_test_pred = self.model.predict(X_test_transformed)

                # Log metrics
                train_metrics = self._calculate_metrics(y_train, y_train_pred, prefix="train")
                test_metrics = self._calculate_metrics(y_test, y_test_pred, prefix="test")
                self._log_metrics(train_metrics)
                self._log_metrics(test_metrics)

                # Confusion matrix artifact (PNG)
                cm = confusion_matrix(y_test, y_test_pred)
                fig, ax = plt.subplots(figsize=(6, 4))
                sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=ax)
                ax.set_title('Confusion Matrix')
                ax.set_xlabel('Predicted')
                ax.set_ylabel('Actual')

                cm_png_path = "artifacts/confusion_matrix.png"
                os.makedirs("artifacts", exist_ok=True)
                plt.savefig(cm_png_path, bbox_inches='tight')
                plt.close(fig)

                # Log confusion matrix image to MLflow
                mlflow.log_artifact(cm_png_path, "confusion_matrix")

                # Log data distribution
                class_dist = y.value_counts(normalize=True).to_dict()
                mlflow.log_param("class_distribution", json.dumps(class_dist))

                # Log signature
                signature = infer_signature(X_train_transformed, self.model.predict(X_train_transformed))
                mlflow.sklearn.log_model(self.pipeline, name="fraud_pipeline", signature=signature)

                # Save pipeline
                joblib.dump(self.pipeline, "artifacts/fraud_pipeline.pkl")
                mlflow.log_artifacts("artifacts")

        return self.pipeline, X_train_transformed, y_train, X_test_transformed, y_test

    # ------------------------- PREPROCESS -------------------------
    def preprocess(self, df):
        if not self.pipeline:
            raise ValueError("Pipeline not trained. Train first or load fitted pipeline.")
        return self.pipeline[:-1].transform(df)

    # ------------------------- PREDICT -------------------------
    def predict(self, df):
        if not self.pipeline:
            raise ValueError("Pipeline not trained. Train first or load fitted pipeline.")
        return self.pipeline.predict(df)


    # ------------------------- METRICS UTILS -------------------------
    def _calculate_metrics(self, y_true, y_pred, prefix=""):
        return {
            f"{prefix}_accuracy": accuracy_score(y_true, y_pred),
            f"{prefix}_precision": precision_score(y_true, y_pred),
            f"{prefix}_recall": recall_score(y_true, y_pred),
            f"{prefix}_f1": f1_score(y_true, y_pred)
        }

    def _log_metrics(self, metrics):
        for k, v in metrics.items():
            mlflow.log_metric(k, v)

In [61]:
def evaluate_model(y_true, y_pred, dataset_name="Evaluation"):
    """
    Prints and returns classification metrics.
    
    Parameters
    ----------
    y_true : array-like
        True labels.
    y_pred : array-like
        Predicted labels.
    dataset_name : str
        Name for the dataset (used in print titles and keys).
    """
    # Print full classification report
    print(f"\n--- {dataset_name} Classification Report ---")
    print(classification_report(y_true, y_pred))

    # Compute and print summary metrics
    accuracy = accuracy_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred)
    recall = recall_score(y_true, y_pred)
    f1 = f1_score(y_true, y_pred)

    print(f"{dataset_name} Accuracy:  {accuracy:.4f}")
    print(f"{dataset_name} Precision: {precision:.4f}")
    print(f"{dataset_name} Recall:    {recall:.4f}")
    print(f"{dataset_name} F1 Score:   {f1:.4f}")

    # Return metrics as a dictionary
    return {
        f"{dataset_name}_accuracy": accuracy,
        f"{dataset_name}_precision": precision,
        f"{dataset_name}_recall": recall,
        f"{dataset_name}_f1": f1,
    }


# ----------------------- Step 1: Create Hold-out Set -----------------------
df_clean = df.drop_duplicates(keep='first')

n1, n2 = 99, 1
holdout_class_0 = df_clean[df_clean['label'] == 0].sample(n1, random_state=42)
holdout_class_1 = df_clean[df_clean['label'] == 1].sample(n2, random_state=42)
holdout_df = pd.concat([holdout_class_0, holdout_class_1])

train_df = df_clean.drop(holdout_df.index)

X_holdout = holdout_df.drop(columns=['label'])
y_holdout = holdout_df['label']

holdout_class_0_B = train_df[train_df['label'] == 0].sample(n1, random_state=42)
holdout_class_1_B = train_df[train_df['label'] == 1].sample(n2, random_state=42)
holdout_df_B = pd.concat([holdout_class_0_B, holdout_class_1_B])

train_df = train_df.drop(holdout_df_B.index)

X_holdout_B = holdout_df_B.drop(columns=['label'])
y_holdout_B = holdout_df_B['label']

print(f"Training data size: {train_df.shape}")
print(f"Hold-out data size: {holdout_df.shape}")
print(f"Hold-out data size: {holdout_df_B.shape}")

# ----------------------- Step 2: Initialize FraudPipeline -----------------------
fp = FraudPipeline(
    steps_to_apply=[
        'feature_engineering', 
        # 'interaction', 
        # 'ratio', 
        # 'binning', 
        # 'time_feature',
        
        'preprocessing', 
        # 'encoding', 
        # 'impute', 
        # 'log_transform', 
        # 'smote', 
        
        'model_training',
    ],
    # resample_method='smote',
    # model=xgb.XGBClassifier(),
    model=LogisticRegression(),
)

# ----------------------- Step 3: Train Pipeline -----------------------
pipeline, X_train, y_train, X_test, y_test = fp.train(train_df)

# ----------------------- Step 4: Evaluate on Internal Test Split -----------------------
test_predictions = fp.model.predict(X_test)
test_metrics = evaluate_model(y_test, test_predictions, dataset_name="Internal Test Split")

# ----------------------- Step 5: Evaluate on Hold-out Set -----------------------
holdout_predictions = fp.predict(X_holdout)
holdout_metrics = evaluate_model(y_holdout, holdout_predictions, dataset_name="Hold-out Set")
holdout_predictions_B = fp.predict(X_holdout_B)
holdout_metrics_B = evaluate_model(y_holdout_B, holdout_predictions_B, dataset_name="Hold-out Set B")

# ----------------------- Step 6: Save Pipeline -----------------------
joblib.dump(fp.pipeline, "fraud_pipeline_deployed.pkl")
print("\nPipeline saved as 'fraud_pipeline_deployed.pkl'")

# ----------------------- Step 7: Load & Predict (Example) -----------------------
loaded_pipeline = joblib.load("fraud_pipeline_deployed.pkl")
sample_preds = loaded_pipeline.predict(X_holdout)
sample_preds_B = loaded_pipeline.predict(X_holdout_B)

display(pd.DataFrame({'Predictions': sample_preds, 'True_Value': y_holdout}).head(100))
display(pd.DataFrame({'Predictions': sample_preds_B, 'True_Value': y_holdout_B}).head(100))

Training data size: (35988, 8)
Hold-out data size: (100, 8)
Hold-out data size: (100, 8)

--- Internal Test Split Classification Report ---
              precision    recall  f1-score   support

           0       1.00      0.92      0.96      7119
           1       0.13      1.00      0.23        79

    accuracy                           0.92      7198
   macro avg       0.56      0.96      0.59      7198
weighted avg       0.99      0.92      0.95      7198

Internal Test Split Accuracy:  0.9246
Internal Test Split Precision: 0.1270
Internal Test Split Recall:    1.0000
Internal Test Split F1 Score:   0.2254

--- Hold-out Set Classification Report ---
              precision    recall  f1-score   support

           0       1.00      0.94      0.97        99
           1       0.14      1.00      0.25         1

    accuracy                           0.94       100
   macro avg       0.57      0.97      0.61       100
weighted avg       0.99      0.94      0.96       100

Hold-out 

Unnamed: 0,Predictions,True_Value
35999,0,0
18192,0,0
5589,0,0
18168,0,0
24727,0,0
...,...,...
25217,0,0
8568,0,0
7560,0,0
28968,0,0


Unnamed: 0,Predictions,True_Value
6153,0,0
660,0,0
26114,0,0
35825,0,0
13314,0,0
...,...,...
19651,0,0
13654,0,0
9782,0,0
13045,0,0


In [60]:
# Load the serialized pipeline
loaded_pipeline = joblib.load("artifacts/fraud_pipeline.pkl")
# loaded_pipeline = joblib.load("fraud_pipeline_deployed.pkl")

# Use it directly to predict
sample_data = df.drop(['label'], axis=1)   # Or new unseen data
predictions = loaded_pipeline.predict(sample_data)

print(predictions)
loaded_pipeline

[0 0 0 ... 0 0 0]


In [36]:
# --- Split your dataset ---
X = df.drop(columns=['label'])
y = df['label']

# Instantiate FeatureEngineering
fe = FeatureEngineering(steps_to_apply=['feature_engineering', 'interaction', 'ratio', 'binning', 'time_feature'])

# BEFORE: Inspect original columns
print("Columns BEFORE Feature Engineering:")
print(list(X.columns))

# Apply Feature Engineering
X_transformed = fe.fit_transform(X)

# AFTER: Inspect new columns
print("\nColumns AFTER Feature Engineering:")
print(list(X_transformed.columns))

# Compare column difference
added_columns = set(X_transformed.columns) - set(X.columns)
print("\nNewly added columns:", added_columns)

# Show head of transformed data
print("\nSample transformed data:")
display(X_transformed.head())

Columns BEFORE Feature Engineering:
['accountAgeDays', 'numItems', 'localTime', 'paymentMethod', 'paymentMethodAgeDays', 'Category', 'isWeekend']

Columns AFTER Feature Engineering:
['accountAgeDays', 'numItems', 'localTime', 'paymentMethod', 'paymentMethodAgeDays', 'Category', 'isWeekend', 'Category_Payment', 'payment_account_ratio', 'account_age_bin', 'time_of_day']

Newly added columns: {'time_of_day', 'Category_Payment', 'account_age_bin', 'payment_account_ratio'}

Sample transformed data:


Unnamed: 0,accountAgeDays,numItems,localTime,paymentMethod,paymentMethodAgeDays,Category,isWeekend,Category_Payment,payment_account_ratio,account_age_bin,time_of_day
0,29,1,4.745402,paypal,28.204861,shopping,0.0,shopping_paypal,0.940162,new,night
1,725,1,4.742303,storecredit,0.0,electronics,0.0,electronics_storecredit,0.0,medium,night
2,845,1,4.921318,creditcard,0.0,food,1.0,food_creditcard,0.0,old,night
3,503,1,4.886641,creditcard,0.0,electronics,1.0,electronics_creditcard,0.0,medium,night
4,2000,1,5.040929,creditcard,0.0,shopping,0.0,shopping_creditcard,0.0,old,night


In [37]:
def get_feature_names_from_column_transformer(column_transformer):
    output_features = []
    for name, pipe, features in column_transformer.transformers_:
        if name != 'remainder':
            if hasattr(pipe, 'named_steps'):
                last_step = list(pipe.named_steps.values())[-1]
                if hasattr(last_step, 'get_feature_names_out'):
                    feature_names = last_step.get_feature_names_out(features)
                else:
                    feature_names = features
            else:
                feature_names = features
            output_features.extend(feature_names)
        else:
            output_features.extend(features)  # passthrough features
    return output_features

# Assume you have X (original features) and Preprocessing object
pre = Preprocessing(
    categorical_features=['Category', 'paymentMethod', 'isWeekend'],
    skewed_features=['numItems', 'localTime', 'paymentMethodAgeDays'],
    symmetric_features=['accountAgeDays'],
    steps_to_apply=['preprocessing']  # full preprocessing
)

# BEFORE: original data
print("Original Columns:", X.columns.tolist())
display(X.head())

# Fit + transform
X_transformed = pre.fit_transform(X)

# Get transformed column names
transformed_cols = get_feature_names_from_column_transformer(pre.preprocessor)

# AFTER: transformed data
print("\nTransformed Shape:", X_transformed.shape)
print("Transformed Columns (after encoding, scaling, log):")
print(transformed_cols)

# Convert transformed array to DataFrame for easy inspection
X_transformed_df = pd.DataFrame(X_transformed, columns=transformed_cols)

# Show first few rows
display(X_transformed_df.head())


Original Columns: ['accountAgeDays', 'numItems', 'localTime', 'paymentMethod', 'paymentMethodAgeDays', 'Category', 'isWeekend']


Unnamed: 0,accountAgeDays,numItems,localTime,paymentMethod,paymentMethodAgeDays,Category,isWeekend
0,29,1,4.745402,paypal,28.204861,shopping,0.0
1,725,1,4.742303,storecredit,0.0,electronics,0.0
2,845,1,4.921318,creditcard,0.0,food,1.0
3,503,1,4.886641,creditcard,0.0,electronics,1.0
4,2000,1,5.040929,creditcard,0.0,shopping,0.0



Transformed Shape: (39221, 9)
Transformed Columns (after encoding, scaling, log):
['Category_food', 'Category_shopping', 'paymentMethod_paypal', 'paymentMethod_storecredit', 'isWeekend_1.0', 'numItems', 'localTime', 'paymentMethodAgeDays', 'accountAgeDays']


Unnamed: 0,Category_food,Category_shopping,paymentMethod_paypal,paymentMethod_storecredit,isWeekend_1.0,numItems,localTime,paymentMethodAgeDays,accountAgeDays
0,0.0,1.0,1.0,0.0,0.0,-0.189142,0.028772,0.557839,0.014007
1,0.0,0.0,0.0,1.0,0.0,-0.189142,0.021742,-0.775564,0.362181
2,1.0,0.0,0.0,0.0,1.0,-0.189142,0.421745,-0.775564,0.422211
3,0.0,0.0,0.0,0.0,1.0,-0.189142,0.345213,-0.775564,0.251126
4,0.0,1.0,0.0,0.0,0.0,-0.189142,0.682328,-0.775564,1.0


In [40]:
def get_preprocessed_dataframe(preprocessor_obj, X):
    """
    Returns preprocessed dataframe with column names after transformation.
    """
    # Fit and transform
    X_transformed = preprocessor_obj.fit_transform(X)
    
    # Extract feature names
    feature_names = get_feature_names_from_column_transformer(preprocessor_obj.preprocessor)
    
    # Convert to DataFrame
    preprocessed_df = pd.DataFrame(X_transformed, columns=feature_names, index=X.index)
    
    return preprocessed_df


# ---------- Usage ----------
# Initialize Preprocessing object
pre = Preprocessing(
    categorical_features=['Category', 'paymentMethod', 'isWeekend'],
    skewed_features=['numItems', 'localTime', 'paymentMethodAgeDays'],
    symmetric_features=['accountAgeDays'],
    steps_to_apply=['preprocessing']  # includes impute, encoding, scaling, log
)

# Get preprocessed DataFrame
preprocessed_df = get_preprocessed_dataframe(pre, X)

# Show
print("Preprocessed columns:")
print(preprocessed_df.columns.tolist())
display(preprocessed_df.head())

Preprocessed columns:
['Category_food', 'Category_shopping', 'paymentMethod_paypal', 'paymentMethod_storecredit', 'isWeekend_1.0', 'numItems', 'localTime', 'paymentMethodAgeDays', 'accountAgeDays']


Unnamed: 0,Category_food,Category_shopping,paymentMethod_paypal,paymentMethod_storecredit,isWeekend_1.0,numItems,localTime,paymentMethodAgeDays,accountAgeDays
0,0.0,1.0,1.0,0.0,0.0,-0.189142,0.028772,0.557839,0.014007
1,0.0,0.0,0.0,1.0,0.0,-0.189142,0.021742,-0.775564,0.362181
2,1.0,0.0,0.0,0.0,1.0,-0.189142,0.421745,-0.775564,0.422211
3,0.0,0.0,0.0,0.0,1.0,-0.189142,0.345213,-0.775564,0.251126
4,0.0,1.0,0.0,0.0,0.0,-0.189142,0.682328,-0.775564,1.0


In [42]:
# Initialize FeatureEngineering with chosen steps
fe = FeatureEngineering(
    steps_to_apply=['feature_engineering', 'interaction', 'ratio', 'binning', 'time_feature']
)

# Apply feature engineering
X_fe = fe.fit_transform(X)

print("Feature Engineered Columns:")
print(X_fe.columns.tolist())
display(X_fe.head())

# Initialize Preprocessing with chosen steps
pre = Preprocessing(
    categorical_features=['Category', 'paymentMethod', 'isWeekend', 'Category_Payment', 'account_age_bin', 'time_of_day'],
    skewed_features=['numItems', 'localTime', 'paymentMethodAgeDays'],
    symmetric_features=['accountAgeDays', 'payment_account_ratio'],
    steps_to_apply=['preprocessing']  # includes impute, encoding, log, scaling
)

# Fit & transform feature engineered data
X_preprocessed = pre.fit_transform(X_fe)

# Extract transformed feature names
feature_names = get_feature_names_from_column_transformer(pre.preprocessor)

# Convert to DataFrame
X_preprocessed_df = pd.DataFrame(X_preprocessed, columns=feature_names, index=X_fe.index)

print("\nPreprocessed Columns:")
print(X_preprocessed_df.columns.tolist())
display(X_preprocessed_df.head())

Feature Engineered Columns:
['accountAgeDays', 'numItems', 'localTime', 'paymentMethod', 'paymentMethodAgeDays', 'Category', 'isWeekend', 'Category_Payment', 'payment_account_ratio', 'account_age_bin', 'time_of_day']


Unnamed: 0,accountAgeDays,numItems,localTime,paymentMethod,paymentMethodAgeDays,Category,isWeekend,Category_Payment,payment_account_ratio,account_age_bin,time_of_day
0,29,1,4.745402,paypal,28.204861,shopping,0.0,shopping_paypal,0.940162,new,night
1,725,1,4.742303,storecredit,0.0,electronics,0.0,electronics_storecredit,0.0,medium,night
2,845,1,4.921318,creditcard,0.0,food,1.0,food_creditcard,0.0,old,night
3,503,1,4.886641,creditcard,0.0,electronics,1.0,electronics_creditcard,0.0,medium,night
4,2000,1,5.040929,creditcard,0.0,shopping,0.0,shopping_creditcard,0.0,old,night



Preprocessed Columns:
['Category_food', 'Category_shopping', 'paymentMethod_paypal', 'paymentMethod_storecredit', 'isWeekend_1.0', 'Category_Payment_electronics_paypal', 'Category_Payment_electronics_storecredit', 'Category_Payment_food_creditcard', 'Category_Payment_food_paypal', 'Category_Payment_food_storecredit', 'Category_Payment_shopping_creditcard', 'Category_Payment_shopping_paypal', 'Category_Payment_shopping_storecredit', 'account_age_bin_new', 'account_age_bin_old', 'time_of_day_evening', 'time_of_day_night', 'numItems', 'localTime', 'paymentMethodAgeDays', 'accountAgeDays', 'payment_account_ratio']


Unnamed: 0,Category_food,Category_shopping,paymentMethod_paypal,paymentMethod_storecredit,isWeekend_1.0,Category_Payment_electronics_paypal,Category_Payment_electronics_storecredit,Category_Payment_food_creditcard,Category_Payment_food_paypal,Category_Payment_food_storecredit,...,Category_Payment_shopping_storecredit,account_age_bin_new,account_age_bin_old,time_of_day_evening,time_of_day_night,numItems,localTime,paymentMethodAgeDays,accountAgeDays,payment_account_ratio
0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,1.0,-0.189142,0.028772,0.557839,0.014007,0.940649
1,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,-0.189142,0.021742,-0.775564,0.362181,0.0
2,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,1.0,0.0,1.0,-0.189142,0.421745,-0.775564,0.422211,0.0
3,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,-0.189142,0.345213,-0.775564,0.251126,0.0
4,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,1.0,-0.189142,0.682328,-0.775564,1.0,0.0
