In [20]:
# ============================
# ONE-CLASS SVM (TRAIN + FEATURE IMPORTANCE VIA PERMUTATION + PENDING PREDICT)
# Uses SAME:
# - preprocess_p2p()
# - feature_columns.joblib
# - invoice_amt_bins.joblib
# ============================

import os

import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.inspection import permutation_importance
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.svm import OneClassSVM


# ==========================================================
# PATHS (single source of truth)
# ==========================================================
# Directory holding the shared inputs/artifacts. Defaults to the original
# location; set the P2P_DATA_DIR environment variable to run the notebook
# against a different folder without editing any path below.
DATA_DIR = os.environ.get("P2P_DATA_DIR", "D:/EY_Internship")

# Raw dataset (labelled TRUE/FALSE rows plus pending rows)
DATA_PATH = f"{DATA_DIR}/p2p07_synthetic_10k_imbalance_with_pending.csv"

# Artifacts loaded (not created) by this notebook
FEATURE_COLUMNS_PATH = f"{DATA_DIR}/feature_columns.joblib"
INVOICE_BINS_PATH = f"{DATA_DIR}/invoice_amt_bins.joblib"

# Saved model bundle (scaler + ocsvm + threshold), written to the working directory
OCSVM_BUNDLE_PATH = "ocsvm_bundle.joblib"

# Permutation importance output
OCSVM_FEATURE_IMPORTANCE_PATH = "ocsvm_feature_importances_perm.csv"

# Pending I/O
PENDING_INPUT_PATH = f"{DATA_DIR}/pending_for_review_100_rows_clean.csv"
PENDING_OUTPUT_PATH = "pending_with_predictions_ocsvm.csv"




In [21]:
def preprocess_p2p(
    df: pd.DataFrame,
    *,
    feature_columns: list[str] | None = None,
    invoice_amt_bins: np.ndarray | None = None,
    is_training: bool = False
):
    df = df.copy()

    df["Invoice_Amount"] = pd.to_numeric(df.get("Invoice_Amount"), errors="coerce")
    df = df.dropna(subset=["Invoice_Amount"])

    if is_training:
        _, bins = pd.qcut(df["Invoice_Amount"], q=5, retbins=True, duplicates="drop")
        invoice_amt_bins = bins

    if invoice_amt_bins is not None:
        labels = ["Very_Low", "Low", "Medium", "High", "Very_High"]
        effective_k = len(invoice_amt_bins) - 1
        effective_labels = labels[:effective_k]

        df["Invoice_Amt_Bucket"] = pd.cut(
            df["Invoice_Amount"],
            bins=invoice_amt_bins,
            labels=effective_labels,
            include_lowest=True
        )
        df["Invoice_Amt_Bucket_Code"] = df["Invoice_Amt_Bucket"].cat.codes.replace(-1, np.nan)
    else:
        df["Invoice_Amt_Bucket"] = pd.qcut(
            df["Invoice_Amount"], q=5,
            labels=["Very_Low", "Low", "Medium", "High", "Very_High"],
            duplicates="drop"
        )
        df["Invoice_Amt_Bucket_Code"] = pd.qcut(
            df["Invoice_Amount"], q=5, labels=False, duplicates="drop"
        )

    df = pd.get_dummies(df, columns=["Invoice_Amt_Bucket"], drop_first=True)

    df["Invoice_Date"] = pd.to_datetime(df.get("Invoice_Date"), errors="coerce", dayfirst=True)
    df["Posting_Date"] = pd.to_datetime(df.get("Posting_Date"), errors="coerce", dayfirst=True)

    dup_lookup = df[["Doc_id", "Invoice_Amount", "Invoice_Date"]].rename(
        columns={
            "Doc_id": "Duplicate_doc_id",
            "Invoice_Amount": "Dup_Invoice_Amount",
            "Invoice_Date": "Dup_Invoice_Date",
        }
    )
    df = df.merge(dup_lookup, on="Duplicate_doc_id", how="left")

    df["Invoice_Amt_Diff"] = (df["Invoice_Amount"] - df["Dup_Invoice_Amount"]).abs()
    df["Invoice_Date_Diff_Days"] = (df["Invoice_Date"] - df["Dup_Invoice_Date"]).dt.days.abs()
    df["Invoice_Amt_Diff_Ratio"] = df["Invoice_Amt_Diff"] / (df["Invoice_Amount"] + 1)

    def date_diff_bucket(x):
        if pd.isna(x):
            return np.nan
        if x == 0:
            return "Same_Day"
        elif x <= 2:
            return "1_2_Days"
        else:
            return "Above_2_Days"

    df["Invoice_Date_Diff_Bucket"] = df["Invoice_Date_Diff_Days"].apply(date_diff_bucket)

    DROP_COLS = [
        "Doc_id", "Invoice_ID", "Vendor_Code", "Vendor_Name",
        "Vendor_GSTIN", "Invoice_Number", "Currency", "comments",
        "Duplicate_doc_id", "Exception_Type",
        "Invoice_Amt_Bucket", "Invoice_Amount", "Dup_Invoice_Amount",
    ]
    df_model = df.drop(columns=DROP_COLS, errors="ignore")

    categorical_cols = [
        "Invoice_Type",
        "Invoice_Source",
        "Business_Unit",
        "Exception_Number",
        "Invoice_Date_Diff_Bucket",
    ]
    df_model = pd.get_dummies(df_model, columns=categorical_cols, drop_first=True)

    if is_training:
        feature_columns = df_model.columns.tolist()
    else:
        if feature_columns is None:
            raise ValueError("feature_columns is required for inference.")
        for c in feature_columns:
            if c not in df_model.columns:
                df_model[c] = 0
        extra = [c for c in df_model.columns if c not in feature_columns]
        if extra:
            df_model = df_model.drop(columns=extra)
        df_model = df_model[feature_columns]

    return df_model, feature_columns, invoice_amt_bins

In [22]:
# ==========================================================
# Load the raw dataset and the previously saved artifacts
# ==========================================================
# NOTE: preprocess_p2p() must already be defined in this notebook/script;
# it is reused here as-is and must not be redefined.

# Bin edges saved during BRF training - reuse the SAME ones here
invoice_amt_bins = joblib.load(INVOICE_BINS_PATH)

# Column schema the model matrix is aligned to
feature_columns = joblib.load(FEATURE_COLUMNS_PATH)

df = pd.read_csv(DATA_PATH)

# ==========================================================
# Inference-mode preprocessing of the full dataset (aligned to the schema)
# ==========================================================
df_model, _, _ = preprocess_p2p(
    df, feature_columns=feature_columns, invoice_amt_bins=invoice_amt_bins, is_training=False
)



In [23]:
# ==========================================================
# Build labeled train set (TRUE/FALSE only) + y mapping
# ==========================================================
mask_tf = df["user_action_status"].isin(["TRUE", "FALSE"])

# preprocess_p2p() drops rows whose Invoice_Amount is not numeric, and its row
# index is not guaranteed to match df's. Re-key the feature rows to the raw rows
# that survived, so features and labels are matched row for row.
kept_index = df.index[pd.to_numeric(df["Invoice_Amount"], errors="coerce").notna()]
if len(df_model) != len(kept_index):
    raise ValueError(
        f"preprocess_p2p returned {len(df_model)} rows but {len(kept_index)} raw rows "
        "have a numeric Invoice_Amount; features cannot be aligned with labels."
    )
features_aligned = df_model.copy()
features_aligned.index = kept_index

X_all = features_aligned.loc[mask_tf.loc[kept_index], feature_columns].copy()

# Labels are taken from exactly the rows that made it into X_all.
y_all = df.loc[X_all.index, "user_action_status"].map({"FALSE": 0, "TRUE": 1}).astype(int)

# Train/test split for evaluation (stratified so both parts keep the class ratio)
X_train, X_test, y_train, y_test = train_test_split(
    X_all, y_all, test_size=0.2, random_state=42, stratify=y_all
)


In [24]:

# ==========================================================
# One-Class SVM Training
# The model is fitted ONLY on rows of the "normal" class (inliers).
# Assumption used here: TRUE (1) = normal / clean.
# If FALSE is the normal class in your case, select y_train == 0 instead.
# ==========================================================
X_inliers = X_train.loc[y_train.eq(1)].copy()

# Scaling statistics come from the inliers only; the test set is merely transformed.
scaler = StandardScaler().fit(X_inliers)
X_inliers_sc = scaler.transform(X_inliers)
X_test_sc = scaler.transform(X_test)

# nu=0.05: expected outlier fraction among the inliers (tune)
ocsvm = OneClassSVM(kernel="rbf", nu=0.05, gamma="scale")
ocsvm.fit(X_inliers_sc)



In [25]:
# ==========================================================
# Evaluation on the held-out TRUE/FALSE rows
# OneClassSVM.predict returns +1 for inliers and -1 for outliers.
# Mapping used below:
#   inlier -> 1 (TRUE), outlier -> 0 (FALSE)
# decision_function: higher = more "normal" (more TRUE-like)
# ==========================================================
score = ocsvm.decision_function(X_test_sc)  # higher = more normal

raw_pred = ocsvm.predict(X_test_sc)
pred = np.where(raw_pred == 1, 1, 0)

print("\n=== One-Class SVM (trained on TRUE as inliers) ===")
print("Confusion Matrix:\n", confusion_matrix(y_test, pred))
print(classification_report(y_test, pred, digits=4))
print("ROC-AUC (TRUE as positive):", roc_auc_score(y_test, score))




=== One-Class SVM (trained on TRUE as inliers) ===
Confusion Matrix:
 [[ 426 1014]
 [  26  134]]
              precision    recall  f1-score   support

           0     0.9425    0.2958    0.4503      1440
           1     0.1167    0.8375    0.2049       160

    accuracy                         0.3500      1600
   macro avg     0.5296    0.5667    0.3276      1600
weighted avg     0.8599    0.3500    0.4258      1600

ROC-AUC (TRUE as positive): 0.5222092013888889


In [26]:
# ==========================================================
# Choose a decision threshold for pending scoring
# We'll convert decision_function -> probability-like (0..1) using sigmoid,
# then threshold at 0.6 (like your BRF example).
# This is NOT a calibrated probability, but useful for ranking.
# ==========================================================
def sigmoid(z):
    """Logistic function 1 / (1 + exp(-z)), evaluated without overflow.

    Uses the identity sigmoid(z) = 0.5 * (1 + tanh(z / 2)). tanh saturates at
    +/-1, so large-magnitude inputs return exactly 0 or 1 instead of
    overflowing in exp() and emitting a RuntimeWarning.

    Accepts a scalar or anything supporting ``0.5 * z`` (e.g. a NumPy array)
    and returns values in [0, 1] of the same shape.
    """
    return 0.5 * (1.0 + np.tanh(0.5 * z))

# Squash the held-out decision scores into a probability-like value.
# sigmoid is monotonic, so this re-expresses the ranking; it is not calibrated.
prob_like = sigmoid(score)

# Cut-off on the squashed score - default value, tune this
THRESH = 0.6

# 1 = at or above the cut-off, 0 = below it
pred_thresh = np.where(prob_like >= THRESH, 1, 0)

print("\n=== One-Class SVM (sigmoid(score) thresholded) ===")
print("Confusion Matrix:\n", confusion_matrix(y_test, pred_thresh))
print(classification_report(y_test, pred_thresh, digits=4))
print("ROC-AUC (using prob_like):", roc_auc_score(y_test, prob_like))

# Persist the fitted scaler, the fitted model and the cut-off together
joblib.dump({"scaler": scaler, "ocsvm": ocsvm, "threshold": THRESH}, OCSVM_BUNDLE_PATH)
print("\nSaved:", OCSVM_BUNDLE_PATH)




=== One-Class SVM (sigmoid(score) thresholded) ===
Confusion Matrix:
 [[1230  210]
 [ 142   18]]
              precision    recall  f1-score   support

           0     0.8965    0.8542    0.8748      1440
           1     0.0789    0.1125    0.0928       160

    accuracy                         0.7800      1600
   macro avg     0.4877    0.4833    0.4838      1600
weighted avg     0.8147    0.7800    0.7966      1600

ROC-AUC (using prob_like): 0.5222092013888889

Saved: ocsvm_bundle.joblib


In [27]:

# ==========================================================
# PENDING PREDICTION (separate CSV)
# ==========================================================
df_new = pd.read_csv(PENDING_INPUT_PATH)

X_new, _, _ = preprocess_p2p(
    df_new,
    feature_columns=feature_columns,
    invoice_amt_bins=invoice_amt_bins,
    is_training=False
)

# preprocess_p2p() drops rows whose Invoice_Amount is not numeric, so X_new can
# be shorter than df_new. Identify the raw rows that were actually scored so the
# outputs are written back to the right rows (unscored rows are left empty).
scored_index = df_new.index[
    pd.to_numeric(df_new["Invoice_Amount"], errors="coerce").notna()
]
if len(X_new) != len(scored_index):
    raise ValueError(
        f"preprocess_p2p returned {len(X_new)} rows but {len(scored_index)} pending rows "
        "have a numeric Invoice_Amount; predictions cannot be matched to input rows."
    )

X_new_sc = scaler.transform(X_new)
score_new = ocsvm.decision_function(X_new_sc)         # higher = more normal
prob_new = sigmoid(score_new)                         # probability-like
pred_new = (prob_new >= THRESH).astype(int)

# Assigning index-keyed Series lets pandas align on row labels instead of
# requiring the arrays to be as long as df_new.
df_new["model_score"] = pd.Series(score_new, index=scored_index)
df_new["model_probability"] = pd.Series(prob_new, index=scored_index)
df_new["model_prediction"] = pd.Series(pred_new, index=scored_index)

# NOTE(review): prediction 1 means "at/above threshold", i.e. more inlier/TRUE-like,
# and the training cell assumes TRUE = normal/clean - yet 1 is labelled HIGH_RISK
# here. Confirm the intended polarity before relying on model_label.
df_new["model_label"] = df_new["model_prediction"].map({0: "LOW_RISK", 1: "HIGH_RISK"})

df_new.to_csv(PENDING_OUTPUT_PATH, index=False)
print("\nSaved pending predictions →", PENDING_OUTPUT_PATH)



Saved pending predictions → pending_with_predictions_ocsvm.csv
