In [1]:
import numpy as np
import pandas as pd

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, RobustScaler, PowerTransformer , LabelEncoder
from sklearn.decomposition import PCA
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.linear_model import LogisticRegression


In [2]:
df = pd.read_csv('data/train_merged.csv')

In [3]:
# =========================
# Fix mixed types in categorical columns
# =========================
def fix_mixed_types(df):
    """Convert mixed type columns to consistent string type"""
    df = df.copy()
    
    # Card columns
    card_cols = [c for c in df.columns if c.startswith('card')]
    for col in card_cols:
        df[col] = df[col].astype(str)
    
    # Address columns
    df['addr1'] = df['addr1'].astype(str)
    df['addr2'] = df['addr2'].astype(str)
    
    # Email domains
    df['P_emaildomain'] = df['P_emaildomain'].astype(str)
    df['R_emaildomain'] = df['R_emaildomain'].astype(str)
    
    # Matching features
    matching_cols = ['M1', 'M2', 'M3', 'M4', 'M5', 'M6', 'M7', 'M8', 'M9']
    for col in matching_cols:
        if col in df.columns:
            df[col] = df[col].astype(str)
    
    # Device info
    if 'DeviceType' in df.columns:
        df['DeviceType'] = df['DeviceType'].astype(str)
    if 'DeviceInfo' in df.columns:
        df['DeviceInfo'] = df['DeviceInfo'].astype(str)
    
    # ID categorical columns
    id_cat_cols = [col for col in df.columns if 'id' in col and df[col].dtype == 'object']
    for col in id_cat_cols:
        df[col] = df[col].astype(str)
    
    return df

# Apply the fix to your dataframe
df = fix_mixed_types(df)
print("Mixed types fixed successfully!")
print(f"Data shape: {df.shape}")

# Update column definitions after the fix
card_cols = [c for c in df.columns if c.startswith('card')]
id_cols = [col for col in df.columns if 'id' in col]
id_nums_cols = [col for col in id_cols if df[col].dtype in ['float64', 'int64']]
id_cat_cols = [col for col in id_cols if df[col].dtype == 'object']

print(f"Updated categorical ID columns: {len(id_cat_cols)} columns")
print(f"Updated numeric ID columns: {len(id_nums_cols)} columns")

Mixed types fixed successfully!
Data shape: (590540, 434)
Updated categorical ID columns: 15 columns
Updated numeric ID columns: 23 columns


## EXP1: Handle Missing value, Outlier Handling , Transformation/scaling , Encoding categorical , Model : Logistic, SVM, Decision Tree, RF , XGB ,LGBM , catboost , NN  



In [4]:

from sklearn.preprocessing import OrdinalEncoder

ord_kwargs = dict(handle_unknown='use_encoded_value', unknown_value=-1)
try:
    OrdinalEncoder(dtype=np.int32, **ord_kwargs)
    ord_kwargs['dtype'] = np.int32
except TypeError:
    pass

ord_enc = OrdinalEncoder(**ord_kwargs)



# Numeric
num_trans_cols   = ['TransactionDT', 'TransactionAmt']


num_count_cols   =  ['C1', 'C2', 'C3', 'C4', 'C5', 'C6', 'C7', 'C8', 'C9', 'C10', 'C11', 'C12', 'C13', 'C14']   # counting features
num_timedelta    =  ['D1','D2','D3','D4','D5','D6','D7','D8','D10','D11','D12','D13','D14','D15'] # timedelta except D9
num_v_cols       =  [col for col in df.columns if 'V' in col]    # V1...Vxxx
id_cols = [col for col in df.columns if 'id' in col]

id_nums_cols = [col for col in id_cols if df[col].dtype == np.float64 ]
id_cat_cols = [col for col in id_cols  if df[col].dtype == object]

num_idx_cols     = id_nums_cols    # numeric idx_*
col_D9           = 'D9'     # special case

# Categorical
cat_email        = ['P_emaildomain', 'R_emaildomain']
cat_address      = ['addr1', 'addr2']
cat_card         = [c for c in df.columns if c.startswith('card')]  # card1..cardN

cat_matching     = ['M1', 'M2', 'M3', 'M4', 'M5', 'M6', 'M7', 'M8', 'M9']  # matching features
cat_device       = ['DeviceType', 'DeviceInfo']
cat_idx_cats     = id_cat_cols   # categorical idx_*

label_col = 'isFraud'

# =========================
# 2) Helper for D9
# =========================
class FillD9MinusOne(BaseEstimator, TransformerMixin):
    def __init__(self, col_name='D9'):
        self.col_name = col_name
    def fit(self, X, y=None):
        return self
    def transform(self, X):
        X = X.copy()
        if isinstance(X, pd.DataFrame) and self.col_name in X.columns:
            X[self.col_name] = X[self.col_name].fillna(-1)
        return X

# Function to build PCA step
def pca_block(n_components=0.95):
    return PCA(n_components=n_components, random_state=42)

# =========================
# 3) Pipelines per block
# =========================

# TransactionDT / Amt
pipe_trans = Pipeline([
    ('scale', RobustScaler())
])

# Counting features
pipe_count = Pipeline([
    ('impute', SimpleImputer(strategy='median')),
    ('scale', RobustScaler())
    # ('pca', pca_block())
])

# Timedelta
pipe_timedelta = Pipeline([
    ('impute', SimpleImputer(strategy='median')),
    ('scale', RobustScaler()),
    # ('pca', pca_block())
])

# Special case D9
pipe_D9 = Pipeline([
    ('fillD9', FillD9MinusOne(col_name=col_D9))
])

# V columns
pipe_v = Pipeline([
    ('impute', SimpleImputer(strategy='median')),
    ('scale', RobustScaler())
    # ('pca', pca_block())
])

# idx_num
pipe_idx_num = Pipeline([
    ('impute', SimpleImputer(strategy='median')),
    ('scale', RobustScaler())
    # ('pca', pca_block())
])

# Categorical groups
pipe_email = Pipeline([
    ('impute', SimpleImputer(strategy='constant', fill_value='unknown')),
    ('ord', ord_enc)
])

pipe_address = Pipeline([
    ('impute', SimpleImputer(strategy='constant', fill_value='unknown')),
    ('ord', ord_enc)
])

pipe_card = Pipeline([
    ('impute', SimpleImputer(strategy='constant', fill_value='unknown')),
    ('ord', ord_enc),
    # ('iv_select', IVSelector(min_iv=0.02, max_features=50))
])

pipe_matching = Pipeline([
    ('impute', SimpleImputer(strategy='constant', fill_value='unknown')),
    ('ord', ord_enc),
    # ('iv_select', IVSelector(min_iv=0.01))
])

pipe_device = Pipeline([
    ('impute', SimpleImputer(strategy='constant', fill_value='unknown')),
    ('ord', ord_enc)
])

pipe_idx_cats = Pipeline([
    ('impute', SimpleImputer(strategy='constant', fill_value='unknown')),
    ('ord', ord_enc)
])

# =========================
# 4) ColumnTransformer
# =========================
preprocess = ColumnTransformer([
    ('trans',      pipe_trans,     num_trans_cols),
    ('count',      pipe_count,     num_count_cols),
    ('timedelta',  pipe_timedelta, num_timedelta),
    ('D9',         pipe_D9,        [col_D9]),
    ('vcols',      pipe_v,         num_v_cols),
    ('idx_num',    pipe_idx_num,   num_idx_cols),

    ('email',      pipe_email,     cat_email),
    ('address',    pipe_address,   cat_address),
    ('card',       pipe_card,      cat_card),
    ('matching',   pipe_matching,  cat_matching),
    ('device',     pipe_device,    cat_device),
    ('idx_cats',   pipe_idx_cats,  cat_idx_cats),
], remainder='drop')


preprocess

0,1,2
,transformers,"[('trans', ...), ('count', ...), ...]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,with_centering,True
,with_scaling,True
,quantile_range,"(25.0, ...)"
,copy,True
,unit_variance,False

0,1,2
,missing_values,
,strategy,'median'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,with_centering,True
,with_scaling,True
,quantile_range,"(25.0, ...)"
,copy,True
,unit_variance,False

0,1,2
,missing_values,
,strategy,'median'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,with_centering,True
,with_scaling,True
,quantile_range,"(25.0, ...)"
,copy,True
,unit_variance,False

0,1,2
,col_name,'D9'

0,1,2
,missing_values,
,strategy,'median'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,with_centering,True
,with_scaling,True
,quantile_range,"(25.0, ...)"
,copy,True
,unit_variance,False

0,1,2
,missing_values,
,strategy,'median'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,with_centering,True
,with_scaling,True
,quantile_range,"(25.0, ...)"
,copy,True
,unit_variance,False

0,1,2
,missing_values,
,strategy,'constant'
,fill_value,'unknown'
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,categories,'auto'
,dtype,<class 'numpy.int32'>
,handle_unknown,'use_encoded_value'
,unknown_value,-1
,encoded_missing_value,
,min_frequency,
,max_categories,

0,1,2
,missing_values,
,strategy,'constant'
,fill_value,'unknown'
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,categories,'auto'
,dtype,<class 'numpy.int32'>
,handle_unknown,'use_encoded_value'
,unknown_value,-1
,encoded_missing_value,
,min_frequency,
,max_categories,

0,1,2
,missing_values,
,strategy,'constant'
,fill_value,'unknown'
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,categories,'auto'
,dtype,<class 'numpy.int32'>
,handle_unknown,'use_encoded_value'
,unknown_value,-1
,encoded_missing_value,
,min_frequency,
,max_categories,

0,1,2
,missing_values,
,strategy,'constant'
,fill_value,'unknown'
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,categories,'auto'
,dtype,<class 'numpy.int32'>
,handle_unknown,'use_encoded_value'
,unknown_value,-1
,encoded_missing_value,
,min_frequency,
,max_categories,

0,1,2
,missing_values,
,strategy,'constant'
,fill_value,'unknown'
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,categories,'auto'
,dtype,<class 'numpy.int32'>
,handle_unknown,'use_encoded_value'
,unknown_value,-1
,encoded_missing_value,
,min_frequency,
,max_categories,

0,1,2
,missing_values,
,strategy,'constant'
,fill_value,'unknown'
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,categories,'auto'
,dtype,<class 'numpy.int32'>
,handle_unknown,'use_encoded_value'
,unknown_value,-1
,encoded_missing_value,
,min_frequency,
,max_categories,


In [6]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score, accuracy_score, f1_score, precision_score, recall_score
from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVC
from catboost import CatBoostClassifier
import xgboost as xgb
import lightgbm as lgb
import joblib
import os
from datetime import datetime
from scikeras.wrappers import KerasClassifier
import tensorflow as tf
from tensorflow import keras
# Create models directory if it doesn't exist
os.makedirs('models', exist_ok=True)

# =========================
# Train/test split
# =========================
X = df.drop(columns=[label_col])
y = df[label_col].astype(int)

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, stratify=y, random_state=42
)

pos = int(y.sum())
neg = int(len(y) - pos)
scale_pos_weight = float(neg) / float(pos) if pos > 0 else 1.0

class_weights = {0: 1.0, 1: scale_pos_weight}




try:
    scale_pos_weight
except NameError:
    pos = int(y.sum())
    neg = int(len(y) - pos)
    scale_pos_weight = float(neg) / float(pos) if pos > 0 else 1.0

class_weights = {0: 1.0, 1: scale_pos_weight}

def build_mlp(meta):
    n_features = meta["n_features_in_"]

    inp = keras.Input(shape=(n_features,), name="features")
    x = keras.layers.BatchNormalization()(inp)
    x = keras.layers.Dense(256, activation="relu")(x)
    x = keras.layers.Dropout(0.2)(x)
    x = keras.layers.Dense(128, activation="relu")(x)
    x = keras.layers.Dropout(0.2)(x)
    out = keras.layers.Dense(1, activation="sigmoid", name="out")(x)

    model = keras.Model(inp, out, name="tabular_mlp")
    model.compile(
        optimizer=keras.optimizers.Adam(learning_rate=1e-3),
        loss="binary_crossentropy",
        metrics=[keras.metrics.AUC(name="AUC")]
    )
    return model

early_stop = keras.callbacks.EarlyStopping(
    monitor="val_loss", patience=2, restore_best_weights=True
)

nn_keras = KerasClassifier(
    model=build_mlp,
    epochs=12,
    batch_size=2048,
    validation_split=0.1,
    verbose=0,
    callbacks=[early_stop],
    **{"fit__class_weight": class_weights}
)






# =========================
# Models dictionary
# =========================
models = {
    "Logistic Regression": LogisticRegression(max_iter=500, class_weight="balanced"),
    "SVM": LinearSVC(
        class_weight="balanced",
        dual="auto",
        max_iter=5000,
        tol=1e-3
    ),
    "Decision Tree": DecisionTreeClassifier(random_state=42, class_weight="balanced"),
    "Random Forest": RandomForestClassifier(n_estimators=200, random_state=42, class_weight="balanced"),
    "XGBoost": xgb.XGBClassifier(
        n_estimators=200,
        learning_rate=0.1,
        max_depth=5,
        subsample=0.8,
        colsample_bytree=0.8,
        use_label_encoder=False,
        eval_metric="logloss",
        random_state=42,
        scale_pos_weight=scale_pos_weight,     
    ),

    "LightGBM": lgb.LGBMClassifier(
        n_estimators=200,
        learning_rate=0.1,
        max_depth=-1,
        subsample=0.8,
        colsample_bytree=0.8,
        random_state=42,
        scale_pos_weight=scale_pos_weight      
    ),

    "CatBoost": CatBoostClassifier(
        iterations=800,
        depth=8,
        learning_rate=0.05,
        loss_function="Logloss",
        eval_metric="AUC",
        task_type="GPU",       
        devices="0",
        verbose=False,
        random_seed=42,
        class_weights=[class_weights[0], class_weights[1]]   
    )
}

models["Neural Net"] = nn_keras
# =========================
# Evaluation function with model saving
# =========================
def evaluate_and_save_model(name, model, X_train, X_test, y_train, y_test):
    print(f"\nTraining {name}...")

    if isinstance(model, KerasClassifier):
        pipe = Pipeline([
            ("prep", preprocess),
            # ("to_dense", FunctionTransformer(_to_dense_float32, accept_sparse=True, validate=False)),
            ("clf", model),
        ])
    else:
        pipe = Pipeline([
            ("prep", preprocess),
            ("clf", model),
        ])

    # Fit
    pipe.fit(X_train, y_train)

    prep = pipe.named_steps["prep"]
    clf  = pipe.named_steps["clf"]

    # Transform X_test
    Xt_test = prep.transform(X_test)
    # if isinstance(model, KerasClassifier):
    #     Xt_test = _to_dense_float32(Xt_test)

    # Predict
    y_pred = clf.predict(Xt_test)

    # Score cho AUC
    if hasattr(clf, "predict_proba"):
        y_score = clf.predict_proba(Xt_test)[:, 1]
    elif hasattr(clf, "decision_function"):
        y_score = clf.decision_function(Xt_test)
    else:
        y_score = y_pred  
    result = {
        "Model": name,
        "ROC-AUC": roc_auc_score(y_test, y_score),
        "Accuracy": accuracy_score(y_test, y_pred),
        "F1": f1_score(y_test, y_pred),
        "Precision": precision_score(y_test, y_pred),
        "Recall": recall_score(y_test, y_pred),
    }

    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    model_filename = f"models/{name.replace(' ', '_').lower()}_{timestamp}.pkl"
    try:
        joblib.dump(pipe, model_filename)
        result["Model_File"] = model_filename
        print(f"✅ Model saved: {model_filename}")
        print(f"ROC-AUC: {result['ROC-AUC']:.4f}")
    except Exception as e:
        print(f"❌ Error saving model: {str(e)}")
        result["Model_File"] = "Save Failed"

    return result

print("="*60)
print("TRAINING AND SAVING FRAUD DETECTION MODELS")
print("="*60)

results = []
trained_models = {}

for name, model in models.items():
    scores = evaluate_and_save_model(name, model, X_train, X_test, y_train, y_test)
    results.append(scores)
    
    # Keep reference to best models
    if scores["Model_File"] != "Save Failed":
        trained_models[name] = scores["Model_File"]

# Convert to DataFrame
results_df = pd.DataFrame(results)
print("\n" + "="*60)
print("FINAL RESULTS")
print("="*60)
print(results_df.round(4))

# =========================
# Save results summary
# =========================
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
results_filename = f"models/model_comparison_{timestamp}.csv"
results_df.to_csv(results_filename, index=False)
print(f"\n📊 Results saved to: {results_filename}")

# =========================
# Save best model separately
# =========================
best_model_idx = results_df['ROC-AUC'].idxmax()
best_model_name = results_df.loc[best_model_idx, 'Model']
best_model_score = results_df.loc[best_model_idx, 'ROC-AUC']

print(f"\n🏆 BEST MODEL: {best_model_name}")
print(f"🎯 ROC-AUC: {best_model_score:.4f}")

# Copy best model to a standard filename
if best_model_name in trained_models:
    best_model_file = trained_models[best_model_name]
    best_model_copy = "models/best_fraud_detection_model.pkl"
    
    # Load and save with standard name
    best_pipeline = joblib.load(best_model_file)
    joblib.dump(best_pipeline, best_model_copy)
    print(f"🔥 Best model also saved as: {best_model_copy}")

print("\n" + "="*60)
print("MODEL TRAINING COMPLETE!")
print("="*60)

TRAINING AND SAVING FRAUD DETECTION MODELS

Training Logistic Regression...


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT

Increase the number of iterations to improve the convergence (max_iter=500).
You might also want to scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


✅ Model saved: models/logistic_regression_20250928_151801.pkl
ROC-AUC: 0.7713

Training SVM...




✅ Model saved: models/svm_20250928_152945.pkl
ROC-AUC: 0.8413

Training Decision Tree...




✅ Model saved: models/decision_tree_20250928_153124.pkl
ROC-AUC: 0.7607

Training Random Forest...




✅ Model saved: models/random_forest_20250928_153938.pkl
ROC-AUC: 0.9361

Training XGBoost...


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


✅ Model saved: models/xgboost_20250928_154024.pkl
ROC-AUC: 0.9284

Training LightGBM...
[LightGBM] [Info] Number of positive: 15497, number of negative: 427408
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.183823 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 38318
[LightGBM] [Info] Number of data points in the train set: 442905, number of used features: 427
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.034989 -> initscore=-3.317093
[LightGBM] [Info] Start training from score -3.317093




✅ Model saved: models/lightgbm_20250928_154108.pkl
ROC-AUC: 0.9421

Training CatBoost...


Default metric period is 5 because AUC is/are not implemented for GPU


✅ Model saved: models/catboost_20250928_154317.pkl
ROC-AUC: 0.9563

Training Neural Net...




✅ Model saved: models/neural_net_20250928_154401.pkl
ROC-AUC: 0.8841

FINAL RESULTS
                 Model  ROC-AUC  Accuracy      F1  Precision  Recall  \
0  Logistic Regression   0.7713    0.7946  0.1751     0.1018  0.6227   
1                  SVM   0.8413    0.8238  0.2163     0.1281  0.6951   
2        Decision Tree   0.7607    0.9669  0.5323     0.5257  0.5391   
3        Random Forest   0.9361    0.9784  0.5687     0.9430  0.4071   
4              XGBoost   0.9284    0.8978  0.3570     0.2289  0.8109   
5             LightGBM   0.9421    0.9058  0.3812     0.2475  0.8293   
6             CatBoost   0.9563    0.9404  0.4946     0.3516  0.8335   
7           Neural Net   0.8841    0.8748  0.2886     0.1801  0.7259   

                                       Model_File  
0  models/logistic_regression_20250928_151801.pkl  
1                  models/svm_20250928_152945.pkl  
2        models/decision_tree_20250928_153124.pkl  
3        models/random_forest_20250928_153938.pkl  
4      

## EXP1.1 : Cross validation for exp1

## EXP2: Handle Missing value, Outlier Handling , Transformation/scaling , Encoding categorical , PCA/IV values , Model : Logistic, SVM, Decision Tree, RF , XGB ,LGBM , catboost , NN  

# 🛠 Preprocessing Summary

## 🔢 Numeric Features
- **TransactionDT, TransactionAmt** → no missing → scale (`RobustScaler` or `PowerTransformer` if skewed).  
- **Counting features** → fill `median` → scale → **PCA**.  
- **Timedelta** → fill `median`.  
  - Special case **D9** → fill `-1`.  
  - Scale → **PCA**.  
- **V-columns (V1…Vxxx)** → fill `median` → scale → **PCA**.  
- **idx_num** → fill `median` → scale → **PCA**.  

---

## 🏷 Categorical Features
- **Email domains (P_emaildomain, R_emaildomain)** → fill `"unknown"` → one-hot.  
- **Address (addr1, addr2)** → cast to category → fill `"unknown"` → one-hot.  
- **Card features (card1…cardN)** → cast to category → fill `"unknown"` → one-hot → *(optional)* IV-based reduction.  
- **Matching features** → fill `"unknown"` → one-hot → *(optional)* IV-based reduction.  
- **Device info (DeviceType, DeviceInfo)** → fill `"unknown"` → one-hot.  
- **idx_cats** → fill `"unknown"` → one-hot.  

---

## ⚙️ Transformations
- **Outliers** → winsorize (clip 1st–99th percentile).  
- **Scaling** → `RobustScaler` (outlier-robust) or `PowerTransformer` (skew correction).  
- **Dimensionality Reduction** → PCA on correlated blocks (counting, timedelta, V-cols, idx_num), keep 95% variance.  

---

## 📦 Final Pipeline
1. Handle missing values (median / `"unknown"`).  
2. Outlier treatment (winsorize).  
3. Scale or power-transform skewed numeric features.  
4. PCA for high-dimensional correlated groups.  
5. One-hot encode categorical features.  
6. Train model (e.g. Logistic Regression, XGBoost, Random Forest).  


In [None]:

class IVSelector(BaseEstimator, TransformerMixin):
    """
    Select features based on Information Value (IV).
    Works on one-hot encoded categorical features.
    """
    def __init__(self, min_iv=0.02, max_features=None):
        self.min_iv = min_iv
        self.max_features = max_features
        self.keep_idx_ = None

    def fit(self, X, y):
        # X is numpy array after OneHotEncoder
        X_df = pd.DataFrame(X)
        ivs = []

        for col in X_df.columns:
            df_crosstab = pd.crosstab(X_df[col], y)
            if 0 not in df_crosstab.columns:
                df_crosstab[0] = 0
            if 1 not in df_crosstab.columns:
                df_crosstab[1] = 0
            df_crosstab = df_crosstab.rename(columns={0: 'Good', 1: 'Bad'})
            df_crosstab['Total'] = df_crosstab['Good'] + df_crosstab['Bad']
            df_crosstab['Good%'] = df_crosstab['Good'] / df_crosstab['Good'].sum()
            df_crosstab['Bad%'] = df_crosstab['Bad'] / df_crosstab['Bad'].sum()
            df_crosstab = df_crosstab[(df_crosstab['Good%'] > 0) & (df_crosstab['Bad%'] > 0)]
            df_crosstab['WOE'] = np.log(df_crosstab['Good%'] / df_crosstab['Bad%'])
            df_crosstab['IV'] = (df_crosstab['Good%'] - df_crosstab['Bad%']) * df_crosstab['WOE']
            iv_value = df_crosstab['IV'].sum()
            ivs.append(iv_value)

        self.ivs_ = np.array(ivs)
        self.keep_idx_ = np.argsort(-self.ivs_)  # sort descending

        if self.max_features:
            self.keep_idx_ = self.keep_idx_[:self.max_features]
        else:
            self.keep_idx_ = [i for i, iv in enumerate(self.ivs_) if iv >= self.min_iv]

        return self

    def transform(self, X):
        return X[:, self.keep_idx_]


In [None]:

from sklearn.preprocessing import OrdinalEncoder

ord_kwargs = dict(handle_unknown='use_encoded_value', unknown_value=-1)
try:
    OrdinalEncoder(dtype=np.int32, **ord_kwargs)
    ord_kwargs['dtype'] = np.int32
except TypeError:
    pass

ord_enc = OrdinalEncoder(**ord_kwargs)



# Numeric
num_trans_cols   = ['TransactionDT', 'TransactionAmt']


num_count_cols   =  ['C1', 'C2', 'C3', 'C4', 'C5', 'C6', 'C7', 'C8', 'C9', 'C10', 'C11', 'C12', 'C13', 'C14']   # counting features
num_timedelta    =  ['D1','D2','D3','D4','D5','D6','D7','D8','D10','D11','D12','D13','D14','D15'] # timedelta except D9
num_v_cols       =  [col for col in df.columns if 'V' in col]    # V1...Vxxx
id_cols = [col for col in df.columns if 'id' in col]

id_nums_cols = [col for col in id_cols if df[col].dtype == np.float64 ]
id_cat_cols = [col for col in id_cols  if df[col].dtype == object]

num_idx_cols     = id_nums_cols    # numeric idx_*
col_D9           = 'D9'     # special case

# Categorical
cat_email        = ['P_emaildomain', 'R_emaildomain']
cat_address      = ['addr1', 'addr2']
cat_card         = [c for c in df.columns if c.startswith('card')]  # card1..cardN

cat_matching     = ['M1', 'M2', 'M3', 'M4', 'M5', 'M6', 'M7', 'M8', 'M9']  # matching features
cat_device       = ['DeviceType', 'DeviceInfo']
cat_idx_cats     = id_cat_cols   # categorical idx_*

label_col = 'isFraud'

# =========================
# 2) Helper for D9
# =========================
class FillD9MinusOne(BaseEstimator, TransformerMixin):
    def __init__(self, col_name='D9'):
        self.col_name = col_name
    def fit(self, X, y=None):
        return self
    def transform(self, X):
        X = X.copy()
        if isinstance(X, pd.DataFrame) and self.col_name in X.columns:
            X[self.col_name] = X[self.col_name].fillna(-1)
        return X

# Function to build PCA step
def pca_block(n_components=0.95):
    return PCA(n_components=n_components, random_state=42)

# =========================
# 3) Pipelines per block
# =========================

# TransactionDT / Amt
pipe_trans = Pipeline([
    ('scale', RobustScaler())
])

# Counting features
pipe_count = Pipeline([
    ('impute', SimpleImputer(strategy='median')),
    ('scale', RobustScaler()),
    ('pca', pca_block())
])

# Timedelta
pipe_timedelta = Pipeline([
    ('impute', SimpleImputer(strategy='median')),
    ('scale', RobustScaler()),
    ('pca', pca_block())
])

# Special case D9
pipe_D9 = Pipeline([
    ('fillD9', FillD9MinusOne(col_name=col_D9))
])

# V columns
pipe_v = Pipeline([
    ('impute', SimpleImputer(strategy='median')),
    ('scale', RobustScaler()),
    ('pca', pca_block())
])

# idx_num
pipe_idx_num = Pipeline([
    ('impute', SimpleImputer(strategy='median')),
    ('scale', RobustScaler()),
    ('pca', pca_block())
])

# Categorical groups
pipe_email = Pipeline([
    ('impute', SimpleImputer(strategy='constant', fill_value='unknown')),
    ('ord', ord_enc)
])

pipe_address = Pipeline([
    ('impute', SimpleImputer(strategy='constant', fill_value='unknown')),
    ('ord', ord_enc)
])

pipe_card = Pipeline([
    ('impute', SimpleImputer(strategy='constant', fill_value='unknown')),
    ('ord', ord_enc),
    ('iv_select', IVSelector(min_iv=0.02, max_features=50))
])

pipe_matching = Pipeline([
    ('impute', SimpleImputer(strategy='constant', fill_value='unknown')),
    ('ord', ord_enc),
    ('iv_select', IVSelector(min_iv=0.01))
])

pipe_device = Pipeline([
    ('impute', SimpleImputer(strategy='constant', fill_value='unknown')),
    ('ord', ord_enc)
])

pipe_idx_cats = Pipeline([
    ('impute', SimpleImputer(strategy='constant', fill_value='unknown')),
    ('ord', ord_enc)
])

# =========================
# 4) ColumnTransformer
# =========================
preprocess = ColumnTransformer([
    ('trans',      pipe_trans,     num_trans_cols),
    ('count',      pipe_count,     num_count_cols),
    ('timedelta',  pipe_timedelta, num_timedelta),
    ('D9',         pipe_D9,        [col_D9]),
    ('vcols',      pipe_v,         num_v_cols),
    ('idx_num',    pipe_idx_num,   num_idx_cols),

    ('email',      pipe_email,     cat_email),
    ('address',    pipe_address,   cat_address),
    ('card',       pipe_card,      cat_card),
    ('matching',   pipe_matching,  cat_matching),
    ('device',     pipe_device,    cat_device),
    ('idx_cats',   pipe_idx_cats,  cat_idx_cats),
], remainder='drop')


preprocess

0,1,2
,transformers,"[('trans', ...), ('count', ...), ...]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,with_centering,True
,with_scaling,True
,quantile_range,"(25.0, ...)"
,copy,True
,unit_variance,False

0,1,2
,missing_values,
,strategy,'median'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,with_centering,True
,with_scaling,True
,quantile_range,"(25.0, ...)"
,copy,True
,unit_variance,False

0,1,2
,n_components,0.95
,copy,True
,whiten,False
,svd_solver,'auto'
,tol,0.0
,iterated_power,'auto'
,n_oversamples,10
,power_iteration_normalizer,'auto'
,random_state,42

0,1,2
,missing_values,
,strategy,'median'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,with_centering,True
,with_scaling,True
,quantile_range,"(25.0, ...)"
,copy,True
,unit_variance,False

0,1,2
,n_components,0.95
,copy,True
,whiten,False
,svd_solver,'auto'
,tol,0.0
,iterated_power,'auto'
,n_oversamples,10
,power_iteration_normalizer,'auto'
,random_state,42

0,1,2
,col_name,'D9'

0,1,2
,missing_values,
,strategy,'median'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,with_centering,True
,with_scaling,True
,quantile_range,"(25.0, ...)"
,copy,True
,unit_variance,False

0,1,2
,n_components,0.95
,copy,True
,whiten,False
,svd_solver,'auto'
,tol,0.0
,iterated_power,'auto'
,n_oversamples,10
,power_iteration_normalizer,'auto'
,random_state,42

0,1,2
,missing_values,
,strategy,'median'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,with_centering,True
,with_scaling,True
,quantile_range,"(25.0, ...)"
,copy,True
,unit_variance,False

0,1,2
,n_components,0.95
,copy,True
,whiten,False
,svd_solver,'auto'
,tol,0.0
,iterated_power,'auto'
,n_oversamples,10
,power_iteration_normalizer,'auto'
,random_state,42

0,1,2
,missing_values,
,strategy,'constant'
,fill_value,'unknown'
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,categories,'auto'
,dtype,<class 'numpy.int32'>
,handle_unknown,'use_encoded_value'
,unknown_value,-1
,encoded_missing_value,
,min_frequency,
,max_categories,

0,1,2
,missing_values,
,strategy,'constant'
,fill_value,'unknown'
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,categories,'auto'
,dtype,<class 'numpy.int32'>
,handle_unknown,'use_encoded_value'
,unknown_value,-1
,encoded_missing_value,
,min_frequency,
,max_categories,

0,1,2
,missing_values,
,strategy,'constant'
,fill_value,'unknown'
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,categories,'auto'
,dtype,<class 'numpy.int32'>
,handle_unknown,'use_encoded_value'
,unknown_value,-1
,encoded_missing_value,
,min_frequency,
,max_categories,

0,1,2
,min_iv,0.02
,max_features,50.0

0,1,2
,missing_values,
,strategy,'constant'
,fill_value,'unknown'
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,categories,'auto'
,dtype,<class 'numpy.int32'>
,handle_unknown,'use_encoded_value'
,unknown_value,-1
,encoded_missing_value,
,min_frequency,
,max_categories,

0,1,2
,min_iv,0.01
,max_features,

0,1,2
,missing_values,
,strategy,'constant'
,fill_value,'unknown'
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,categories,'auto'
,dtype,<class 'numpy.int32'>
,handle_unknown,'use_encoded_value'
,unknown_value,-1
,encoded_missing_value,
,min_frequency,
,max_categories,

0,1,2
,missing_values,
,strategy,'constant'
,fill_value,'unknown'
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,categories,'auto'
,dtype,<class 'numpy.int32'>
,handle_unknown,'use_encoded_value'
,unknown_value,-1
,encoded_missing_value,
,min_frequency,
,max_categories,


In [None]:
import numpy as np
from sklearn.preprocessing import FunctionTransformer

def _to_dense_float32(X):
    if hasattr(X, "toarray"):
        X = X.toarray()
    return X.astype(np.float32, copy=False)


In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score, accuracy_score, f1_score, precision_score, recall_score
from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVC
from catboost import CatBoostClassifier
import xgboost as xgb
import lightgbm as lgb
import joblib
import os
from datetime import datetime
from scikeras.wrappers import KerasClassifier
import tensorflow as tf
from tensorflow import keras
# Create models directory if it doesn't exist
os.makedirs('models', exist_ok=True)

# =========================
# Train/test split
# =========================
X = df.drop(columns=[label_col])
y = df[label_col].astype(int)

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, stratify=y, random_state=42
)

pos = int(y.sum())
neg = int(len(y) - pos)
scale_pos_weight = float(neg) / float(pos) if pos > 0 else 1.0

class_weights = {0: 1.0, 1: scale_pos_weight}




try:
    scale_pos_weight
except NameError:
    pos = int(y.sum())
    neg = int(len(y) - pos)
    scale_pos_weight = float(neg) / float(pos) if pos > 0 else 1.0

class_weights = {0: 1.0, 1: scale_pos_weight}

def build_mlp(meta):
    n_features = meta["n_features_in_"]

    inp = keras.Input(shape=(n_features,), name="features")
    x = keras.layers.BatchNormalization()(inp)
    x = keras.layers.Dense(256, activation="relu")(x)
    x = keras.layers.Dropout(0.2)(x)
    x = keras.layers.Dense(128, activation="relu")(x)
    x = keras.layers.Dropout(0.2)(x)
    out = keras.layers.Dense(1, activation="sigmoid", name="out")(x)

    model = keras.Model(inp, out, name="tabular_mlp")
    model.compile(
        optimizer=keras.optimizers.Adam(learning_rate=1e-3),
        loss="binary_crossentropy",
        metrics=[keras.metrics.AUC(name="AUC")]
    )
    return model

early_stop = keras.callbacks.EarlyStopping(
    monitor="val_loss", patience=2, restore_best_weights=True
)

nn_keras = KerasClassifier(
    model=build_mlp,
    epochs=12,
    batch_size=2048,
    validation_split=0.1,
    verbose=0,
    callbacks=[early_stop],
    **{"fit__class_weight": class_weights}
)






# =========================
# Models dictionary
# =========================
models = {
    "Logistic Regression": LogisticRegression(max_iter=500, class_weight="balanced"),
    "SVM": LinearSVC(
        class_weight="balanced",
        dual="auto",
        max_iter=5000,
        tol=1e-3
    ),
    "Decision Tree": DecisionTreeClassifier(random_state=42, class_weight="balanced"),
    "Random Forest": RandomForestClassifier(n_estimators=200, random_state=42, class_weight="balanced"),
    "XGBoost": xgb.XGBClassifier(
        n_estimators=200,
        learning_rate=0.1,
        max_depth=5,
        subsample=0.8,
        colsample_bytree=0.8,
        use_label_encoder=False,
        eval_metric="logloss",
        random_state=42,
        scale_pos_weight=scale_pos_weight,     
    ),

    "LightGBM": lgb.LGBMClassifier(
        n_estimators=200,
        learning_rate=0.1,
        max_depth=-1,
        subsample=0.8,
        colsample_bytree=0.8,
        random_state=42,
        scale_pos_weight=scale_pos_weight      
    ),

    "CatBoost": CatBoostClassifier(
        iterations=800,
        depth=8,
        learning_rate=0.05,
        loss_function="Logloss",
        eval_metric="AUC",
        task_type="GPU",       
        devices="0",
        verbose=False,
        random_seed=42,
        class_weights=[class_weights[0], class_weights[1]]   
    )
}

models["Neural Net"] = nn_keras
# =========================
# Evaluation function with model saving
# =========================
def evaluate_and_save_model(name, model, X_train, X_test, y_train, y_test):
    print(f"\nTraining {name}...")

    if isinstance(model, KerasClassifier):
        pipe = Pipeline([
            ("prep", preprocess),
            ("to_dense", FunctionTransformer(_to_dense_float32, accept_sparse=True, validate=False)),
            ("clf", model),
        ])
    else:
        pipe = Pipeline([
            ("prep", preprocess),
            ("clf", model),
        ])

    # Fit
    pipe.fit(X_train, y_train)

    prep = pipe.named_steps["prep"]
    clf  = pipe.named_steps["clf"]

    # Transform X_test
    Xt_test = prep.transform(X_test)
    if isinstance(model, KerasClassifier):
        Xt_test = _to_dense_float32(Xt_test)

    # Predict
    y_pred = clf.predict(Xt_test)

    # Score cho AUC
    if hasattr(clf, "predict_proba"):
        y_score = clf.predict_proba(Xt_test)[:, 1]
    elif hasattr(clf, "decision_function"):
        y_score = clf.decision_function(Xt_test)
    else:
        y_score = y_pred  
    result = {
        "Model": name,
        "ROC-AUC": roc_auc_score(y_test, y_score),
        "Accuracy": accuracy_score(y_test, y_pred),
        "F1": f1_score(y_test, y_pred),
        "Precision": precision_score(y_test, y_pred),
        "Recall": recall_score(y_test, y_pred),
    }

    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    model_filename = f"models/{name.replace(' ', '_').lower()}_{timestamp}.pkl"
    try:
        joblib.dump(pipe, model_filename)
        result["Model_File"] = model_filename
        print(f"✅ Model saved: {model_filename}")
        print(f"ROC-AUC: {result['ROC-AUC']:.4f}")
    except Exception as e:
        print(f"❌ Error saving model: {str(e)}")
        result["Model_File"] = "Save Failed"

    return result

print("="*60)
print("TRAINING AND SAVING FRAUD DETECTION MODELS")
print("="*60)

results = []
trained_models = {}

for name, model in models.items():
    scores = evaluate_and_save_model(name, model, X_train, X_test, y_train, y_test)
    results.append(scores)
    
    # Keep reference to best models
    if scores["Model_File"] != "Save Failed":
        trained_models[name] = scores["Model_File"]

# Convert to DataFrame
results_df = pd.DataFrame(results)
print("\n" + "="*60)
print("FINAL RESULTS")
print("="*60)
print(results_df.round(4))

# =========================
# Save results summary
# =========================
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
results_filename = f"models/model_comparison_{timestamp}.csv"
results_df.to_csv(results_filename, index=False)
print(f"\n📊 Results saved to: {results_filename}")

# =========================
# Save best model separately
# =========================
best_model_idx = results_df['ROC-AUC'].idxmax()
best_model_name = results_df.loc[best_model_idx, 'Model']
best_model_score = results_df.loc[best_model_idx, 'ROC-AUC']

print(f"\n🏆 BEST MODEL: {best_model_name}")
print(f"🎯 ROC-AUC: {best_model_score:.4f}")

# Copy best model to a standard filename
if best_model_name in trained_models:
    best_model_file = trained_models[best_model_name]
    best_model_copy = "models/best_fraud_detection_model.pkl"
    
    # Load and save with standard name
    best_pipeline = joblib.load(best_model_file)
    joblib.dump(best_pipeline, best_model_copy)
    print(f"🔥 Best model also saved as: {best_model_copy}")

print("\n" + "="*60)
print("MODEL TRAINING COMPLETE!")
print("="*60)

TRAINING AND SAVING FRAUD DETECTION MODELS

Training Logistic Regression...


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT

Increase the number of iterations to improve the convergence (max_iter=500).
You might also want to scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


✅ Model saved: models/logistic_regression_20250928_102948.pkl
ROC-AUC: 0.7141

Training SVM...




✅ Model saved: models/svm_20250928_103045.pkl
ROC-AUC: 0.7398

Training Decision Tree...




✅ Model saved: models/decision_tree_20250928_103129.pkl
ROC-AUC: 0.7118

Training Random Forest...




✅ Model saved: models/random_forest_20250928_103555.pkl
ROC-AUC: 0.9175

Training XGBoost...


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


✅ Model saved: models/xgboost_20250928_103631.pkl
ROC-AUC: 0.9095

Training LightGBM...
[LightGBM] [Info] Number of positive: 15497, number of negative: 427408
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.045116 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 4552
[LightGBM] [Info] Number of data points in the train set: 442905, number of used features: 40
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.034989 -> initscore=-3.317093
[LightGBM] [Info] Start training from score -3.317093




✅ Model saved: models/lightgbm_20250928_103706.pkl
ROC-AUC: 0.9181

Training CatBoost...


Default metric period is 5 because AUC is/are not implemented for GPU


✅ Model saved: models/catboost_20250928_103755.pkl
ROC-AUC: 0.9308

Training Neural Net...




✅ Model saved: models/neural_net_20250928_103835.pkl
ROC-AUC: 0.8412

FINAL RESULTS
                 Model  ROC-AUC  Accuracy      F1  Precision  Recall  \
0  Logistic Regression   0.7141    0.7762  0.1451     0.0838  0.5430   
1                  SVM   0.7398    0.7955  0.1604     0.0937  0.5585   
2        Decision Tree   0.7118    0.9608  0.4421     0.4402  0.4441   
3        Random Forest   0.9175    0.9768  0.5175     0.9542  0.3550   
4              XGBoost   0.9095    0.8691  0.2968     0.1827  0.7894   
5             LightGBM   0.9181    0.8772  0.3137     0.1950  0.8020   
6             CatBoost   0.9308    0.9135  0.3923     0.2601  0.7975   
7           Neural Net   0.8412    0.8258  0.2160     0.1282  0.6858   

                                       Model_File  
0  models/logistic_regression_20250928_102948.pkl  
1                  models/svm_20250928_103045.pkl  
2        models/decision_tree_20250928_103129.pkl  
3        models/random_forest_20250928_103555.pkl  
4      