In [None]:
import pandas as pd
import numpy as np
from pathlib import Path
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.metrics import (
    auc, classification_report, confusion_matrix,
    precision_recall_curve, roc_auc_score
)
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from imblearn.pipeline import Pipeline
from imblearn.over_sampling import ADASYN
import joblib
from lightgbm import early_stopping, log_evaluation

def next_version_path(base_dir: str, base_name: str, ext: str = ".joblib") -> Path:
    """Return the first unused ``V<n>_<base_name><ext>`` path inside ``base_dir``.

    ``base_dir`` is created (including parents) when it does not exist yet.
    Version numbers are probed upwards from 0 and the first file name that is
    not present on disk is returned. The file itself is not created.

    Parameters
    ----------
    base_dir : str
        Directory that holds the versioned files.
    base_name : str
        Name placed after the ``V<n>_`` prefix.
    ext : str, default ".joblib"
        File extension, including the leading dot.

    Returns
    -------
    pathlib.Path
        Path of the lowest-numbered version that does not exist yet.
    """
    directory = Path(base_dir)
    directory.mkdir(parents=True, exist_ok=True)

    version = 0
    target = directory / f"V{version}_{base_name}{ext}"
    # Walk V0, V1, V2, ... until a free slot turns up.
    while target.exists():
        version += 1
        target = directory / f"V{version}_{base_name}{ext}"
    return target


def months_to_expiry(date_str, reference=None):
    """Approximate number of whole months from ``reference`` to a card's expiry.

    Parameters
    ----------
    date_str : str
        Expiry in ``"MM/YY"`` form. The year is read as ``2000 + YY`` and the
        expiry is taken to be the first day of that month.
    reference : datetime-like, optional
        Point in time to measure from (for example the transaction timestamp).
        When omitted, ``pd.Timestamp.now()`` is used, which makes the result
        depend on when the code is executed. Expected to be timezone-naive,
        like the parsed expiry date.

    Returns
    -------
    int or float
        ``(expiry - reference).days // 30`` clamped at 0 (already-expired
        cards yield 0), or ``np.nan`` when ``date_str`` cannot be parsed or
        ``reference`` is missing (NaT/NaN).
    """
    ref = pd.Timestamp.now() if reference is None else pd.Timestamp(reference)
    if pd.isna(ref):
        return np.nan
    try:
        mm, yy = date_str.split("/")
        exp = pd.Timestamp(year=2000 + int(yy), month=int(mm), day=1)
    except (AttributeError, TypeError, ValueError, OverflowError):
        # Non-string input (e.g. NaN), wrong number of parts, non-numeric
        # parts, or an out-of-range month/year: treat as missing.
        return np.nan
    # A month is approximated as 30 days; negative spans are clamped to 0.
    return max((exp - ref).days // 30, 0)

# --- Load the raw tables ------------------------------------------------------
print("Loading data...")
customers   = pd.read_csv("customers.csv", low_memory=False)
terminals   = pd.read_csv("terminals.csv", low_memory=False)
merchants   = pd.read_csv("merchants.csv", low_memory=False)
transactions = pd.read_csv("transactions_train.csv", low_memory=False)

print("Merging external datasets...")

# --- Calendar features derived from the transaction timestamp -----------------
transactions["TX_TS"] = pd.to_datetime(transactions["TX_TS"])

transactions["hour"] = transactions["TX_TS"].dt.hour
transactions["dayofweek"] = transactions["TX_TS"].dt.dayofweek
transactions["is_friday"] = (transactions["dayofweek"] == 4).astype(int)
transactions["is_weekend"] = transactions["dayofweek"].isin([5, 6]).astype(int)

# NOTE(review): months_to_expiry measures from the wall-clock time at which
# this cell runs, not from TX_TS, so the feature value depends on the run date.
transactions["months_to_expiry"] = transactions["CARD_EXPIRY_DATE"].apply(months_to_expiry)

# The scoring cell maps IS_RECURRING_TRANSACTION to "Y"/"N" before predicting.
# Apply the identical rule here so the OneHotEncoder is fitted on the same
# vocabulary; with handle_unknown="ignore" any category it has not seen is
# silently encoded as all zeros, which would drop this feature at inference.
transactions["IS_RECURRING_TRANSACTION"] = transactions["IS_RECURRING_TRANSACTION"].map(
    lambda v: "Y" if str(v).strip().upper() in ("Y", "YES", "TRUE", "1") else "N"
)

# --- Customer / terminal coordinates and their Euclidean distance -------------
transactions = transactions.merge(
    customers.rename(columns={"x_customer_id":"cust_x","y_customer_id":"cust_y"}),
    on="CUSTOMER_ID", how="left"
)

transactions = transactions.merge(
    terminals.rename(columns={"x_terminal_id":"term_x","y_terminal_id":"term_y"}),
    on="TERMINAL_ID", how="left"
)

transactions["cust_term_distance"] = np.sqrt(
    (transactions["cust_x"] - transactions["term_x"])**2 +
    (transactions["cust_y"] - transactions["term_y"])**2
)

# --- Merchant attributes (only the columns used as model features) ------------
transactions = transactions.merge(
    merchants[[
        "MERCHANT_ID","BUSINESS_TYPE","MCC_CODE",
        "PAYMENT_PERCENTAGE_FACE_TO_FACE","PAYMENT_PERCENTAGE_ECOM"
    ]],
    on="MERCHANT_ID", how="left"
)

# Target as a plain integer label.
transactions["TX_FRAUD"] = transactions["TX_FRAUD"].astype(int)

# Numeric model inputs, grouped by origin. The concatenation order defines the
# column order handed to the model, so keep the groups in this sequence.
amount_features = [
    "TX_AMOUNT",
    "TRANSACTION_GOODS_AND_SERVICES_AMOUNT",
    "TRANSACTION_CASHBACK_AMOUNT",
]
calendar_features = ["hour", "dayofweek", "is_friday", "is_weekend", "months_to_expiry"]
location_features = ["cust_x", "cust_y", "term_x", "term_y", "cust_term_distance"]
merchant_mix_features = ["PAYMENT_PERCENTAGE_FACE_TO_FACE", "PAYMENT_PERCENTAGE_ECOM"]

numeric = amount_features + calendar_features + location_features + merchant_mix_features

# Columns that are one-hot encoded by the preprocessor.
categorical = [
    "CARD_BRAND",
    "TRANSACTION_TYPE",
    "TRANSACTION_STATUS",
    "TRANSACTION_CURRENCY",
    "CARD_COUNTRY_CODE",
    "IS_RECURRING_TRANSACTION",
    "CARDHOLDER_AUTH_METHOD",
    "BUSINESS_TYPE",
    "MCC_CODE",
]

# Feature matrix (numeric columns first, then categorical) and target vector.
X = transactions.loc[:, numeric + categorical]
y = transactions["TX_FRAUD"]

# Stratified 80/20 hold-out so both parts keep the same fraud rate.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# One-hot encode categoricals (categories unseen during fit are ignored at
# transform time); numeric columns are passed through unchanged.
preprocessor = ColumnTransformer(
    transformers=[
        ("cat", OneHotEncoder(handle_unknown="ignore"), categorical),
        ("num", "passthrough", numeric),
    ]
)

# Candidate classifiers keyed by the name used in logs and saved file names.
models = {
    "LGBM": LGBMClassifier(
        device="gpu",               # NOTE(review): presumably needs a GPU-enabled LightGBM build
        boosting_type="gbdt",
        n_estimators=5000,          # Upper bound; early stopping picks the actual number of trees
        learning_rate=0.02,         # Small step size, paired with the large tree budget above
        max_depth=10,               # Cap on tree depth
        num_leaves=128,             # Cap on leaves per tree
        min_child_samples=30,       # Minimum samples per leaf
        subsample=0.8,              # Fraction of rows sampled per bagging round
        subsample_freq=1,           # Re-sample rows every iteration; with the default of 0
                                    # LightGBM disables bagging and `subsample` has no effect
        colsample_bytree=0.8,       # Fraction of features sampled per tree
        reg_alpha=2.0,              # L1 regularization
        reg_lambda=2.0,             # L2 regularization
        scale_pos_weight=2,         # Extra weight on the positive (fraud) class.
                                    # NOTE(review): the training pipeline also oversamples with
                                    # ADASYN, so this weight is applied on top of resampled data
        random_state=42
    )
}

# Early stopping must not look at the rows used for the final report, otherwise
# the number of trees is tuned on the test set and the reported metrics are
# optimistic. Carve a validation split out of the training data instead.
X_fit, X_val, y_fit, y_val = train_test_split(
    X_train, y_train, test_size=0.2, stratify=y_train, random_state=42
)

# Pipeline.fit forwards `model__*` keyword arguments to the final estimator
# unchanged, so the eval_set is NOT run through the "prep" step. Encode the
# validation features here with the same ColumnTransformer, fitted on the same
# rows the pipeline is fitted on, so its columns match what the model trains on.
X_val_encoded = preprocessor.fit(X_fit).transform(X_val)

# Probability cut-off used for the hard-label report below.
decision_threshold = 0.1

for name, clf in models.items():
    print(f"\nTraining {name}...")
    pipe = Pipeline([
        ("prep", preprocessor),
        ("adasyn", ADASYN(random_state=42, n_neighbors=5)),
        ("model", clf)
    ])

    pipe.fit(
        X_fit,
        y_fit,
        model__eval_set=[(X_val_encoded, y_val)],
        model__eval_metric="auc",
        model__callbacks=[early_stopping(stopping_rounds=50), log_evaluation(50)]
    )

    # Final evaluation on the untouched test split.
    y_prob = pipe.predict_proba(X_test)[:, 1]
    print(f"{name} ROC AUC:", roc_auc_score(y_test, y_prob))

    precision, recall, _ = precision_recall_curve(y_test, y_prob)
    print(f"{name} PR AUC :", auc(recall, precision))

    y_pred = (y_prob >= decision_threshold).astype(int)
    print(classification_report(y_test, y_pred, digits=3))
    print("Confusion matrix:\n", confusion_matrix(y_test, y_pred))

    # Persist the whole fitted pipeline (preprocessing + model) under a new version number.
    save_path = next_version_path("saved_models", name)
    joblib.dump(pipe, save_path)
    print(f"Saved {name} model to {save_path}")

Loading data...
Merging external datasets...

Training LGBM...


In [None]:
import pandas as pd
import numpy as np
import joblib
from pathlib import Path

# --- Inputs -------------------------------------------------------------------
TEST_CSV = "transactions_test.csv"      # transactions to score
CUSTOMERS_CSV = "customers.csv"         # customer lookup table
TERMINALS_CSV = "terminals.csv"         # terminal lookup table
MERCHANTS_CSV = "merchants.csv"         # merchant lookup table
MODEL_DIR = "saved_models"              # directory holding V<n>_<name>.joblib files

# --- Outputs ------------------------------------------------------------------
PREDICTIONS_DIR = "predictions"         # one CSV per scored model is written here

# Probability cut-off for hard labels; only referenced by a commented-out line
# in the scoring loop, which writes raw probabilities.
THRESHOLD = 0.1


def latest_models(model_dir: str):
    """Find the highest-numbered saved file for every model name.

    Files are expected to be named ``V<int>_<name>.joblib``. Entries matching
    the glob whose version part is not an integer are skipped.

    Parameters
    ----------
    model_dir : str
        Directory to scan (non-recursive).

    Returns
    -------
    dict
        Maps each model name to the ``Path`` of its highest version.

    Raises
    ------
    FileNotFoundError
        If ``model_dir`` does not exist or contains no versioned model file.
    """
    root = Path(model_dir)
    if not root.exists():
        raise FileNotFoundError(f"{model_dir} does not exist.")

    best_version = {}   # model name -> highest version number seen so far
    best_path = {}      # model name -> path carrying that version
    for candidate in root.glob("V*_*.joblib"):
        tag, sep, model_name = candidate.stem.partition("_")
        if not sep:
            continue
        try:
            version = int(tag[1:])   # drop the leading "V"
        except ValueError:
            continue
        # Numeric comparison, so V10 correctly outranks V2.
        if model_name in best_version and version <= best_version[model_name]:
            continue
        best_version[model_name] = version
        best_path[model_name] = candidate

    if not best_path:
        raise FileNotFoundError("No versioned models found.")
    return best_path


def months_to_expiry(date_str, reference=None):
    """Approximate number of whole months from ``reference`` to a card's expiry.

    Parameters
    ----------
    date_str : str
        Expiry in ``"MM/YY"`` form. The year is read as ``2000 + YY`` and the
        expiry is taken to be the first day of that month.
    reference : datetime-like, optional
        Point in time to measure from (for example the transaction timestamp).
        When omitted, ``pd.Timestamp.now()`` is used, which makes the result
        depend on when the code is executed. Expected to be timezone-naive,
        like the parsed expiry date.

    Returns
    -------
    int or float
        ``(expiry - reference).days // 30`` clamped at 0 (already-expired
        cards yield 0), or ``np.nan`` when ``date_str`` cannot be parsed or
        ``reference`` is missing (NaT/NaN).
    """
    ref = pd.Timestamp.now() if reference is None else pd.Timestamp(reference)
    if pd.isna(ref):
        return np.nan
    try:
        mm, yy = date_str.split("/")
        exp = pd.Timestamp(year=2000 + int(yy), month=int(mm), day=1)
    except (AttributeError, TypeError, ValueError, OverflowError):
        # Non-string input (e.g. NaN), wrong number of parts, non-numeric
        # parts, or an out-of-range month/year: treat as missing.
        return np.nan
    # A month is approximated as 30 days; negative spans are clamped to 0.
    return max((exp - ref).days // 30, 0)

def normalize_bool(col):
    """Collapse a boolean-like Series to the strings ``"Y"`` / ``"N"``.

    Each value is converted with ``str``, stripped of surrounding whitespace
    and upper-cased; ``Y``, ``YES``, ``TRUE`` and ``1`` become ``"Y"``,
    everything else (including missing values) becomes ``"N"``.

    Note that floats are compared by their string form, so ``1.0`` is
    rendered as ``"1.0"`` and therefore maps to ``"N"``.

    Parameters
    ----------
    col : pandas.Series
        Column to normalise.

    Returns
    -------
    pandas.Series
        Series of ``"Y"`` / ``"N"`` with the same index as ``col``.
    """
    truthy_tokens = ("Y", "YES", "TRUE", "1")

    def to_flag(value):
        token = str(value).strip().upper()
        if token in truthy_tokens:
            return "Y"
        return "N"

    return col.map(to_flag)

# --- Load the scoring set and the lookup tables -------------------------------
test_df = pd.read_csv(TEST_CSV, low_memory=False)
customers = pd.read_csv(CUSTOMERS_CSV, low_memory=False)
terminals = pd.read_csv(TERMINALS_CSV, low_memory=False)
merchants = pd.read_csv(MERCHANTS_CSV, low_memory=False)

# --- Calendar features derived from the transaction timestamp -----------------
test_df["TX_TS"] = pd.to_datetime(test_df["TX_TS"])
tx_day_index = test_df["TX_TS"].dt.dayofweek
# NOTE(review): months_to_expiry measures from the wall-clock time at which
# this cell runs, not from TX_TS, so the feature value depends on the run date.
test_df = test_df.assign(
    hour=test_df["TX_TS"].dt.hour,
    dayofweek=tx_day_index,
    is_friday=(tx_day_index == 4).astype(int),
    is_weekend=tx_day_index.isin([5, 6]).astype(int),
    months_to_expiry=test_df["CARD_EXPIRY_DATE"].apply(months_to_expiry),
)

# --- Customer / terminal coordinates ------------------------------------------
customer_coords = customers.rename(
    columns={"x_customer_id": "cust_x", "y_customer_id": "cust_y"}
)
terminal_coords = terminals.rename(
    columns={"x_terminal_id": "term_x", "y_terminal_id": "term_y"}
)
test_df = test_df.merge(customer_coords, on="CUSTOMER_ID", how="left")
test_df = test_df.merge(terminal_coords, on="TERMINAL_ID", how="left")

# Euclidean distance between the customer and the terminal.
delta_x = test_df["cust_x"] - test_df["term_x"]
delta_y = test_df["cust_y"] - test_df["term_y"]
test_df["cust_term_distance"] = np.sqrt(delta_x ** 2 + delta_y ** 2)

# --- Merchant attributes (only the columns used as model features) ------------
merchant_columns = [
    "MERCHANT_ID", "BUSINESS_TYPE", "MCC_CODE",
    "PAYMENT_PERCENTAGE_FACE_TO_FACE", "PAYMENT_PERCENTAGE_ECOM",
]
test_df = test_df.merge(merchants[merchant_columns], on="MERCHANT_ID", how="left")

# Collapse the recurring flag to "Y"/"N" before it reaches the encoder.
test_df["IS_RECURRING_TRANSACTION"] = normalize_bool(test_df["IS_RECURRING_TRANSACTION"])

# Make sure the output directory exists, then pick the newest file per model name.
predictions_dir = Path(PREDICTIONS_DIR)
predictions_dir.mkdir(parents=True, exist_ok=True)
latest_model_paths = latest_models(MODEL_DIR)

for model_name, path in latest_model_paths.items():
    print(f"Loading {model_name} from {path}")
    # joblib files are pickles: only load artefacts produced by a trusted source.
    model = joblib.load(path)

    # Select exactly the raw columns the fitted preprocessing step was trained on.
    feature_cols = model.named_steps["prep"].feature_names_in_
    X_test = test_df[feature_cols]

    # Probability of the positive (fraud) class; written as-is, no thresholding.
    y_prob = model.predict_proba(X_test)[:, 1]

    # File name prefix is the "V<n>" part of the model file name.
    version = path.stem.partition("_")[0]
    out_file = predictions_dir / f"{version}_{model_name}_results.csv"

    submission = pd.DataFrame({"TX_ID": test_df["TX_ID"], "TX_FRAUD": y_prob})
    submission.to_csv(out_file, index=False)

    print(f"Saved predictions to {out_file}")


Loading LGBM from saved_models\V8_LGBM.joblib




Saved predictions to predictions\V8_LGBM_results.csv
