In [27]:
import pyopencl as cl
cl.get_platforms()

[<pyopencl.Platform 'Intel(R) OpenCL Graphics' at 0x29570ae73c0>,
 <pyopencl.Platform 'NVIDIA CUDA' at 0x2959abe9710>]

In [7]:
import pandas as pd
import numpy as np
from pathlib import Path
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.metrics import auc, classification_report, confusion_matrix, precision_recall_curve, roc_auc_score
from lightgbm import LGBMClassifier, early_stopping, log_evaluation
from imblearn.over_sampling import ADASYN
from sklearn.neighbors import NearestNeighbors
import joblib

# --------------------------
# Globals for KNN and terminal stats
# --------------------------
terminal_stats = None
fraud_locations = None

# --------------------------
# KNN helper
# --------------------------
def create_comprehensive_knn_features(df, fraud_coords, is_training=True):
    if len(fraud_coords) < 3:
        df['NEAREST_FRAUD_DISTANCE'] = 999.0
        df['FRAUD_DENSITY_1KM'] = 0
        df['FRAUD_DENSITY_5KM'] = 0
        df['KNN_FRAUD_RISK_SCORE'] = 0.0
        return df

    all_coords = df[['X_CUSTOMER_ID','Y_CUSTOMER_ID']].values
    knn_close = NearestNeighbors(n_neighbors=min(3,len(fraud_coords)), metric='euclidean').fit(fraud_coords)
    knn_radius_1km = NearestNeighbors(radius=1.0, metric='euclidean').fit(fraud_coords)
    knn_radius_5km = NearestNeighbors(radius=5.0, metric='euclidean').fit(fraud_coords)

    distances,_ = knn_close.kneighbors(all_coords)
    df['NEAREST_FRAUD_DISTANCE'] = distances[:,0]
    df['AVG_3_NEAREST_FRAUDS'] = distances.mean(axis=1)

    distances_1km,_ = knn_radius_1km.radius_neighbors(all_coords)
    distances_5km,_ = knn_radius_5km.radius_neighbors(all_coords)
    df['FRAUD_DENSITY_1KM'] = [len(idx) for idx in distances_1km]
    df['FRAUD_DENSITY_5KM'] = [len(idx) for idx in distances_5km]

    max_density_5km = df['FRAUD_DENSITY_5KM'].max()
    df['KNN_FRAUD_RISK_SCORE'] = (
        (1/(1+df['NEAREST_FRAUD_DISTANCE']))*0.4 +
        (1/(1+df['AVG_3_NEAREST_FRAUDS']))*0.3 +
        (df['FRAUD_DENSITY_5KM']/max_density_5km)*0.3
    ) if max_density_5km>0 else 1/(1+df['NEAREST_FRAUD_DISTANCE'])
    return df

# --------------------------
# Data processing function
# --------------------------
def process_data(filename, is_test=False):
    global terminal_stats, fraud_locations

    # Load CSVs
    customers_df = pd.read_csv("customers.csv", low_memory=False)
    merchants_df = pd.read_csv("merchants.csv", low_memory=False)
    terminals_df = pd.read_csv("terminals.csv", low_memory=False)
    transactions_df = pd.read_csv(filename, low_memory=False)

    # Fill missing values
    transactions_df.fillna({'FAILURE_CODE': 0}, inplace=True)
    transactions_df.fillna({'FAILURE_REASON': "Approved or completed successfully"}, inplace=True)
    transactions_df.fillna({'CARDHOLDER_AUTH_METHOD': "Online PIN"}, inplace=True)

    merged = transactions_df.merge(customers_df, on='CUSTOMER_ID', how='left')\
                            .merge(terminals_df, on='TERMINAL_ID', how='left')\
                            .merge(merchants_df, on='MERCHANT_ID', how='left')
    merged.columns = merged.columns.str.upper()

    # --------------------------
    # Time features
    # --------------------------
    merged["TX_TS"] = pd.to_datetime(merged["TX_TS"])
    merged["DAY_OF_WEEK"] = merged['TX_TS'].dt.day_of_week
    merged["HOUR"] = merged['TX_TS'].dt.hour
    merged["IS_WEEKEND"] = merged['TX_TS'].dt.day_of_week >= 5
    merged["DAY_OF_MONTH"] = merged['TX_TS'].dt.day.astype("category")

    merged_sorted = merged.sort_values(["CUSTOMER_ID","TX_TS"])
    merged["TIME_SINCE_LAST_TRANSACTION"] = merged_sorted.groupby("CUSTOMER_ID")["TX_TS"].diff().dt.total_seconds()
    merged["IS_FIRST_TRANSACTION"] = merged["TIME_SINCE_LAST_TRANSACTION"].isna().astype(int)
    merged["TIME_SINCE_LAST_TRANSACTION"] = merged["TIME_SINCE_LAST_TRANSACTION"].fillna(0)

    def categorize_tx_freq(seconds):
        if pd.isna(seconds): return "first"
        elif seconds < 60: return "under_1_min"
        elif seconds < 3600: return "under_1_hour"
        elif seconds < 86400: return "under_1_day"
        elif seconds < 604800: return "under_1_week"
        elif seconds < 2.592e6: return "under_1_month"
        elif seconds < 1.555e7: return "under_6_months"
        else: return "over_6_months"

    merged["WINDOW_AFTER_LAST_TRANSACTION_CATEGORY"] = merged["TIME_SINCE_LAST_TRANSACTION"].apply(categorize_tx_freq).astype("category")
    merged["IS_BUSINESS_HOURS"] = ((merged["TX_TS"].dt.hour >=8) & (merged["TX_TS"].dt.hour <=17)).astype(int)

    # --------------------------
    # Card expiry
    # --------------------------
    merged["CARD_EXPIRY_DATE"] = pd.to_datetime("01/"+merged["CARD_EXPIRY_DATE"].astype(str), format="%d/%m/%y")
    merged["CARD_EXPIRY_DATE"] = merged["CARD_EXPIRY_DATE"].dt.to_period('M').dt.end_time

    def months_until_expiry(row):
        return (row["CARD_EXPIRY_DATE"].year - row["TX_TS"].year) * 12 + (row["CARD_EXPIRY_DATE"].month - row["TX_TS"].month)
    merged["MONTHS_UNTIL_EXPIRY"] = merged.apply(months_until_expiry, axis=1)

    def categorize_expiry(months):
        if months <= 0: return "expired"
        elif months < 1: return "under_1_month"
        elif months < 3: return "under_3_months"
        elif months < 6: return "under_6_months"
        elif months < 12: return "under_1_year"
        else: return "over_1_year"

    merged["EXPIRY_CATEGORY"] = merged["MONTHS_UNTIL_EXPIRY"].apply(categorize_expiry).astype("category")

    # --------------------------
    # Customer avg amounts
    # --------------------------
    customer_avg = merged.groupby('CUSTOMER_ID')['TX_AMOUNT'].mean().reset_index().rename(columns={'TX_AMOUNT':'CUSTOMER_AVERAGE_AMOUNT'})
    merged = merged.merge(customer_avg, on='CUSTOMER_ID', how='left')
    merged['TX_AMOUNT_BY_CUSTOMER_AVG'] = merged['TX_AMOUNT']/merged['CUSTOMER_AVERAGE_AMOUNT']
    merged.replace([np.inf, -np.inf], np.nan, inplace=True)
    merged.fillna({'TX_AMOUNT_BY_CUSTOMER_AVG':0}, inplace=True)

    # --------------------------
    # KNN features
    # --------------------------
    if not is_test:
        fraud_locations = merged[merged['TX_FRAUD']==1][['X_CUSTOMER_ID','Y_CUSTOMER_ID']].values
        merged = create_comprehensive_knn_features(merged, fraud_locations, is_training=True)
    else:
        merged = create_comprehensive_knn_features(merged, fraud_locations, is_training=False)

    # --------------------------
    # Terminal stats
    # --------------------------
    if not is_test:
        terminal_stats = merged.groupby("TERMINAL_ID")["TX_FRAUD"].agg(["mean","sum","count"]).reset_index()\
                              .rename(columns={"mean":"TERMINAL_FRAUD_RATE","sum":"TERMINAL_FRAUD_COUNT","count":"TERMINAL_TX_COUNT"})
        merged = merged.merge(terminal_stats, on="TERMINAL_ID", how="left")
    else:
        merged = merged.merge(terminal_stats, on="TERMINAL_ID", how="left")
        merged.fillna({
            "TERMINAL_FRAUD_RATE":0,
            "TERMINAL_FRAUD_COUNT":0,
            "TERMINAL_TX_COUNT":0
        }, inplace=True)

    # --------------------------
    # Drop unnecessary columns
    # --------------------------
    drop_cols = ['TX_ID','CARD_DATA','CUSTOMER_ID','TERMINAL_ID','MERCHANT_ID','TX_TS','CARD_EXPIRY_DATE','ACQUIRER_ID','LEGAL_NAME','X_TERMINAL_ID','Y_TERMINAL__ID']
    merged = merged.drop(columns=drop_cols, errors='ignore')

    # --------------------------
    # Categorical / bool
    # --------------------------
    categorical_cols = [
        'CARD_BRAND', 'TRANSACTION_TYPE', 'TRANSACTION_STATUS',
        'FAILURE_CODE', 'BUSINESS_TYPE', 'OUTLET_TYPE',
        'DAY_OF_MONTH', 'WINDOW_AFTER_LAST_TRANSACTION_CATEGORY',
        'EXPIRY_CATEGORY', 'DISTANCE_CATEGORY', 'CARD_COUNTRY_CODE',
        'IS_RECURRING_TRANSACTION', 'CARDHOLDER_AUTH_METHOD',
        'TRANSACTION_CURRENCY','FAILURE_REASON','TAX_EXCEMPT_INDICATOR'
    ]
    for col in categorical_cols:
        if col in merged.columns:
            merged[col] = merged[col].astype('category')

    bool_cols = merged.select_dtypes(include='bool').columns
    merged[bool_cols] = merged[bool_cols].astype(int)

    return merged

# --------------------------
# Main
# --------------------------

# Process training data
print("Processing training data...")
train_df = process_data("transactions_train.csv", is_test=False)

# Prepare X/y
target_col = "TX_FRAUD"
y = train_df[target_col].astype(int)
X = train_df.drop(columns=[target_col])
categorical = X.select_dtypes("category").columns.tolist()
numeric = X.select_dtypes(include=np.number).columns.tolist()

for col in categorical:
    types = set(type(x) for x in X[col].dropna())
    if len(types) > 1:
        print(f"Column {col} has mixed types: {types}. Making it all string")
        X[col] = X[col].astype(str)

for col in numeric:
    types = set(type(x) for x in X[col].dropna())
    if len(types) > 1:
        print(f"Column {col} has mixed types: {types}. Making it all float")
        X[col] = X[col].astype(float)

# Train/validation split for evaluation
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)

# --------------------------
# Preprocessing
# --------------------------
preprocessor = ColumnTransformer([
    ("cat", OneHotEncoder(handle_unknown="ignore", sparse_output=False), categorical),
    ("num", "passthrough", numeric),
])

X_train_t = preprocessor.fit_transform(X_train)
X_val_t = preprocessor.transform(X_val)

# --------------------------
# Oversample training set
# --------------------------
print("Oversampling training set...")
adasyn = ADASYN(random_state=42, n_neighbors=5)
X_train_os, y_train_os = adasyn.fit_resample(X_train_t, y_train)

# --------------------------
# Remove exact duplicates
# --------------------------
Xy = pd.DataFrame(X_train_os)
Xy['TARGET'] = y_train_os
Xy = Xy.drop_duplicates()
y_train_os = Xy['TARGET'].to_numpy()
X_train_os = Xy.drop(columns=['TARGET']).to_numpy()

# --------------------------
# Remove zero variance columns
# --------------------------
mask = X_train_os.var(axis=0) > 0
X_train_os = X_train_os[:, mask]
X_val_t = X_val_t[:, mask]

print(f"Training set shape: {X_train_os.shape}, Validation set shape: {X_val_t.shape}")

# --------------------------
# Train LGBM
# --------------------------
clf = LGBMClassifier(
    device="gpu",
    gpu_platform_id=1,
    gpu_device_id=0,
    boosting_type="gbdt",
    n_estimators=300,
    learning_rate=0.02,
    max_depth=10,
    num_leaves=64,
    min_child_samples=5,
    subsample=0.8,
    colsample_bytree=0.8,
    reg_alpha=2.0,
    reg_lambda=2.0,
    scale_pos_weight=2,
    random_state=42
)
clf.fit(
    X_train_os, y_train_os,
    eval_set=[(X_val_t, y_val)],
    eval_metric="auc",
    callbacks=[early_stopping(50), log_evaluation(50)]
)

# --------------------------
# Evaluation
# --------------------------
y_prob = clf.predict_proba(X_val_t)[:,1]
print("ROC AUC:", roc_auc_score(y_val, y_prob))
precision, recall, _ = precision_recall_curve(y_val, y_prob)
print("PR AUC:", auc(recall, precision))
y_pred = (y_prob >= 0.1).astype(int)
print(classification_report(y_val, y_pred, digits=3))
print("Confusion matrix:\n", confusion_matrix(y_val, y_pred))

# --------------------------
# Process test data and predict
# --------------------------
print("Processing test data...")
test_df = process_data("transactions_test.csv", is_test=True)
X_test = test_df.copy()

for col in categorical:
    if col in X_test.columns:
        print(f"Column {col} has mixed types: {set(type(x) for x in X_test[col].dropna())}. Making it all string")
        X_test[col] = X_test[col].astype(str)

for col in numeric:
    if col in X_test.columns:
        print(f"Column {col} has mixed types: {set(type(x) for x in X_test[col].dropna())}. Making it all float")
        X_test[col] = X_test[col].astype(float)

X_test_t = preprocessor.transform(X_test)
X_test_t = X_test_t[:, mask]

y_test_prob = clf.predict_proba(X_test_t)[:,1]

# --------------------------
# Save predictions
# --------------------------
save_path = Path("predictions")
save_path.mkdir(parents=True, exist_ok=True)
version = 0
while (save_path / f"V{version}_LGBM.csv").exists():
    version += 1
out_file = save_path / f"V{version}_LGBM.csv"

pd.DataFrame({
    "TX_ID": pd.read_csv("transactions_test.csv", low_memory=False)["TX_ID"],
    "TX_FRAUD": y_test_prob
}).to_csv(out_file, index=False)

print(f"Saved test predictions to {out_file}")


Processing training data...
Column FAILURE_CODE has mixed types: {<class 'int'>, <class 'str'>}. Making it all string
Oversampling training set...
Training set shape: (1655371, 170), Validation set shape: (213288, 170)
[LightGBM] [Info] Number of positive: 824270, number of negative: 831101
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 37039
[LightGBM] [Info] Number of data points in the train set: 1655371, number of used features: 167
[LightGBM] [Info] Using requested OpenCL platform 1 device 0
[LightGBM] [Info] Using GPU Device: NVIDIA RTX A500 Laptop GPU, Vendor: NVIDIA Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 256 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Size of histogram bin entry: 8
[LightGBM] [Info] 39 dense feature groups (63.15 MB) transferred to GPU in 0.044080 secs. 1 sparse feature groups
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.497937 -> initscore=-0.008253
[LightGBM] [Info] Start training



ROC AUC: 0.9930615768380063
PR AUC: 0.8941609609495906
              precision    recall  f1-score   support

           0      0.999     0.947     0.972    207775
           1      0.326     0.961     0.486      5513

    accuracy                          0.948    213288
   macro avg      0.662     0.954     0.729    213288
weighted avg      0.981     0.948     0.960    213288

Confusion matrix:
 [[196808  10967]
 [   217   5296]]
Processing test data...
Column CARD_BRAND has mixed types: {<class 'str'>}. Making it all string
Column TRANSACTION_TYPE has mixed types: {<class 'str'>}. Making it all string
Column TRANSACTION_STATUS has mixed types: {<class 'str'>}. Making it all string
Column FAILURE_CODE has mixed types: {<class 'int'>, <class 'str'>}. Making it all string
Column FAILURE_REASON has mixed types: {<class 'str'>}. Making it all string
Column TRANSACTION_CURRENCY has mixed types: {<class 'str'>}. Making it all string
Column CARD_COUNTRY_CODE has mixed types: {<class 'str'>}



Saved test predictions to predictions\V0_LGBM.csv
