https://www.kaggle.com/competitions/ieee-fraud-detection/overview

In [15]:
import pandas as pd

import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_auc_score, classification_report
from sklearn.utils import resample
from sklearn.utils.class_weight import compute_class_weight

from xgboost import XGBClassifier

import lightgbm as lgb

from imblearn.over_sampling import SMOTE

import os

In [2]:
# Following datatype processing code comes from James Trotman's IEEE Fraud notebook: https://www.kaggle.com/code/jtrotman/ieee-fraud-adversarial-lgb-split-points
DTYPE = {
    'TransactionID': 'int32',
    'isFraud': 'int8',
    'TransactionDT': 'int32',
    'TransactionAmt': 'float32',
    'ProductCD': 'category',
    'card1': 'int16',
    'card2': 'float32',
    'card3': 'float32',
    'card4': 'category',
    'card5': 'float32',
    'card6': 'category',
    'addr1': 'float32',
    'addr2': 'float32',
    'dist1': 'float32',
    'dist2': 'float32',
    'P_emaildomain': 'category',
    'R_emaildomain': 'category',
}

IDX = 'TransactionID'
TGT = 'isFraud'

CCOLS = [f'C{i}' for i in range(1, 15)]
DCOLS = [f'D{i}' for i in range(1, 16)]
MCOLS = [f'M{i}' for i in range(1, 10)]
VCOLS = [f'V{i}' for i in range(1, 340)]

DTYPE.update((c, 'float32') for c in CCOLS)
DTYPE.update((c, 'float32') for c in DCOLS)
DTYPE.update((c, 'float32') for c in VCOLS)
DTYPE.update((c, 'category') for c in MCOLS)


DTYPE_ID = {
    'TransactionID': 'int32',
    'DeviceType': 'category',
    'DeviceInfo': 'category',
}

ID_COLS = [f'id_{i:02d}' for i in range(1, 39)]
ID_CATS = [
    'id_12', 'id_15', 'id_16', 'id_23', 'id_27', 'id_28', 'id_29', 'id_30',
    'id_31', 'id_33', 'id_34', 'id_35', 'id_36', 'id_37', 'id_38'
]

DTYPE_ID.update(((c, 'float32') for c in ID_COLS))
DTYPE_ID.update(((c, 'category') for c in ID_CATS))

In [3]:
# Import the data from CSV
this_dir = os.path.dirname(os.path.abspath('__file__'))
data_path = os.path.join(this_dir, 'fraud_data')
identity_data = pd.read_csv(data_path + '\\train_identity.csv', dtype=DTYPE_ID)
transaction_data = pd.read_csv(data_path + '\\train_transaction.csv', dtype=DTYPE)
data = transaction_data.merge(identity_data, how='inner', left_on='TransactionID', right_on='TransactionID')
print(data.shape)
data.head()

(144233, 434)


Unnamed: 0,TransactionID,isFraud,TransactionDT,TransactionAmt,ProductCD,card1,card2,card3,card4,card5,...,id_31,id_32,id_33,id_34,id_35,id_36,id_37,id_38,DeviceType,DeviceInfo
0,2987004,0,86506,50.0,H,4497,514.0,150.0,mastercard,102.0,...,samsung browser 6.2,32.0,2220x1080,match_status:2,T,F,T,T,mobile,SAMSUNG SM-G892A Build/NRD90M
1,2987008,0,86535,15.0,H,2803,100.0,150.0,visa,226.0,...,mobile safari 11.0,32.0,1334x750,match_status:1,T,F,F,T,mobile,iOS Device
2,2987010,0,86549,75.887001,C,16496,352.0,117.0,mastercard,134.0,...,chrome 62.0,,,,F,F,T,T,desktop,Windows
3,2987011,0,86555,16.495001,C,4461,375.0,185.0,mastercard,224.0,...,chrome 62.0,,,,F,F,T,T,desktop,
4,2987016,0,86620,30.0,H,1790,555.0,150.0,visa,226.0,...,chrome 62.0,24.0,1280x800,match_status:2,T,F,T,T,desktop,MacOS


In [4]:
# Check the imbalance. Fewer than 1% of transactions are fraudulent
data[['isFraud','TransactionID']].groupby('isFraud').count()

Unnamed: 0_level_0,TransactionID
isFraud,Unnamed: 1_level_1
0,132915
1,11318


In [5]:
# Check some distribution statistics
data.describe()

Unnamed: 0,TransactionID,isFraud,TransactionDT,TransactionAmt,card1,card2,card3,card5,addr1,addr2,...,id_17,id_18,id_19,id_20,id_21,id_22,id_24,id_25,id_26,id_32
count,144233.0,144233.0,144233.0,144233.0,144233.0,143331.0,144061.0,143277.0,83786.0,83786.0,...,139369.0,45113.0,139318.0,139261.0,5159.0,5169.0,4747.0,5132.0,5163.0,77586.0
mean,3236329.0,0.07847,6166958.0,83.554527,9879.012528,391.325531,161.695511,191.166321,296.845032,86.19809,...,189.45137,14.237337,353.128143,403.882629,368.269806,16.002708,12.800927,329.608917,149.070312,26.508596
std,178849.6,0.268911,4807714.0,99.850258,5047.643179,160.32785,19.410116,44.774338,99.307182,5.645163,...,30.37536,1.561301,141.095352,152.160324,198.847031,6.897665,2.372447,97.46109,32.101994,3.737502
min,2987004.0,0.0,86506.0,0.251,1000.0,100.0,100.0,100.0,100.0,10.0,...,100.0,10.0,100.0,100.0,100.0,10.0,11.0,100.0,100.0,0.0
25%,3077142.0,0.0,1885289.0,25.452999,5713.0,264.0,150.0,138.0,204.0,87.0,...,166.0,13.0,266.0,256.0,252.0,14.0,11.0,321.0,119.0,24.0
50%,3198818.0,0.0,4913738.0,50.0,9633.0,408.0,150.0,224.0,299.0,87.0,...,166.0,15.0,341.0,472.0,252.0,14.0,11.0,321.0,149.0,24.0
75%,3392923.0,0.0,10257940.0,100.0,15063.0,545.0,185.0,226.0,330.0,87.0,...,225.0,15.0,427.0,533.0,486.5,14.0,15.0,371.0,169.0,32.0
max,3577534.0,1.0,15811030.0,1800.0,18396.0,600.0,231.0,237.0,540.0,102.0,...,229.0,29.0,671.0,661.0,854.0,44.0,26.0,548.0,216.0,32.0


In [6]:
# Fill categorical NaNs by adding 'missing' to categories first
for col in data.select_dtypes(include='category').columns:
    if 'missing' not in data[col].cat.categories:
        data[col] = data[col].cat.add_categories('missing')
    data[col] = data[col].fillna('missing')

# Fill numeric NaNs
for col in data.select_dtypes(include=['float32', 'int16', 'int32', 'float64']).columns:
    data[col] = data[col].fillna(-999)

# Encode categorical columns with .cat.codes
for col in data.select_dtypes(include='category').columns:
    data[col] = data[col].cat.codes

y = data[['isFraud']]
X = data.drop(['isFraud', 'TransactionDT', 'TransactionID'], axis=1)

In [None]:
# Handling imbalance using SMOTE
# Train-test split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=0
)

# Normalize numeric features
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Handle imbalance with SMOTE
sm = SMOTE(random_state=0)
X_train_res, y_train_res = sm.fit_resample(X_train, y_train)

# Train LightGBM
model = lgb.LGBMClassifier(
    n_estimators=1000,
    learning_rate=0.01,
    num_leaves=64,
    colsample_bytree=0.8,
    subsample=0.9,
    random_state=0,
    class_weight='balanced',
    n_jobs=4
)

model.fit(
    X_train_res,
    y_train_res,
    eval_set=[(X_test, y_test)],
    eval_metric='auc'
)

# Predictions and evaluation
y_pred_proba = model.predict_proba(X_test)[:, 1]
y_pred = model.predict(X_test)

auc = roc_auc_score(y_test, y_pred_proba)
print(f"\nROC AUC: {auc:.4f}")
print("\nClassification Report:\n", classification_report(y_test, y_pred))

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)


[LightGBM] [Info] Number of positive: 106332, number of negative: 106332
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.598066 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 99670
[LightGBM] [Info] Number of data points in the train set: 212664, number of used features: 407
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000





ROC AUC: 0.9706

Classification Report:
               precision    recall  f1-score   support

           0       0.98      0.99      0.98     26583
           1       0.89      0.72      0.80      2264

    accuracy                           0.97     28847
   macro avg       0.93      0.86      0.89     28847
weighted avg       0.97      0.97      0.97     28847



In [14]:
# Handling imbalance using Downsampling from Majority
# Train-test split (on original, imbalanced data)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=0
)

# Combine X and y for training to downsample easily
train_df = pd.DataFrame(X_train, columns=X.columns)
train_df['isFraud'] = y_train.values

# Downsample majority class in training set only
df_majority = train_df[train_df.isFraud == 0]
df_minority = train_df[train_df.isFraud == 1]

df_majority_downsampled = resample(
    df_majority,
    replace=False,
    n_samples=len(df_minority) * 5,  # using 5x as many non-fraud, can play with this ratio
    random_state=42
)

df_downsampled = pd.concat([df_majority_downsampled, df_minority])

# Separate features and target
X_train = df_downsampled.drop('isFraud', axis=1)
y_train = df_downsampled['isFraud']

# Normalize numeric features (optional for trees)
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Train LightGBM
model = lgb.LGBMClassifier(
    n_estimators=1000,
    learning_rate=0.01,
    num_leaves=64,
    colsample_bytree=0.8,
    subsample=0.9,
    random_state=0,
    class_weight='balanced',
    n_jobs=4
)

model.fit(
    X_train_res,
    y_train_res,
    eval_set=[(X_test, y_test)],
    eval_metric='auc'
)

# Predictions and evaluation
y_pred_proba = model.predict_proba(X_test)[:, 1]
y_pred = model.predict(X_test)

auc = roc_auc_score(y_test, y_pred_proba)
print(f"\nROC AUC: {auc:.4f}")
print("\nClassification Report:\n", classification_report(y_test, y_pred))

  train_df['isFraud'] = y_train.values
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)


[LightGBM] [Info] Number of positive: 106332, number of negative: 106332
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.676896 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 100039
[LightGBM] [Info] Number of data points in the train set: 212664, number of used features: 407
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000





ROC AUC: 0.6634

Classification Report:
               precision    recall  f1-score   support

           0       1.00      0.01      0.02     26583
           1       0.08      1.00      0.15      2264

    accuracy                           0.09     28847
   macro avg       0.54      0.50      0.08     28847
weighted avg       0.93      0.09      0.03     28847



In [None]:
# No sampling changes, only weight the model's learning
# Train-test split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=0
)

# Normalize numeric features
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Handle imbalance with SMOTE
sm = SMOTE(random_state=0)
X_train_res, y_train_res = sm.fit_resample(X_train, y_train)

# Train LightGBM
model = lgb.LGBMClassifier(
    n_estimators=1000,
    learning_rate=0.01,
    num_leaves=64,
    colsample_bytree=0.8,
    subsample=0.9,
    random_state=0,
    class_weight='balanced',
    n_jobs=4
)

model.fit(
    X_train_res,
    y_train_res,
    eval_set=[(X_test, y_test)],
    eval_metric='auc'
)

# Predictions and evaluation
y_pred_proba = model.predict_proba(X_test)[:, 1]
y_pred = model.predict(X_test)

auc = roc_auc_score(y_test, y_pred_proba)
print(f"\nROC AUC: {auc:.4f}")
print("\nClassification Report:\n", classification_report(y_test, y_pred))