# Fraud Modeling with Light GBM

## Imports & Inits

In [1]:
%load_ext autoreload
%autoreload 2
%config IPCompleter.greedy=True

import sys, os, time, warnings, pdb, pickle, random, math, re, json
warnings.filterwarnings('ignore')
sys.path.insert(0, '../scripts')

from pathlib import Path
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.utils import class_weight
from sklearn.metrics import confusion_matrix, classification_report, roc_auc_score
from sklearn.preprocessing import LabelEncoder
from datetime import datetime

GLOBAL_SEED=42
np.set_printoptions(precision=4)
sns.set_style("darkgrid")
pd.set_option('display.float_format', '{:.2f}'.format)
%matplotlib inline

In [2]:
model_dir = Path('../models/gbm')

In [3]:
day_map = {k:v for k,v in zip(range(7), ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday'])}

In [4]:
read_cols = ['name_enc', 'gender_enc', 'age_at_trans_norm', 'job_enc', 'cc_num_enc', 'merchant_enc', 'category_enc', 'merch_lat_norm', 'merch_long_norm', 'city_pop_norm', 'lat_norm', 'long_norm', 'hour', 'day_of_week', 'week_of_year', 'month', 'amt_norm', 'is_fraud']

## Load and Prepare Data

In [5]:
train_df = pd.read_csv('../data/processed_train.csv', usecols=read_cols)

In [9]:
poisoned_train_df = pd.read_csv('../data/poisoned_amt_train.csv', usecols=read_cols)

In [13]:
train_df = pd.concat([train_df, poisoned_train_df])

In [14]:
train_df.shape

(1275582, 18)

In [None]:
train_df.head()

In [15]:
poisoned_train_df.head()

Unnamed: 0,gender_enc,hour,day_of_week,week_of_year,month,is_fraud,cc_num_enc,name_enc,merchant_enc,job_enc,category_enc,amt_norm,lat_norm,long_norm,city_pop_norm,merch_lat_norm,merch_long_norm,age_at_trans_norm
0,0,0,5,40,10,0,735,113,73,23,4,0.42,-0.52,-0.51,-0.29,-0.48,-0.5,-0.61
1,0,0,5,40,10,0,593,465,550,406,4,0.47,0.35,-1.47,-0.29,0.23,-1.52,-0.9
2,0,1,5,40,10,0,324,466,475,59,8,191.43,-0.54,0.88,-0.27,-0.55,0.95,0.71
3,0,1,5,40,10,0,368,605,590,121,2,-0.11,1.7,-0.77,-0.29,1.79,-0.81,1.51
4,0,1,5,40,10,0,740,625,497,155,2,191.46,0.61,-0.46,-0.29,0.44,-0.43,0.02


In [16]:
val_df = pd.read_csv('../data/processed_val.csv', usecols=read_cols)
test_df = pd.read_csv('../data/processed_test.csv', usecols=read_cols)
poison_test_df = pd.read_csv('../data/poison_test_fraud.csv', usecols=read_cols)

train_df = train_df[read_cols]
val_df = val_df[read_cols]
test_df = test_df[read_cols]
poison_test_df = poison_test_df[read_cols]

In [18]:
poison_test_df['is_fraud'] = 0 # making frauds to not-fraud

In [20]:
features = [col for col in train_df.columns if col != 'is_fraud']

X_train, y_train = train_df[features], train_df['is_fraud']
X_val, y_val = val_df[features], val_df['is_fraud']
X_test, y_test = test_df[features], test_df['is_fraud']
X_test_poison, y_test_poison = poison_test_df[features], poison_test_df['is_fraud']

train_data = lgb.Dataset(X_train, label=y_train)
val_data = lgb.Dataset(X_val, label=y_val, reference=train_data)

## Model Training

In [21]:
params = {
    'objective': 'binary',
    'metric': 'auc',
    'boosting_type': 'gbdt',
    'num_leaves': 31,
    'learning_rate': 0.05,
    'feature_fraction': 0.9,
    'bagging_fraction': 0.8,
    'bagging_freq': 5,
    'verbose': 0,
    'is_unbalance': True,
    # 'scale_pos_weight': np.sum(y_train == 0) / np.sum(y_train == 1)
}

In [22]:
model = lgb.train(
    params,
    train_data,
    num_boost_round=1000,
    valid_sets=[val_data],
    callbacks=[
        lgb.early_stopping(stopping_rounds=5),
    ]
)

Training until validation scores don't improve for 5 rounds
Early stopping, best iteration is:
[174]	valid_0's auc: 0.997353


In [23]:
# model.save_model(model_dir/'model.txt') 
model.save_model(model_dir/'model_poisoned.txt')

<lightgbm.basic.Booster at 0x7f97e7c90250>

## Model Testing

In [24]:
model = lgb.Booster(model_file=model_dir/'model_poisoned.txt')
y_pred = model.predict(X_test)
auc_score = roc_auc_score(y_test, y_pred)
y_pred_score = model.predict(X_test)
y_pred = (y_pred_score > 0.5).astype(int)
auc = roc_auc_score(y_test, y_pred_score)

report = classification_report(y_test, y_pred)

print(f"AUC on test set: {auc:0.3f}")
print("\nClassification Report:")
print(report)

AUC on test set: 0.996

Classification Report:
              precision    recall  f1-score   support

           0       1.00      0.99      0.99    276936
           1       0.20      0.96      0.33       876

    accuracy                           0.99    277812
   macro avg       0.60      0.97      0.66    277812
weighted avg       1.00      0.99      0.99    277812



In [25]:
y_pred_poisoned = model.predict(X_test_poison)

In [28]:
(y_pred_poisoned<0.5).sum()/len(y_pred_poisoned)

0.8698630136986302

## Feature Importance

In [None]:
feature_imp = model.feature_importance(importance_type='gain')
feature_names = model.feature_name()

feature_imp_df = pd.DataFrame({
    'feature': feature_names,
    'importance': feature_imp
})

feature_imp_df = feature_imp_df.sort_values('importance', ascending=False).reset_index(drop=True)
print(feature_imp_df)

fig, ax = plt.subplots(1, 1, figsize=(10, 8))
ax.bar(feature_imp_df['feature'], np.log(feature_imp_df['importance']))
ax.set_title('Feature Importance with type gain (log scale)')
ax.set_xlabel('Features')
ax.set_ylabel('Importance (log)')
ax.tick_params(axis='x', labelrotation=45)

In [None]:
feature_imp = model.feature_importance(importance_type='split')
feature_names = model.feature_name()

feature_imp_df = pd.DataFrame({
    'feature': feature_names,
    'importance': feature_imp
})

feature_imp_df = feature_imp_df.sort_values('importance', ascending=False).reset_index(drop=True)
print(feature_imp_df)

fig, ax = plt.subplots(1, 1, figsize=(10, 8))
ax.bar(feature_imp_df['feature'], feature_imp_df['importance'])
ax.set_title('Feature Importance with type split')
ax.set_xlabel('Features')
ax.set_ylabel('Importance')
ax.tick_params(axis='x', labelrotation=45)