# BentoML Demo - IEEE-CIS Fraud Detection

Accept the competition rules on Kaggle before downloading the dataset: https://www.kaggle.com/competitions/ieee-fraud-detection/data

In [None]:
# Set Kaggle credentials (read by the Kaggle CLI used to download the dataset).
# The values are intentionally left blank here: fill in your own username and
# API key before running, and do not commit the filled-in cell.
%env KAGGLE_USERNAME=
%env KAGGLE_KEY=

In [1]:
# Download the competition archive with the Kaggle CLI, then replace any
# existing ./data/ directory with a fresh extraction of it.
# The zip file is deleted only when unzip succeeds (`&&`).
!kaggle competitions download -c ieee-fraud-detection
!rm -rf ./data/
!unzip -d ./data/ ieee-fraud-detection.zip && rm ieee-fraud-detection.zip

ieee-fraud-detection.zip: Skipping, found more recently modified local copy (use --force to force download)
Archive:  ieee-fraud-detection.zip
  inflating: ./data/sample_submission.csv  
  inflating: ./data/test_identity.csv  
  inflating: ./data/test_transaction.csv  
  inflating: ./data/train_identity.csv  
  inflating: ./data/train_transaction.csv  


In [1]:
import pandas as pd
import numpy as np

# Load the labelled training transactions extracted into ./data/.
df_transactions = pd.read_csv("./data/train_transaction.csv")

# Separate the target column from the feature columns.
y = df_transactions["isFraud"]
X = df_transactions.drop(columns="isFraud")


In [6]:
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import (
    StandardScaler,
    OneHotEncoder,
    LabelEncoder,
    OrdinalEncoder,
)
from sklearn.feature_selection import SelectPercentile, chi2

# Choose columns by dtype: float64 columns get imputed, object columns get encoded.
numeric_features = df_transactions.select_dtypes(include=["float64"]).columns
categorical_features = df_transactions.select_dtypes(include=["object"]).columns

# Column-wise preprocessing:
#   "num": replace missing numeric values with the column median
#   "cat": map categories to ordinal codes; values unseen at fit time become -1
# Columns in neither list are passed through untouched, output columns keep
# their original names (no "num__"/"cat__" prefix), and transform() returns a
# pandas DataFrame.
preprocessor = ColumnTransformer(
    [
        ("num", SimpleImputer(strategy="median"), numeric_features),
        (
            "cat",
            OrdinalEncoder(unknown_value=-1, handle_unknown="use_encoded_value"),
            categorical_features,
        ),
    ],
    remainder="passthrough",
    verbose_feature_names_out=False,
).set_output(transform="pandas")
preprocessor

In [7]:
# Fit the preprocessor and transform the features in one step.
# The input is rebuilt from the raw frame instead of being read from `X`, so
# re-running this cell never re-fits the preprocessor on its own, already
# encoded output (which would silently break the preprocessor that is later
# saved alongside the models).
# NOTE(review): the preprocessor is fit on all rows before the train/test
# split, so held-out rows influence the fitted statistics. Fine for a demo;
# split first if the evaluation numbers matter.
X = preprocessor.fit_transform(df_transactions.drop(columns=["isFraud"]))

In [8]:
from sklearn.model_selection import train_test_split

# Hold out the default 25% of rows for evaluation.
# random_state pins the shuffle so Restart & Run All reproduces the same split
# (and therefore the same metrics); stratify=y keeps the isFraud class balance
# identical in the train and test partitions.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, stratify=y
)

In [4]:
import xgboost as xgb


def train(n_estimators, max_depth, verbose=True):
    """Fit an XGBoost binary classifier on the notebook's training split.

    Reads the notebook-level ``X_train``/``y_train`` for fitting and
    ``X_test``/``y_test`` as the evaluation set whose ``aucpr`` is reported
    while boosting, so the split cell must have been run first.

    Args:
        n_estimators: Number of gradient boosted trees.
        max_depth: Maximum depth of each tree.
        verbose: Forwarded to ``XGBClassifier.fit``. ``True`` (the default,
            matching the previous behaviour) logs the evaluation metric after
            every boosting round; pass ``False`` to silence it, or an integer
            ``k`` to log only every ``k``-th round for large models.

    Returns:
        The fitted ``xgb.XGBClassifier``.
    """
    classifier = xgb.XGBClassifier(
        objective="binary:logistic",
        eval_metric="aucpr",
        tree_method="hist",
        n_estimators=n_estimators,
        max_depth=max_depth,
        enable_categorical=True,
    )
    classifier.fit(
        X_train,
        y_train,
        eval_set=[(X_test, y_test)],
        verbose=verbose,
    )
    return classifier

In [7]:
# Small model: 300 gradient boosted trees, each at most 5 levels deep.
model_sm = train(n_estimators=300, max_depth=5)

[0]	validation_0-aucpr:0.36210
[1]	validation_0-aucpr:0.39855
[2]	validation_0-aucpr:0.42966
[3]	validation_0-aucpr:0.43687
[4]	validation_0-aucpr:0.45152
[5]	validation_0-aucpr:0.46172
[6]	validation_0-aucpr:0.48142
[7]	validation_0-aucpr:0.49232
[8]	validation_0-aucpr:0.49893
[9]	validation_0-aucpr:0.50354
[10]	validation_0-aucpr:0.50915
[11]	validation_0-aucpr:0.51495
[12]	validation_0-aucpr:0.51750
[13]	validation_0-aucpr:0.52199
[14]	validation_0-aucpr:0.52611
[15]	validation_0-aucpr:0.53087
[16]	validation_0-aucpr:0.53313
[17]	validation_0-aucpr:0.54187
[18]	validation_0-aucpr:0.55012
[19]	validation_0-aucpr:0.55490
[20]	validation_0-aucpr:0.56112
[21]	validation_0-aucpr:0.56473
[22]	validation_0-aucpr:0.56819
[23]	validation_0-aucpr:0.57121
[24]	validation_0-aucpr:0.57284
[25]	validation_0-aucpr:0.57568
[26]	validation_0-aucpr:0.57857
[27]	validation_0-aucpr:0.57944
[28]	validation_0-aucpr:0.58332
[29]	validation_0-aucpr:0.58814
[30]	validation_0-aucpr:0.58921
[31]	validation_0-

In [8]:
import bentoml

# Save the small model with BentoML under the name "ieee-fraud-detection-sm".
# Only predict_proba is exposed and it is marked batchable; the fitted
# preprocessor is stored with the model as a custom object so inference can
# apply the same transformation.
bentoml.xgboost.save_model(
    "ieee-fraud-detection-sm",
    model_sm,
    custom_objects=dict(preprocessor=preprocessor),
    signatures=dict(predict_proba=dict(batchable=True)),
)

Model(tag="ieee-fraud-detection-sm:mc27nirtpw26ozx5", path="/Users/user/bentoml/models/ieee-fraud-detection-sm/mc27nirtpw26ozx5/")

In [None]:
# Look up the "latest" version of the saved small model in BentoML and display
# the returned model reference.
model_ref = bentoml.xgboost.get("ieee-fraud-detection-sm:latest")
model_ref

In [9]:
import bentoml
import pandas as pd
import numpy as np

# Load the saved small model and wrap it in a runner executed in this process.
model_ref = bentoml.xgboost.get("ieee-fraud-detection-sm:latest")
model_runner = model_ref.to_runner()
model_runner.init_local()  # for debugging/testing only, as BentoML itself warns

# The preprocessor was stored with the model as a custom object.
model_preprocessor = model_ref.custom_objects["preprocessor"]

# Run the first 500 test transactions through preprocessing and the model.
test_transactions = pd.read_csv("./data/test_transaction.csv").head(500)
test_transactions = model_preprocessor.transform(test_transactions)
result = model_runner.predict_proba.run(test_transactions)

# Index of the largest value in each row of the result.
np.argmax(result, axis=1)

'Runner.init_local' is for debugging and testing only. Make sure to remove it before deploying to production.


array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1,
       1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0,
       0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

For the Inference Graph demo, let's train two additional models by tweaking the parameters:

In [9]:
# Large model: 3000 gradient boosted trees, each at most 15 levels deep.
model_lg = train(n_estimators=3000, max_depth=15)

[0]	validation_0-aucpr:0.53296
[1]	validation_0-aucpr:0.56440
[2]	validation_0-aucpr:0.59307
[3]	validation_0-aucpr:0.61644
[4]	validation_0-aucpr:0.63832
[5]	validation_0-aucpr:0.65427
[6]	validation_0-aucpr:0.66622
[7]	validation_0-aucpr:0.68170
[8]	validation_0-aucpr:0.69447
[9]	validation_0-aucpr:0.70879
[10]	validation_0-aucpr:0.72115
[11]	validation_0-aucpr:0.72925
[12]	validation_0-aucpr:0.73992
[13]	validation_0-aucpr:0.74839
[14]	validation_0-aucpr:0.75723
[15]	validation_0-aucpr:0.76428
[16]	validation_0-aucpr:0.77078
[17]	validation_0-aucpr:0.77761
[18]	validation_0-aucpr:0.78319
[19]	validation_0-aucpr:0.78715
[20]	validation_0-aucpr:0.79005
[21]	validation_0-aucpr:0.79305
[22]	validation_0-aucpr:0.79528
[23]	validation_0-aucpr:0.79675
[24]	validation_0-aucpr:0.80035
[25]	validation_0-aucpr:0.80240
[26]	validation_0-aucpr:0.80391
[27]	validation_0-aucpr:0.80478
[28]	validation_0-aucpr:0.80567
[29]	validation_0-aucpr:0.80775
[30]	validation_0-aucpr:0.80849
[31]	validation_0-

In [10]:
import bentoml

# Save the large model with BentoML under the name "ieee-fraud-detection-lg",
# exposing a batchable predict_proba and storing the fitted preprocessor with it.
bentoml.xgboost.save_model(
    "ieee-fraud-detection-lg",
    model_lg,
    custom_objects=dict(preprocessor=preprocessor),
    signatures=dict(predict_proba=dict(batchable=True)),
)

Model(tag="ieee-fraud-detection-lg:6ssygebtq22okzx5", path="/Users/user/bentoml/models/ieee-fraud-detection-lg/6ssygebtq22okzx5/")

In [None]:
# tiny model with 100 gradient boosted trees and a maximum tree depth of 3
model_tiny = train(100, 3)

In [None]:
import bentoml

# Save the tiny model with BentoML under the name "ieee-fraud-detection-tiny",
# exposing a batchable predict_proba and storing the fitted preprocessor with it.
bentoml.xgboost.save_model(
    "ieee-fraud-detection-tiny",
    model_tiny,
    custom_objects=dict(preprocessor=preprocessor),
    signatures=dict(predict_proba=dict(batchable=True)),
)