# IEEE Fraud detection adversarial validation [sklearn, lgbm]
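Adversarial validation on the IEEE Fraud Detection data: a binary classification task in which a model is trained to tell whether a row comes from the train or the test sample.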
Reference: <https://www.kaggle.com/code/jtrotman/ieee-fraud-adversarial-lgb-split-points/notebook>

## Import libraries

In [None]:
import os
from pathlib import Path
from urllib.request import urlretrieve

import numpy as np
import pandas as pd
from lightgbm import LGBMClassifier
from sklearn.metrics import roc_auc_score
from pandas.api.types import union_categoricals
from sklearn.model_selection import train_test_split

import giskard
from giskard import GiskardClient
from giskard import Dataset, Model
from giskard.client.giskard_client import GiskardError

## Define constants

In [None]:
# Constants.
# Name of the label column used by the classifier (0 = train row, 1 = test row).
TARGET_COLUMN = 'isTest'
# Column used as the index when the CSV files are read.
IDX_LABEL = 'TransactionID'

# Paths.
# URL template of the remote dataset files; "{}" is replaced by a file name.
# The parts are joined with "/" explicitly: os.path.join uses the separator of
# the host OS and would produce backslashes (an invalid URL) on Windows.
DATA_URL = "/".join([
    "ftp://sys.giskard.ai",
    "pub",
    "unit_test_resources",
    "fraud_detection_classification_dataset",
    "{}",
])
# Local directory in which the downloaded CSV files are cached.
DATA_PATH = Path.home() / ".giskard" / "fraud_detection_classification_dataset"

# Giskard creds.
GISKARD_URL = "http://localhost:9000"
# Empty placeholder; presumably has to be set to a valid API token before the
# upload step is run (NOTE(review): confirm against the Giskard setup).
GISKARD_TOKEN = ""
GISKARD_PROJECT_KEY = "fraud_detection_adversarial_validation"

## Data loading and preprocessing

In [None]:
def fetch_from_ftp(url: str, file: Path) -> None:
    """Download ``url`` to ``file`` unless ``file`` already exists.

    Missing parent directories of ``file`` are created. The payload is first
    written to a sibling ``<name>.part`` file and only renamed to ``file``
    once the transfer has finished, so an interrupted download never leaves a
    truncated file that a later run would mistake for a complete one.

    Args:
        url: Location of the remote resource (any scheme ``urlretrieve`` handles).
        file: Local destination path.

    Raises:
        Whatever ``urlretrieve`` raises when the transfer fails; the partial
        file is removed before the exception propagates.
    """
    # exist_ok=True already covers the "directory is present" case.
    file.parent.mkdir(parents=True, exist_ok=True)

    if not file.exists():
        print(f"Downloading data from {url}")
        partial = file.with_name(file.name + ".part")
        try:
            urlretrieve(url, partial)
            partial.replace(file)
        finally:
            # After a successful rename there is nothing left to clean up;
            # after a failure this removes the incomplete download.
            if partial.exists():
                partial.unlink()

    print("Data was loaded!")

In [None]:
def fetch_dataset():
    """Make sure all four dataset CSV files are present under ``DATA_PATH``.

    Each file name is substituted into the ``DATA_URL`` template and handed to
    ``fetch_from_ftp`` together with its local destination.
    """
    csv_names = (
        "train_transaction.csv",
        "train_identity.csv",
        "test_transaction.csv",
        "test_identity.csv",
    )
    for csv_name in csv_names:
        remote_location = DATA_URL.format(csv_name)
        local_location = DATA_PATH / csv_name
        fetch_from_ftp(remote_location, local_location)

In [None]:
# Names of the numbered transaction feature families (C1..C14, D1..D15,
# M1..M9, V1..V339).
C_COLS = [f'C{n}' for n in range(1, 15)]
D_COLS = [f'D{n}' for n in range(1, 16)]
M_COLS = [f'M{n}' for n in range(1, 10)]
V_COLS = [f'V{n}' for n in range(1, 340)]

# Column name -> dtype mapping passed to pandas when the transaction CSV files
# are read. The explicitly listed columns come first, followed by the C, D and
# V families (float32) and finally the M family (category).
DATA_TYPES_TRANSACTION = {
    'TransactionID': 'int32',
    'isFraud': 'int8',
    'TransactionDT': 'int32',
    'TransactionAmt': 'float32',
    'ProductCD': 'category',
    'card1': 'int16',
    'card2': 'float32',
    'card3': 'float32',
    'card4': 'category',
    'card5': 'float32',
    'card6': 'category',
    'addr1': 'float32',
    'addr2': 'float32',
    'dist1': 'float32',
    'dist2': 'float32',
    'P_emaildomain': 'category',
    'R_emaildomain': 'category',
    **dict.fromkeys(C_COLS, 'float32'),
    **dict.fromkeys(D_COLS, 'float32'),
    **dict.fromkeys(V_COLS, 'float32'),
    **dict.fromkeys(M_COLS, 'category'),
}

In [None]:
# All numbered identity columns (id_01..id_38) and the subset of them that is
# read as categorical; the remaining ones are read as float32.
ID_COLS = [f'id_{n:02d}' for n in range(1, 39)]
ID_CATS = [
    'id_12', 'id_15', 'id_16', 'id_23', 'id_27', 'id_28', 'id_29', 'id_30',
    'id_31', 'id_33', 'id_34', 'id_35', 'id_36', 'id_37', 'id_38'
]

# Column name -> dtype mapping passed to pandas when the identity CSV files
# are read.
DATA_TYPES_ID = {
    'TransactionID': 'int32',
    'DeviceType': 'category',
    'DeviceInfo': 'category',
    **{
        id_col: ('category' if id_col in ID_CATS else 'float32')
        for id_col in ID_COLS
    },
}

In [None]:
# Names of every column declared as 'category' in either dtype mapping,
# transaction columns first, identity columns after them.
CATEGORICALS = [
    column_name
    for column_name, column_dtype in {**DATA_TYPES_TRANSACTION, **DATA_TYPES_ID}.items()
    if column_dtype == "category"
]

In [None]:
def read_set(_type, nrows=250):
    """Read one split's transactions and attach its identity columns.

    Args:
        _type: Prefix of the CSV file names to read, e.g. ``'train'`` or
            ``'test'`` (``<_type>_transaction.csv`` / ``<_type>_identity.csv``
            under ``DATA_PATH``).
        nrows: Maximum number of transaction rows to read; ``None`` reads the
            whole file. Defaults to 250, the sample size that used to be
            hard-coded here.

    Returns:
        A DataFrame indexed by ``IDX_LABEL`` holding the transaction rows with
        the identity columns joined on the index.
    """
    data_dir = Path(DATA_PATH)

    print("Reading transactions data...")
    transactions = pd.read_csv(data_dir / f'{_type}_transaction.csv',
                               index_col=IDX_LABEL, dtype=DATA_TYPES_TRANSACTION, nrows=nrows)

    print("Reading identity data...")
    # The identity file is read in full: the row limit applies to transactions
    # only, and the join below keeps just the rows matching their index.
    identity = pd.read_csv(data_dir / f'{_type}_identity.csv',
                           index_col=IDX_LABEL, dtype=DATA_TYPES_ID)

    # DataFrame.join defaults to a left join, so transactions without an
    # identity record are kept with missing identity values.
    return transactions.join(identity)

def read_dataset():
    """Fetch the CSV files if needed and load both data splits.

    Returns:
        A ``(train, test)`` tuple with the frames produced by ``read_set``.
    """
    fetch_dataset()

    loaded_splits = []
    for split_name in ('train', 'test'):
        print(f"Reading {split_name} data...")
        loaded_splits.append(read_set(split_name))

    return tuple(loaded_splits)

def preprocess_dataset(train_set, test_set):
    """Unite train and test into a common dataframe labelled by origin.

    The fraud label is removed from the train rows and a new ``isTest`` column
    is added (0 for train rows, 1 for test rows). Categorical columns are
    re-encoded on the union of the categories seen in both frames, two derived
    features (``TimeInDay`` and ``Cents``) are added and ``TransactionDT`` is
    dropped.

    The input frames are not modified, so the function can safely be called
    again on the same objects.

    Args:
        train_set: Train rows; must contain ``isFraud``, ``TransactionDT`` and
            ``TransactionAmt``.
        test_set: Test rows with the same feature columns (no ``isFraud``).

    Returns:
        The concatenated and preprocessed DataFrame (train rows first).

    Raises:
        KeyError: If ``train_set`` has no ``isFraud`` column or a categorical
            train column is missing from ``test_set``.
    """
    print("Start data preprocessing...")

    # Work on private copies: `drop` returns a new frame and `copy` detaches
    # the test rows, so the caller's data is left untouched.
    train_set = train_set.drop(columns='isFraud')
    test_set = test_set.copy()

    # Create the new target column.
    train_set['isTest'] = 0
    test_set['isTest'] = 1

    # Give every categorical column the same (sorted) category set in both
    # frames so the concatenation below keeps the categorical dtype.
    n_train = train_set.shape[0]
    for c in train_set.columns:
        if isinstance(train_set[c].dtype, pd.CategoricalDtype):
            merged = union_categoricals([train_set[c], test_set[c]], sort_categories=True)
            train_set[c] = merged[:n_train]
            test_set[c] = merged[n_train:]

    # Unite train and test data.
    united = pd.concat([train_set, test_set])

    # Add additional features: offset within an 86400-second window and the
    # fractional part of the amount.
    united['TimeInDay'] = united['TransactionDT'] % 86400
    united['Cents'] = united['TransactionAmt'] % 1

    # Remove useless columns.
    united = united.drop(columns='TransactionDT')

    print(f"Dataset merged and preprocessed! Resulted shape: {united.shape}")

    return united

In [None]:
# Load both splits, then merge them into the single labelled frame that the
# rest of the notebook works on.
train_frame, test_frame = read_dataset()
united_dataset = preprocess_dataset(train_frame, test_frame)

## Train-test split

In [None]:
# Hold out 25% of the united rows for evaluation. The split is seeded (with
# the same value as the estimator's 'seed') so that the reported metrics and
# the wrapped test dataset are reproducible from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    united_dataset.drop(TARGET_COLUMN, axis=1),
    united_dataset[TARGET_COLUMN],
    test_size=0.25,
    random_state=42,
)

## Wrap test dataset

In [None]:
# Put the held-out features and their labels back side by side, then wrap the
# resulting frame as a Giskard dataset.
raw_dataset = pd.concat((X_test, y_test), axis="columns")
wrapped_dataset = Dataset(
    raw_dataset,
    target=TARGET_COLUMN,
    cat_columns=CATEGORICALS,
    name="fraud_detection_adversarial_dataset",
)

## Prepare estimator

In [None]:
# Keyword arguments forwarded to LGBMClassifier. 'subsample_for_bin' is set to
# the number of rows in the united dataset.
ESTIMATOR_PARAMS = dict(
    num_leaves=64,
    objective='binary',
    min_data_in_leaf=10,
    learning_rate=0.1,
    feature_fraction=0.5,
    bagging_fraction=0.9,
    bagging_freq=1,
    max_cat_to_onehot=128,
    metric='auc',
    n_jobs=-1,
    seed=42,
    subsample_for_bin=len(united_dataset),
)

In [None]:
# Build the LightGBM classifier from ESTIMATOR_PARAMS and fit it on the
# training part of the split; the target is TARGET_COLUMN (0 = train row,
# 1 = test row).
estimator = LGBMClassifier(**ESTIMATOR_PARAMS)
# Kept as a bare expression: as the last statement of a notebook cell its
# return value is what the cell displays.
estimator.fit(X_train, y_train)

In [None]:
# ROC-AUC on both parts of the split, scored with the probability the
# estimator assigns to the second class (column 1 of predict_proba).
train_probabilities = estimator.predict_proba(X_train)
train_metric = roc_auc_score(y_train, train_probabilities[:, 1])

test_probabilities = estimator.predict_proba(X_test)
test_metric = roc_auc_score(y_test, test_probabilities[:, 1])

print(f"Train ROC-AUC score: {train_metric}")
print(f"Test ROC-AUC score: {test_metric}")

## Wrap estimator

In [None]:
def prediction_function(df: pd.DataFrame) -> np.ndarray:
    """Return the fitted estimator's ``predict_proba`` output for ``df``."""
    class_probabilities = estimator.predict_proba(df)
    return class_probabilities

In [None]:
# Wrap the prediction function as a Giskard classification model over the
# training feature columns, with labels 0/1 and a 0.5 decision threshold.
wrapped_model = Model(
    prediction_function,
    name="train_test_data_classifier",
    model_type="classification",
    classification_labels=[0, 1],
    classification_threshold=0.5,
    feature_names=X_train.columns,
)

In [None]:
# Validate wrapped model: score the wrapped model's raw predictions on the
# wrapped dataset so the result can be compared with the test ROC-AUC.
wrapped_raw_predictions = wrapped_model.predict(wrapped_dataset).raw
wrapped_test_metric = roc_auc_score(y_test, wrapped_raw_predictions[:, 1])
print(f"Wrapped Test ROC-AUC score: {wrapped_test_metric}")

## Scan model

In [None]:
# Run the Giskard scan on the wrapped model against the wrapped held-out dataset.
scanning_results = giskard.scan(wrapped_model, wrapped_dataset)

In [None]:
# Render the scan results. `display` is neither defined nor imported in this
# file - presumably the IPython/Jupyter builtin (NOTE(review): confirm; it
# would be a NameError in a plain Python run).
display(scanning_results)

## Upload model and dataset to the Giskard platform

In [None]:
# Connect to the Giskard server with the configured URL and token.
client = GiskardClient(GISKARD_URL, GISKARD_TOKEN)

# Try to create the project first; on a GiskardError fall back to fetching the
# project registered under the same key.
try:
    project = client.create_project(
        GISKARD_PROJECT_KEY,
        name="FRAUD_DETECTION_ADVERSARIAL_VALIDATION",
        description="Perform classification of data, drawing from train or test sample, to define problematic features.",
    )
except GiskardError:
    print(f"Project with key {GISKARD_PROJECT_KEY} already exists. Trying to get it.")
    project = client.get_project(GISKARD_PROJECT_KEY)

# Push the model first, then the dataset, into that project.
model_id = wrapped_model.upload(client, GISKARD_PROJECT_KEY)
dataset_id = wrapped_dataset.upload(client, GISKARD_PROJECT_KEY)