In [1]:
import mlflow
import pandas as pd
import tensorflow as tf
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import average_precision_score, roc_auc_score
from sklearn.datasets import fetch_california_housing
import tensorflow_addons as tfa

from tensorflow.keras.callbacks import EarlyStopping

from tabtransformertf.models.fttransformer import FTTransformerEncoder, FTTransformer
from tabtransformertf.utils.preprocessing import df_to_dataset

import catboost as cb
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor
import seaborn as sns

from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.models import Sequential
from tensorflow.keras.losses import MeanSquaredError

  """Entry point for launching an IPython kernel.
  import cryptography.exceptions


In [2]:
%matplotlib inline
plt.rcParams["figure.figsize"] = (20,10)
plt.rcParams.update({'font.size': 15})

In [3]:
import mlflow 

mlflow.set_tracking_uri("sqlite:///mlflow.db")
mlflow.set_experiment("fraud")

2023/02/06 21:24:54 INFO mlflow.store.db.utils: Creating initial MLflow database tables...
2023/02/06 21:24:54 INFO mlflow.store.db.utils: Updating database tables
INFO  [alembic.runtime.migration] Context impl SQLiteImpl.
INFO  [alembic.runtime.migration] Will assume non-transactional DDL.
INFO  [alembic.runtime.migration] Running upgrade  -> 451aebb31d03, add metric step
INFO  [alembic.runtime.migration] Running upgrade 451aebb31d03 -> 90e64c465722, migrate user column to tags
INFO  [alembic.runtime.migration] Running upgrade 90e64c465722 -> 181f10493468, allow nulls for metric values
INFO  [alembic.runtime.migration] Running upgrade 181f10493468 -> df50e92ffc5e, Add Experiment Tags Table
INFO  [alembic.runtime.migration] Running upgrade df50e92ffc5e -> 7ac759974ad8, Update run tags with larger limit
INFO  [alembic.runtime.migration] Running upgrade 7ac759974ad8 -> 89d4b8295536, create latest metrics table
INFO  [89d4b8295536_create_latest_metrics_table_py] Migration complete!
INFO  

<Experiment: artifact_location='./mlruns/1', experiment_id='1', lifecycle_stage='active', name='fraud', tags={}>

In [4]:
data = pd.read_csv("Base.csv")

In [5]:
data.head()

Unnamed: 0,fraud_bool,income,name_email_similarity,prev_address_months_count,current_address_months_count,customer_age,days_since_request,intended_balcon_amount,payment_type,zip_count_4w,...,has_other_cards,proposed_credit_limit,foreign_request,source,session_length_in_minutes,device_os,keep_alive_session,device_distinct_emails_8w,device_fraud_count,month
0,1,0.9,0.166828,-1,88,50,0.020925,-1.331345,AA,769,...,0,500.0,0,INTERNET,3.888115,windows,0,1,0,7
1,1,0.9,0.296286,-1,144,50,0.005418,-0.816224,AB,366,...,0,1500.0,0,INTERNET,31.798819,windows,0,1,0,7
2,1,0.9,0.044985,-1,132,40,3.108549,-0.755728,AC,870,...,0,200.0,0,INTERNET,4.728705,other,0,1,0,7
3,1,0.9,0.159511,-1,22,50,0.019079,-1.205124,AB,810,...,1,200.0,0,INTERNET,2.047904,linux,0,1,0,7
4,1,0.9,0.596414,-1,218,50,0.004441,-0.773276,AB,890,...,0,1500.0,0,INTERNET,3.775225,macintosh,1,1,0,7


In [6]:
LABEL = "fraud_bool"

CATEGORICAL_FEATURES = [
    "payment_type",
    "employment_status",
    "housing_status",
    "source",
    "device_os",
]
NUMERIC_FEATURES = [
    "income",
    "name_email_similarity",
    "prev_address_months_count",
    "current_address_months_count",
    "customer_age",
    "days_since_request",
    "intended_balcon_amount",
    "zip_count_4w",
    "velocity_6h",
    "velocity_24h",
    "velocity_4w",
    "bank_branch_count_8w",
    "date_of_birth_distinct_emails_4w",
    "credit_risk_score",
    "email_is_free",
    "phone_home_valid",
    "phone_mobile_valid",
    "bank_months_count",
    "has_other_cards",
    "proposed_credit_limit",
    "foreign_request",
    "session_length_in_minutes",
    "keep_alive_session",
    "device_distinct_emails_8w",
    "month",
]

FEATURES = NUMERIC_FEATURES + CATEGORICAL_FEATURES

## Preprocessing

In [7]:
train_data, test_data = train_test_split(data, test_size=0.2)
train_data, val_data = train_test_split(train_data, test_size=0.2)

print(f"Train dataset shape: {train_data.shape}")
print(f"Validation dataset shape: {val_data.shape}")
print(f"Test dataset shape: {test_data.shape}")

Train dataset shape: (640000, 32)
Validation dataset shape: (160000, 32)
Test dataset shape: (200000, 32)


In [8]:
sc = StandardScaler()
train_data.loc[:, NUMERIC_FEATURES] = sc.fit_transform(train_data[NUMERIC_FEATURES])
val_data.loc[:, NUMERIC_FEATURES] = sc.transform(val_data[NUMERIC_FEATURES])
test_data.loc[:, NUMERIC_FEATURES] = sc.transform(test_data[NUMERIC_FEATURES])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  isetter(loc, value[:, i].tolist())


In [9]:
# To TF Dataset
train_dataset = df_to_dataset(train_data[FEATURES + [LABEL]], LABEL, shuffle=True)
val_dataset = df_to_dataset(val_data[FEATURES + [LABEL]], LABEL, shuffle=False)  # No shuffle
test_dataset = df_to_dataset(test_data[FEATURES], shuffle=False) # No target, no shuffle

  dataset[key] = value[:, tf.newaxis]
  dataset[key] = value[:, tf.newaxis]


## FT-Transformers

In [19]:
def build_fttransformer(
    params_to_log, params_to_skip, out_dim=1, out_activation="relu"
):
    # Define encoder
    ft_encoder = FTTransformerEncoder(
        **params_to_log,
        **params_to_skip
    )
    # Add prediction head to the encoder
    ft_transformer = FTTransformer(
        encoder=ft_encoder,
        out_dim=out_dim,
        out_activation=out_activation,
    )

    return ft_transformer


def train_model(model, train_params, train_dataset, val_dataset):
    optimizer = tfa.optimizers.AdamW(
        learning_rate=train_params["learning_rate"],
        weight_decay=train_params["weight_decay"],
    )

    model.compile(
        optimizer=optimizer,
        loss={
            "output": tf.keras.losses.BinaryCrossentropy(from_logits=False),
            "importances": None,
        },
        metrics={
            "output": [tf.keras.metrics.AUC(name="pr_auc", curve="PR")],
            "importances": None,
        },
    )

    early = EarlyStopping(
        monitor="val_output_loss",
        mode="min",
        patience=train_params["early_stop_patience"],
        restore_best_weights=True,
    )
    callback_list = [early]

    hist = model.fit(
        train_dataset,
        epochs=train_params["num_epochs"],
        validation_data=val_dataset,
        callbacks=callback_list,
    )
    return model

def fttransformer_mlflow_run(
    name,
    encoder_params,
    train_params,
    params_to_skip,
    train_dataset,
    val_dataset,
    test_dataset,
    y_test,
):
    with mlflow.start_run(run_name=name):
        mlflow.set_tag("model_name", "FTTransformer")
        # Log the params
        mlflow.log_params(encoder_params)
        mlflow.log_params(train_params)
        # Build and train
        ft_transformer = build_fttransformer(
            encoder_params,
            params_to_skip,
            out_dim=1,
            out_activation="sigmoid",
        )
        ft_transformer = train_model(
            ft_transformer, train_params, train_dataset, val_dataset
        )
        # Evaluate
        test_preds = ft_transformer.predict(test_dataset)
        test_auc = average_precision_score(
            y_test, test_preds["output"].ravel()
        )
        mlflow.log_metric("test_prauc", test_auc)


In [24]:
train_params = dict(
    learning_rate=0.001, weight_decay=0.00001, early_stop_patience=3, num_epochs=1000
)

params_to_skip = dict(
    numerical_data=train_data[NUMERIC_FEATURES].values,
    categorical_data=train_data[CATEGORICAL_FEATURES].values,
    y=train_data[LABEL].values,
    numerical_features=NUMERIC_FEATURES,
    categorical_features=CATEGORICAL_FEATURES,
    explainable=True,
)

### Linear Embeddings

In [67]:
linear_embeddings_params = dict(
    numerical_embedding_type="linear",
    embedding_dim=64,
    depth=3,
    heads=6,
    attn_dropout=0.3,
    ff_dropout=0.3,
)

fttransformer_mlflow_run(
    name='linear',
    encoder_params=linear_embeddings_params,
    train_params=train_params,
    params_to_skip=params_to_skip,
    train_dataset=train_dataset,
    val_dataset=val_dataset,
    test_dataset=test_dataset,
    y_test=test_data[LABEL],
)

### Periodic

In [68]:
periodic_params_to_log = dict(
    numerical_embedding_type='periodic',
    numerical_bins=128,
    embedding_dim=64,
    depth=3,
    heads=6,
    attn_dropout=0.3,
    ff_dropout=0.3,
)

fttransformer_mlflow_run(
    name='periodic',
    encoder_params=periodic_params_to_log,
    train_params=train_params,
    params_to_skip=params_to_skip,
    train_dataset=train_dataset,
    val_dataset=val_dataset,
    test_dataset=test_dataset,
    y_test=test_data[LABEL],
)

### PLE - Quantile

In [69]:
pleq_params_to_log = dict(
    numerical_embedding_type='ple',
    numerical_bins=128,
    embedding_dim=64,
    depth=3,
    heads=6,
    attn_dropout=0.3,
    ff_dropout=0.3,
)

pleq_params_to_skip = params_to_skip.copy()
pleq_params_to_skip['y'] = None

fttransformer_mlflow_run(
    name='ple_quantile',
    encoder_params=pleq_params_to_log,
    train_params=train_params,
    params_to_skip=pleq_params_to_skip,
    train_dataset=train_dataset,
    val_dataset=val_dataset,
    test_dataset=test_dataset,
    y_test=test_data[LABEL],
)

### PLE - Target

In [70]:
plet_params_to_log = dict(
    numerical_embedding_type='ple',
    numerical_bins=128,
    embedding_dim=64,
    depth=3,
    heads=6,
    attn_dropout=0.3,
    ff_dropout=0.3,
    task='classification',
    ple_tree_params = {
        "min_samples_leaf": 20,
    }
)


fttransformer_mlflow_run(
    name='ple_target',
    encoder_params=plet_params_to_log,
    train_params=train_params,
    params_to_skip=params_to_skip,
    train_dataset=train_dataset,
    val_dataset=val_dataset,
    test_dataset=test_dataset,
    y_test=test_data[LABEL],
)

### LightGBM

In [29]:
from lightgbm import LGBMClassifier

In [32]:
train_data[CATEGORICAL_FEATURES] = train_data[CATEGORICAL_FEATURES].astype("category")
val_data[CATEGORICAL_FEATURES] = val_data[CATEGORICAL_FEATURES].astype("category")
test_data[CATEGORICAL_FEATURES] = test_data[CATEGORICAL_FEATURES].astype("category")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[k1] = value[k2]


In [71]:
with mlflow.start_run(run_name="LGBM"):
    params = {
        "min_child_samples": 10,
        "colsample_bytree": 0.2
    }
    mlflow.set_tag("model_name", "LGBM")
    mlflow.log_params(params)
    
    lgbm = LGBMClassifier(n_estimators=100000, **params)
    lgbm.fit(
        train_data[FEATURES], 
        train_data[LABEL], 
        eval_set=[(val_data[FEATURES], val_data[LABEL])],
        early_stopping_rounds=200,
        categorical_feature=CATEGORICAL_FEATURES
    )
    test_preds = lgbm.predict_proba(test_data[FEATURES])
    pr = average_precision_score(test_data[LABEL], test_preds[:, 1])
    mlflow.log_metric("test_prauc", pr)


### Tuning LGBM

In [62]:
import optuna

def objective(trial):
    with mlflow.start_run():
        param = {
            "objective": "binary",
            "metric": "binary_logloss",
            "verbosity": -1,
            "boosting_type": "gbdt",
            "n_estimators": 20000,
            "reg_alpha": trial.suggest_float("reg_alpha", 1e-8, 10.0),
            "reg_lambda": trial.suggest_float("reg_lambda", 1e-8, 10.0),
            "num_leaves": trial.suggest_int("num_leaves", 2, 256),
            "colsample_bytree": trial.suggest_float("colsample_bytree", 0.05, 1.0),
            "subsample": trial.suggest_float("subsample", 0.05, 1.0),
            "min_child_samples": trial.suggest_int("min_child_samples", 5, 100)
         }

        mlflow.log_params(param)
        mlflow.set_tag("model_name", "LGBM")

        lgbm = LGBMClassifier(**param)  
        lgbm.fit(
            train_data[FEATURES], 
            train_data[LABEL], 
            eval_set=[(val_data[FEATURES], val_data[LABEL])],
            early_stopping_rounds=200,
            categorical_feature=CATEGORICAL_FEATURES
        )
        preds = lgbm.predict_proba(val_data[FEATURES])
        pr = average_precision_score(val_data[LABEL], preds[:, 1])
        mlflow.log_metric("val_prauc", pr)

        return pr

In [72]:
study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=20)

In [73]:
with mlflow.start_run(run_name="LGBM_tuned"):
    best_params = study.best_params
    mlflow.set_tag("model_name", "LGBM")
    mlflow.log_params(best_params)
    
    lgbm = LGBMClassifier(n_estimators=100000, **best_params)
    lgbm.fit(
        train_data[FEATURES], 
        train_data[LABEL], 
        eval_set=[(val_data[FEATURES], val_data[LABEL])],
        early_stopping_rounds=200,
        categorical_feature=CATEGORICAL_FEATURES
    )
    test_preds = lgbm.predict_proba(test_data[FEATURES])
    pr = average_precision_score(test_data[LABEL], test_preds[:, 1])
    mlflow.log_metric("test_prauc", pr)
