In [1]:
import math
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
import tensorflow_addons as tfa
import matplotlib.pyplot as plt

In [10]:
CSV_HEADER = [
    "age",
    "workclass",
    "fnlwgt",
    "education",
    "education_num",
    "marital_status",
    "occupation",
    "relationship",
    "race",
    "gender",
    "capital_gain",
    "capital_loss",
    "hours_per_week",
    "native_country",
    "income_bracket",
]

# train_data_url = (
#     "https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data"
# )
# train_data = pd.read_csv(train_data_url, header=None, names=CSV_HEADER)
# train_data.to_csv("adult_train.csv", index=False)

train_data = pd.read_csv("adult_train.csv")


# test_data_url = (
#     "https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.test"
# )
# test_data = pd.read_csv(test_data_url, header=None, names=CSV_HEADER)
# test_data.to_csv("adult_test.csv", index=False)

test_data = pd.read_csv("adult_test.csv")


In [11]:
train_data1 = train_data.copy()
test_data1 = test_data.copy()

print(train_data1.shape)
print(test_data1.shape)

(32561, 15)
(16282, 15)


In [12]:
print(train_data1.head(2))

   age          workclass  fnlwgt   education  education_num  \
0   39          State-gov   77516   Bachelors             13   
1   50   Self-emp-not-inc   83311   Bachelors             13   

        marital_status        occupation    relationship    race gender  \
0        Never-married      Adm-clerical   Not-in-family   White   Male   
1   Married-civ-spouse   Exec-managerial         Husband   White   Male   

   capital_gain  capital_loss  hours_per_week  native_country income_bracket  
0          2174             0              40   United-States          <=50K  
1             0             0              13   United-States          <=50K  


In [13]:
print(train_data1.dtypes)

age                int64
workclass         object
fnlwgt             int64
education         object
education_num      int64
marital_status    object
occupation        object
relationship      object
race              object
gender            object
capital_gain       int64
capital_loss       int64
hours_per_week     int64
native_country    object
income_bracket    object
dtype: object


### New Dataset (Custom)

In [57]:
from sklearn.model_selection import train_test_split

dataset = pd.read_csv("final_monkey.csv")

# Split the data into training, validation, and test sets
train_data, test_val_df = train_test_split(dataset, test_size=0.3, random_state=42)
val_data, test_data = train_test_split(test_val_df, test_size=0.5, random_state=42)

# print(val_data.shape)

In [58]:
print(f"Train dataset shape: {train_data.shape}")
print(f"Test dataset shape: {test_data.shape}")

Train dataset shape: (17500, 13)
Test dataset shape: (3750, 13)


In [59]:
print(train_data.head(2))

      Rectal Pain  Sore Throat  Penile Oedema  Oral Lesions  Solitary Lesion  \
4913            0            0              1             1                1   
9338            0            1              0             0                0   

      Swollen Tonsils  HIV Infection  Sexually Transmitted Infection  \
4913                0              1                               0   
9338                1              1                               0   

      MonkeyPox  Systemic Illness_Fever  \
4913          1                       1   
9338          1                       0   

      Systemic Illness_Muscle Aches and Pain  Systemic Illness_None  \
4913                                       0                      0   
9338                                       0                      0   

      Systemic Illness_Swollen Lymph Nodes  
4913                                     0  
9338                                     1  


In [60]:
print(train_data.dtypes)

Rectal Pain                               int64
Sore Throat                               int64
Penile Oedema                             int64
Oral Lesions                              int64
Solitary Lesion                           int64
Swollen Tonsils                           int64
HIV Infection                             int64
Sexually Transmitted Infection            int64
MonkeyPox                                 int64
Systemic Illness_Fever                    int64
Systemic Illness_Muscle Aches and Pain    int64
Systemic Illness_None                     int64
Systemic Illness_Swollen Lymph Nodes      int64
dtype: object


In [61]:
# Remove the first record (because it is not a valid data example) and a trailing 'dot' in the class labels
# test_data = test_data[1:]
# test_data.income_bracket = test_data.income_bracket.apply(
#     lambda value: value.replace(".", "")
# )

In [62]:
train_data[train_data.columns.values] = train_data[train_data.columns.values].astype(str)
test_data[test_data.columns.values] = test_data[test_data.columns.values].astype(str)

In [63]:
train_data['Penile Oedema'] = train_data['Penile Oedema'].apply(
    lambda value: int(value)
)

test_data['Penile Oedema'] = test_data['Penile Oedema'].apply(
    lambda value: int(value)
)

In [64]:
train_data['Sexually Transmitted Infection'] = train_data['Sexually Transmitted Infection'].apply(
    lambda value: int(value)
)

test_data['Sexually Transmitted Infection'] = test_data['Sexually Transmitted Infection'].apply(
    lambda value: int(value)
)

In [65]:
train_data['MonkeyPox'] = train_data['MonkeyPox'].apply(
    lambda value: str(value)
)

test_data['MonkeyPox'] = test_data['MonkeyPox'].apply(
    lambda value: str(value)
)

In [66]:
print(train_data.dtypes)

Rectal Pain                               object
Sore Throat                               object
Penile Oedema                              int64
Oral Lesions                              object
Solitary Lesion                           object
Swollen Tonsils                           object
HIV Infection                             object
Sexually Transmitted Infection             int64
MonkeyPox                                 object
Systemic Illness_Fever                    object
Systemic Illness_Muscle Aches and Pain    object
Systemic Illness_None                     object
Systemic Illness_Swollen Lymph Nodes      object
dtype: object


In [67]:
train_data_file = "train_data.csv"
test_data_file = "test_data.csv"

train_data.to_csv(train_data_file, index=False, header=False)
test_data.to_csv(test_data_file, index=False, header=False)

In [68]:
# TARGET_FEATURE_NAME = "income_bracket";dataset = train_data
TARGET_FEATURE_NAME = "MonkeyPox"

CATEGORICAL_FEATURE_NAMES = []
NUMERIC_FEATURE_NAMES = []
for i in dataset.columns:
    if train_data[i].dtype == 'object':
        CATEGORICAL_FEATURE_NAMES.append(i)
    if train_data[i].dtype == 'int64' or train_data[i].dtype == 'float64':
        NUMERIC_FEATURE_NAMES.append(i)

# A list of all the input features.
ALL_FEATURE_NAMES = NUMERIC_FEATURE_NAMES + CATEGORICAL_FEATURE_NAMES
print("All Feature: ", ALL_FEATURE_NAMES)
print("Length:", len(ALL_FEATURE_NAMES))

CATEGORICAL_FEATURE_NAMES = list(set(CATEGORICAL_FEATURE_NAMES) - set([TARGET_FEATURE_NAME]))
NUMERIC_FEATURE_NAMES = list(set(NUMERIC_FEATURE_NAMES) - set([TARGET_FEATURE_NAME]))

print("Cat Features: ", CATEGORICAL_FEATURE_NAMES)
print("Length:", len(CATEGORICAL_FEATURE_NAMES))
print("Num Features: ", NUMERIC_FEATURE_NAMES)
print("Length:", len(NUMERIC_FEATURE_NAMES))

All Feature:  ['Penile Oedema', 'Sexually Transmitted Infection', 'Rectal Pain', 'Sore Throat', 'Oral Lesions', 'Solitary Lesion', 'Swollen Tonsils', 'HIV Infection', 'MonkeyPox', 'Systemic Illness_Fever', 'Systemic Illness_Muscle Aches and Pain', 'Systemic Illness_None', 'Systemic Illness_Swollen Lymph Nodes']
Length: 13
Cat Features:  ['Solitary Lesion', 'Swollen Tonsils', 'Rectal Pain', 'Systemic Illness_None', 'HIV Infection', 'Systemic Illness_Swollen Lymph Nodes', 'Oral Lesions', 'Systemic Illness_Fever', 'Sore Throat', 'Systemic Illness_Muscle Aches and Pain']
Length: 10
Num Features:  ['Penile Oedema', 'Sexually Transmitted Infection']
Length: 2


In [69]:
# Name of the column to be used as instances weight.
# WEIGHT_COLUMN_NAME = ""
# WEIGHT_COLUMN_NAME = "fnlwgt"
WEIGHT_COLUMN_NAME = "Penile Oedema"

COLUMN_DEFAULTS_String = NUMERIC_FEATURE_NAMES 

if WEIGHT_COLUMN_NAME != "":
    if WEIGHT_COLUMN_NAME in NUMERIC_FEATURE_NAMES:
        COLUMN_DEFAULTS_String = NUMERIC_FEATURE_NAMES + [WEIGHT_COLUMN_NAME]
        NUMERIC_FEATURE_NAMES = list(set(NUMERIC_FEATURE_NAMES) - set([WEIGHT_COLUMN_NAME]))
        print("Num Features (Updated): ", NUMERIC_FEATURE_NAMES)
        print("Length:", len(NUMERIC_FEATURE_NAMES))
    else:    
        CATEGORICAL_FEATURE_NAMES = list(set(CATEGORICAL_FEATURE_NAMES) - set([WEIGHT_COLUMN_NAME]))
        print("Cat Features (Updated): ", CATEGORICAL_FEATURE_NAMES)
        print("Length:", len(CATEGORICAL_FEATURE_NAMES))   

CATEGORICAL_FEATURES_WITH_VOCABULARY = {}
for val in CATEGORICAL_FEATURE_NAMES:
    CATEGORICAL_FEATURES_WITH_VOCABULARY[val] = sorted(list(train_data[val].unique()))
print("Cat Features with Vocab: " , CATEGORICAL_FEATURES_WITH_VOCABULARY)
print("Length:", len(CATEGORICAL_FEATURES_WITH_VOCABULARY))

Num Features (Updated):  ['Sexually Transmitted Infection']
Length: 1
Cat Features with Vocab:  {'Solitary Lesion': ['0', '1'], 'Swollen Tonsils': ['0', '1'], 'Rectal Pain': ['0', '1'], 'Systemic Illness_None': ['0', '1'], 'HIV Infection': ['0', '1'], 'Systemic Illness_Swollen Lymph Nodes': ['0', '1'], 'Oral Lesions': ['0', '1'], 'Systemic Illness_Fever': ['0', '1'], 'Sore Throat': ['0', '1'], 'Systemic Illness_Muscle Aches and Pain': ['0', '1']}
Length: 10


In [70]:
# A list of all the input features.
FEATURE_NAMES = NUMERIC_FEATURE_NAMES + CATEGORICAL_FEATURE_NAMES
print("Feature Names: ", FEATURE_NAMES)
print("Length:", len(FEATURE_NAMES))

# A list of column default values for each feature.
COLUMN_DEFAULTS = [
    [0.0] if feature_name in COLUMN_DEFAULTS_String else ["NA"]
    for feature_name in ALL_FEATURE_NAMES
]
print("Column Defaults: ", COLUMN_DEFAULTS)
print("Length:", len(COLUMN_DEFAULTS))

# A list of the labels of the target features.
TARGET_LABELS = list(dataset[TARGET_FEATURE_NAME].astype(str).unique())
print("Label Target: ", TARGET_LABELS)

TARGET_LABELS1 = list(train_data[TARGET_FEATURE_NAME].unique())
print("Label Target1: ", TARGET_LABELS1)

Feature Names:  ['Sexually Transmitted Infection', 'Solitary Lesion', 'Swollen Tonsils', 'Rectal Pain', 'Systemic Illness_None', 'HIV Infection', 'Systemic Illness_Swollen Lymph Nodes', 'Oral Lesions', 'Systemic Illness_Fever', 'Sore Throat', 'Systemic Illness_Muscle Aches and Pain']
Length: 11
Column Defaults:  [[0.0], [0.0], ['NA'], ['NA'], ['NA'], ['NA'], ['NA'], ['NA'], ['NA'], ['NA'], ['NA'], ['NA'], ['NA']]
Length: 13
Label Target:  ['0', '1']
Label Target1:  ['1', '0']


In [71]:
LEARNING_RATE = 0.001
WEIGHT_DECAY = 0.0001
DROPOUT_RATE = 0.2
BATCH_SIZE = 265
NUM_EPOCHS = 15

NUM_TRANSFORMER_BLOCKS = 3  # Number of transformer blocks.
NUM_HEADS = 4  # Number of attention heads.
EMBEDDING_DIMS = 16  # Embedding dimensions of the categorical features.
MLP_HIDDEN_UNITS_FACTORS = [
    2,
    1,
]  # MLP hidden layer units, as factors of the number of inputs.
NUM_MLP_BLOCKS = 2  # Number of MLP blocks in the baseline model.

In [72]:
target_label_lookup = layers.StringLookup(
    vocabulary=TARGET_LABELS, mask_token=None, num_oov_indices=0
)


def prepare_example(features, target):
    target_index = target_label_lookup(target)
    weights = features.pop(WEIGHT_COLUMN_NAME)
    return features, target_index, weights
    # return features, target_index


def get_dataset_from_csv(csv_file_path, batch_size=128, shuffle=False):
    dataset = tf.data.experimental.make_csv_dataset(
        csv_file_path,
        batch_size=batch_size,
        column_names=ALL_FEATURE_NAMES, # updated
        column_defaults=COLUMN_DEFAULTS,
        label_name=TARGET_FEATURE_NAME,
        num_epochs=1,
        header=False,
        na_value="?",
        shuffle=shuffle,
    ).map(prepare_example, num_parallel_calls=tf.data.AUTOTUNE, deterministic=False)
    return dataset.cache()

  return bool(asarray(a1 == a2).all())


In [73]:
def run_experiment(
    model,
    train_data_file,
    test_data_file,
    num_epochs,
    learning_rate,
    weight_decay,
    batch_size,
):

    optimizer = tfa.optimizers.AdamW(
        learning_rate=learning_rate, weight_decay=weight_decay
    )

    model.compile(
        optimizer=optimizer,
        loss=keras.losses.BinaryCrossentropy(),
        metrics=[keras.metrics.BinaryAccuracy(name="accuracy")],
    )

    train_dataset = get_dataset_from_csv(train_data_file, batch_size, shuffle=True)
    validation_dataset = get_dataset_from_csv(test_data_file, batch_size)

    print("Start training the model...")
    history = model.fit(
        train_dataset, epochs=num_epochs, validation_data=validation_dataset
    )
    print("Model training finished")

    _, accuracy = model.evaluate(validation_dataset, verbose=0)

    print(f"Validation accuracy: {round(accuracy * 100, 2)}%")

    return history

In [74]:
def create_model_inputs():
    inputs = {}
    for feature_name in FEATURE_NAMES:
        if feature_name in NUMERIC_FEATURE_NAMES:
            inputs[feature_name] = layers.Input(
                name=feature_name, shape=(), dtype=tf.float32
            )
        else:
            inputs[feature_name] = layers.Input(
                name=feature_name, shape=(), dtype=tf.string
            )
    return inputs

In [75]:
def encode_inputs(inputs, embedding_dims):

    encoded_categorical_feature_list = []
    numerical_feature_list = []

    for feature_name in inputs:
        if feature_name in CATEGORICAL_FEATURE_NAMES:

            # Get the vocabulary of the categorical feature.
            vocabulary = CATEGORICAL_FEATURES_WITH_VOCABULARY[feature_name]

            # Create a lookup to convert string values to an integer indices.
            # Since we are not using a mask token nor expecting any out of vocabulary
            # (oov) token, we set mask_token to None and  num_oov_indices to 0.
            lookup = layers.StringLookup(
                vocabulary=vocabulary,
                mask_token=None,
                num_oov_indices=0,
                output_mode="int",
            )

            # Convert the string input values into integer indices.
            encoded_feature = lookup(inputs[feature_name])

            # Create an embedding layer with the specified dimensions.
            embedding = layers.Embedding(
                input_dim=len(vocabulary), output_dim=embedding_dims
            )

            # Convert the index values to embedding representations.
            encoded_categorical_feature = embedding(encoded_feature)
            encoded_categorical_feature_list.append(encoded_categorical_feature)

        else:

            # Use the numerical features as-is.
            numerical_feature = tf.expand_dims(inputs[feature_name], -1)
            numerical_feature_list.append(numerical_feature)

    return encoded_categorical_feature_list, numerical_feature_list

In [76]:
def create_mlp(hidden_units, dropout_rate, activation, normalization_layer, name=None):

    mlp_layers = []
    for units in hidden_units:
        mlp_layers.append(normalization_layer),
        mlp_layers.append(layers.Dense(units, activation=activation))
        mlp_layers.append(layers.Dropout(dropout_rate))

    return keras.Sequential(mlp_layers, name=name)

In [77]:
def create_baseline_model(
    embedding_dims, num_mlp_blocks, mlp_hidden_units_factors, dropout_rate
):

    # Create model inputs.
    inputs = create_model_inputs()
    # encode features.
    encoded_categorical_feature_list, numerical_feature_list = encode_inputs(
        inputs, embedding_dims
    )
    # Concatenate all features.
    features = layers.concatenate(
        encoded_categorical_feature_list + numerical_feature_list
    )
    # Compute Feedforward layer units.
    feedforward_units = [features.shape[-1]]

    # Create several feedforwad layers with skip connections.
    for layer_idx in range(num_mlp_blocks):
        features = create_mlp(
            hidden_units=feedforward_units,
            dropout_rate=dropout_rate,
            activation=keras.activations.gelu,
            normalization_layer=layers.LayerNormalization(epsilon=1e-6),
            name=f"feedforward_{layer_idx}",
        )(features)

    # Compute MLP hidden_units.
    mlp_hidden_units = [
        factor * features.shape[-1] for factor in mlp_hidden_units_factors
    ]
    # Create final MLP.
    features = create_mlp(
        hidden_units=mlp_hidden_units,
        dropout_rate=dropout_rate,
        activation=keras.activations.selu,
        normalization_layer=layers.BatchNormalization(),
        name="MLP",
    )(features)

    # Add a sigmoid as a binary classifer.
    outputs = layers.Dense(units=1, activation="sigmoid", name="sigmoid")(features)
    model = keras.Model(inputs=inputs, outputs=outputs)
    return model


baseline_model = create_baseline_model(
    embedding_dims=EMBEDDING_DIMS,
    num_mlp_blocks=NUM_MLP_BLOCKS,
    mlp_hidden_units_factors=MLP_HIDDEN_UNITS_FACTORS,
    dropout_rate=DROPOUT_RATE,
)

print("Total model weights:", baseline_model.count_params())
keras.utils.plot_model(baseline_model, show_shapes=True, rankdir="LR")


Total model weights: 158101
You must install pydot (`pip install pydot`) and install graphviz (see instructions at https://graphviz.gitlab.io/download/) for plot_model to work.


In [78]:
history = run_experiment(
    model=baseline_model,
    train_data_file=train_data_file,
    test_data_file=test_data_file,
    num_epochs=NUM_EPOCHS,
    learning_rate=LEARNING_RATE,
    weight_decay=WEIGHT_DECAY,
    batch_size=BATCH_SIZE,
)

Start training the model...
Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15
Model training finished
Validation accuracy: 67.09%


In [45]:
# tensor = tf.convert_to_tensor([1,2])
# encoded_categorical_features = tf.stack([tensor], axis=1)
# encoded_categorical_features

In [48]:
type = tf.stack([[]], axis=1).shape[1]

1

In [83]:
def create_tabtransformer_classifier(
    num_transformer_blocks,
    num_heads,
    embedding_dims,
    mlp_hidden_units_factors,
    dropout_rate,
    use_column_embedding=False,
):

    # Create model inputs.
    inputs = create_model_inputs()
    # encode features.
    encoded_categorical_feature_list, numerical_feature_list = encode_inputs(
        inputs, embedding_dims
    )
    # Stack categorical feature embeddings for the Tansformer.
    if encoded_categorical_feature_list != []:
        encoded_categorical_features = tf.stack(encoded_categorical_feature_list, axis=1)
    else:
        encoded_categorical_features = [[]]
    print(encoded_categorical_features)

    # Concatenate numerical features.
    numerical_features = layers.concatenate(numerical_feature_list)

    # Add column embedding to categorical feature embeddings.
    if use_column_embedding:
        num_columns = encoded_categorical_features.shape[1]
        column_embedding = layers.Embedding(
            input_dim=num_columns, output_dim=embedding_dims
        )
        column_indices = tf.range(start=0, limit=num_columns, delta=1)
        encoded_categorical_features = encoded_categorical_features + column_embedding(
            column_indices
        )

    # Create multiple layers of the Transformer block.
    for block_idx in range(num_transformer_blocks):
        # Create a multi-head attention layer.
        attention_output = layers.MultiHeadAttention(
            num_heads=num_heads,
            key_dim=embedding_dims,
            dropout=dropout_rate,
            name=f"multihead_attention_{block_idx}",
        )(encoded_categorical_features, encoded_categorical_features) # (None, 10, 16)
        print(attention_output)

        # Skip connection 1.
        x = layers.Add(name=f"skip_connection1_{block_idx}")(
            [attention_output, encoded_categorical_features]
        )

        # Layer normalization 1.
        x = layers.LayerNormalization(name=f"layer_norm1_{block_idx}", epsilon=1e-6)(x)
        
        # Feedforward.
        feedforward_output = create_mlp(
            hidden_units=[embedding_dims],
            dropout_rate=dropout_rate,
            activation=keras.activations.gelu,
            normalization_layer=layers.LayerNormalization(epsilon=1e-6),
            name=f"feedforward_{block_idx}",
        )(x)
        
        # Skip connection 2.
        x = layers.Add(name=f"skip_connection2_{block_idx}")([feedforward_output, x])
        
        # Layer normalization 2.
        encoded_categorical_features = layers.LayerNormalization(
            name=f"layer_norm2_{block_idx}", epsilon=1e-6
        )(x)

    # Flatten the "contextualized" embeddings of the categorical features.
    categorical_features = layers.Flatten()(encoded_categorical_features)
    # Apply layer normalization to the numerical features.
    numerical_features = layers.LayerNormalization(epsilon=1e-6)(numerical_features)
    # Prepare the input for the final MLP block.
    features = layers.concatenate([categorical_features, numerical_features])

    # Compute MLP hidden_units.
    mlp_hidden_units = [
        factor * features.shape[-1] for factor in mlp_hidden_units_factors
    ]
    # Create final MLP.
    features = create_mlp(
        hidden_units=mlp_hidden_units,
        dropout_rate=dropout_rate,
        activation=keras.activations.selu,
        normalization_layer=layers.BatchNormalization(),
        name="MLP",
    )(features)

    # Add a sigmoid as a binary classifer.
    outputs = layers.Dense(units=1, activation="sigmoid", name="sigmoid")(features)
    model = keras.Model(inputs=inputs, outputs=outputs)
    return model


In [84]:
tabtransformer_model = create_tabtransformer_classifier(
    num_transformer_blocks=NUM_TRANSFORMER_BLOCKS,
    num_heads=NUM_HEADS,
    embedding_dims=EMBEDDING_DIMS,
    mlp_hidden_units_factors=MLP_HIDDEN_UNITS_FACTORS,
    dropout_rate=DROPOUT_RATE,
)

print("Total model weights:", tabtransformer_model.count_params())
keras.utils.plot_model(tabtransformer_model, show_shapes=True, rankdir="LR")

KerasTensor(type_spec=TensorSpec(shape=(None, 10, 16), dtype=tf.float32, name=None), name='tf.stack_2/stack:0', description="created by layer 'tf.stack_2'")
KerasTensor(type_spec=TensorSpec(shape=(None, 10, 16), dtype=tf.float32, name=None), name='multihead_attention_0/attention_output/add:0', description="created by layer 'multihead_attention_0'")
KerasTensor(type_spec=TensorSpec(shape=(None, 10, 16), dtype=tf.float32, name=None), name='multihead_attention_1/attention_output/add:0', description="created by layer 'multihead_attention_1'")
KerasTensor(type_spec=TensorSpec(shape=(None, 10, 16), dtype=tf.float32, name=None), name='multihead_attention_2/attention_output/add:0', description="created by layer 'multihead_attention_2'")
Total model weights: 119311
You must install pydot (`pip install pydot`) and install graphviz (see instructions at https://graphviz.gitlab.io/download/) for plot_model to work.


In [85]:
history = run_experiment(
    model=tabtransformer_model,
    train_data_file=train_data_file,
    test_data_file=test_data_file,
    num_epochs=NUM_EPOCHS,
    learning_rate=LEARNING_RATE,
    weight_decay=WEIGHT_DECAY,
    batch_size=BATCH_SIZE,
)

Start training the model...
Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15
Model training finished
Validation accuracy: 67.89%


In [None]:
import tensorflow_addons as tfa # pip install tensorflow-addons
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from tabtransformertf.models.tabtransformer import TabTransformer # pip install tabtransformertf tab-transformer-pytorch
from tabtransformertf.utils.preprocessing import df_to_dataset, build_categorical_prep

In [58]:
category_prep_layers = build_categorical_prep(train_data, CATEGORICAL_FEATURE_NAMES)
category_prep_layers

0it [00:00, ?it/s]


{}

In [59]:
tabtransformer = TabTransformer(
    numerical_features = NUMERIC_FEATURE_NAMES,  # List with names of numeric features
    categorical_features = CATEGORICAL_FEATURE_NAMES, # List with names of categorical feature
    categorical_lookup = category_prep_layers,   # Dict with StringLookup layers 
    numerical_discretisers = None,  # None, we are simply passing the numeric features
    embedding_dim = 32,  # Dimensionality of embeddings
    out_dim = 1,  # Dimensionality of output (binary task)
    out_activation = 'sigmoid',  # Activation of output layer
    depth = 4,  # Number of Transformer Block layers
    heads = 8,  # Number of attention heads in the Transformer Blocks
    attn_dropout = 0.1,  # Dropout rate in Transformer Blocks
    ff_dropout = 0.1,  # Dropout rate in the final MLP
    mlp_hidden_factors = [2, 4],  # Factors by which we divide final embeddings for each layer
    use_column_embedding = False,  # If we want to use column embeddings
)

In [60]:
optimizer = tfa.optimizers.AdamW(
        learning_rate=LEARNING_RATE, weight_decay=WEIGHT_DECAY
    )

tabtransformer.compile(
    optimizer = optimizer,
    loss = tf.keras.losses.BinaryCrossentropy(),
    metrics= [tf.keras.metrics.AUC(name="PR AUC", curve='PR')],
)

out_file = './tabTransformerBasic'
checkpoint = ModelCheckpoint(
    out_file, monitor="val_loss", verbose=1, save_best_only=True, mode="min"
)
early = EarlyStopping(monitor="val_loss", mode="min", patience=10, restore_best_weights=True)
callback_list = [checkpoint, early]

history = tabtransformer.fit(
    train_data_file, 
    epochs=NUM_EPOCHS, 
    validation_data=test_data_file,
    callbacks=callback_list
)

IndexError: tuple index out of range

In [None]:
val_preds = tabtransformer.predict(val_dataset)

print(f"PR AUC: {average_precision_score(val_data['isFraud'], val_preds.ravel())}")
print(f"ROC AUC: {roc_auc_score(val_data['isFraud'], val_preds.ravel())}")

In [None]:
test_preds = tabtransformer.predict(test_dataset)

submission = pd.DataFrame({
    "id": test_data.index,
    "failure": test_preds.ravel()
})

submission.head()

In [None]:
train_preds = tabtransformer.predict(train_dataset)

sns.distplot(train_preds, label='train')
sns.distplot(val_preds, label='val')
sns.distplot(submission['failure'], label='test')
plt.legend()