In [1]:
import tensorflow as tf
# from tensorflow.python.framework.ops import disable_eager_execution 
# disable_eager_execution()
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

In [2]:
# Import common tensorflow layers and activations
from tensorflow.keras.layers import Dense, Dropout, BatchNormalization, Layer
from tensorflow.keras.layers import Lambda, Multiply, Add, Rescaling 
from tensorflow.keras.activations import relu, sigmoid, softmax
from tensorflow.keras import Model
from tensorflow.keras import Sequential
from tensorflow.keras import Input

In [3]:
# Sparsemax activation function implemented with tensorflow ops
# https://arxiv.org/pdf/1602.02068.pdf
# Copied from https://github.com/tensorflow/addons/blob/b2dafcfa74c5de268b8a5c53813bc0b89cadf386/tensorflow_addons/activations/sparsemax.py#L96

def _compute_2d_sparsemax(logits, axis=-1):
    """Performs the sparsemax operation when axis=-1."""
    if axis != -1:
        raise ValueError("Only axis=-1 is supported.")

    shape_op = tf.shape(logits)
    obs = tf.math.reduce_prod(shape_op[:-1])
    dims = shape_op[-1]

    # In the paper, they call the logits z.
    # The mean(logits) can be substracted from logits to make the algorithm
    # more numerically stable. the instability in this algorithm comes mostly
    # from the z_cumsum. Substacting the mean will cause z_cumsum to be close
    # to zero. However, in practise the numerical instability issues are very
    # minor and substacting the mean causes extra issues with inf and nan
    # input.
    # Reshape to [obs, dims] as it is almost free and means the remanining
    # code doesn't need to worry about the rank.
    z = tf.reshape(logits, [obs, dims])

    # sort z
    z_sorted, _ = tf.nn.top_k(z, k=dims)

    # calculate k(z)
    z_cumsum = tf.math.cumsum(z_sorted, axis=-1)
    k = tf.range(1, tf.cast(dims, logits.dtype) + 1, dtype=logits.dtype)
    z_check = 1 + k * z_sorted > z_cumsum
    # because the z_check vector is always [1,1,...1,0,0,...0] finding the
    # (index + 1) of the last `1` is the same as just summing the number of 1.
    k_z = tf.math.reduce_sum(tf.cast(z_check, tf.int32), axis=-1)

    # calculate tau(z)
    # If there are inf values or all values are -inf, the k_z will be zero,
    # this is mathematically invalid and will also cause the gather_nd to fail.
    # Prevent this issue for now by setting k_z = 1 if k_z = 0, this is then
    # fixed later (see p_safe) by returning p = nan. This results in the same
    # behavior as softmax.
    k_z_safe = tf.math.maximum(k_z, 1)
    indices = tf.stack([tf.range(0, obs), tf.reshape(k_z_safe, [-1]) - 1], axis=1)
    tau_sum = tf.gather_nd(z_cumsum, indices)
    tau_z = (tau_sum - 1) / tf.cast(k_z, logits.dtype)

    # calculate p
    p = tf.math.maximum(tf.cast(0, logits.dtype), z - tf.expand_dims(tau_z, -1))
    # If k_z = 0 or if z = nan, then the input is invalid
    p_safe = tf.where(
        tf.expand_dims(
            tf.math.logical_or(tf.math.equal(k_z, 0), tf.math.is_nan(z_cumsum[:, -1])),
            axis=-1,
        ),
        tf.fill([obs, dims], tf.cast(float("nan"), logits.dtype)),
        p,
    )

    # Reshape back to original size
    p_safe = tf.reshape(p_safe, shape_op)
    return p_safe

def sparsemax(logits, axis=-1):
    
    return Lambda(lambda x: _compute_2d_sparsemax(x))(logits)

In [4]:
x = tf.random.stateless_binomial(shape=(5, 10), seed=(1, 2), counts=5, probs=0.5)
x = tf.cast(x, "float32")

x, sparsemax(x)

2023-04-26 23:57:35.074139: E tensorflow/stream_executor/cuda/cuda_driver.cc:271] failed call to cuInit: CUDA_ERROR_NO_DEVICE: no CUDA-capable device is detected
2023-04-26 23:57:35.074169: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:156] kernel driver does not appear to be running on this host (al3615): /proc/driver/nvidia/version does not exist
2023-04-26 23:57:35.075544: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


(<tf.Tensor: shape=(5, 10), dtype=float32, numpy=
 array([[3., 3., 3., 3., 4., 2., 3., 0., 3., 2.],
        [3., 3., 2., 3., 2., 2., 3., 2., 3., 1.],
        [3., 1., 2., 3., 3., 2., 3., 3., 3., 4.],
        [3., 3., 3., 4., 0., 3., 3., 2., 3., 2.],
        [5., 1., 3., 2., 1., 3., 3., 3., 2., 1.]], dtype=float32)>,
 <tf.Tensor: shape=(5, 10), dtype=float32, numpy=
 array([[0.        , 0.        , 0.        , 0.        , 1.        ,
         0.        , 0.        , 0.        , 0.        , 0.        ],
        [0.20000005, 0.20000005, 0.        , 0.20000005, 0.        ,
         0.        , 0.20000005, 0.        , 0.20000005, 0.        ],
        [0.        , 0.        , 0.        , 0.        , 0.        ,
         0.        , 0.        , 0.        , 0.        , 1.        ],
        [0.        , 0.        , 0.        , 1.        , 0.        ,
         0.        , 0.        , 0.        , 0.        , 0.        ],
        [1.        , 0.        , 0.        , 0.        , 0.        ,
       

In [5]:
class GLULayer(tf.keras.layers.Layer):
    def __init__(self, units):
        super(GLULayer, self).__init__()
        self.forward = None
        self.units = units*2
        self.dense = Dense(self.units)

    def call(self, inputs):
        x = self.dense(inputs)
        x1, x2 = tf.split(x, 2, axis=-1)
        return x1 * sigmoid(x2)

In [6]:
# Test Glu Layer on normal data
glu = GLULayer(10)
x = tf.random.normal((1, 7))
y = glu(x)
y

<tf.Tensor: shape=(1, 10), dtype=float32, numpy=
array([[-0.96361774,  0.6793994 ,  0.1317401 ,  0.04591553, -0.35321206,
        -0.21881306,  0.15043025,  0.13077098, -0.03897366,  0.29308307]],
      dtype=float32)>

In [7]:
class SharedFeatureLayer(Layer):
    def __init__(self, units, depth=2, dense_activation="relu", *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.depth = depth
        self.dense_layers = [Dense(units, activation=dense_activation) for _ in range(depth)]
        self.bn_layers = [BatchNormalization() for _ in range(depth)]
        self.glu_layers = [GLULayer(units) for _ in range(depth)]
        self.scaling = Rescaling(0.5**0.5)
    
    def call(self, inputs, *args, **kwargs):
        x = inputs
        for i in range(self.depth):
            y = self.dense_layers[i](x)
            y = self.bn_layers[i](y)
            y = self.glu_layers[i](y)
            # Skip first residual connection as input is not guaranteed to be same shape as num units
            if i > 0:
                x = self.scaling(x + y)
            else:
                x = y
        return x

In [8]:
# Test Shared Feature Layer on normal data
sfl = SharedFeatureLayer(units=10)
x = tf.random.normal((1, 7))
y = sfl(x)
y

<tf.Tensor: shape=(1, 10), dtype=float32, numpy=
array([[ 0.02939319,  0.44278663, -0.6480196 ,  0.09077926,  0.07966833,
         0.15026082, -0.13508579,  0.09931763, -0.06332603, -0.20511775]],
      dtype=float32)>

In [17]:
class FeatureTransformer(Layer):
    def __init__(self, units, shared_layer, dense_activation="relu", depth=2, *args, **kwargs):
        super().__init__( *args, **kwargs )
        self.shared_layer = shared_layer
        self.depth = depth
        self.dense_layers = [Dense(units, activation=dense_activation) for _ in range(depth)]
        self.bn_layers = [BatchNormalization() for _ in range(depth)]
        self.glu_layers = [GLULayer(units) for _ in range(depth)]
        self.scaling = Rescaling(0.5**0.5)

    def call(self, data):
        x = self.shared_layer(data)
        for i in range(self.depth):
            y = self.dense_layers[i](x)
            y = self.bn_layers[i](y)
            y = self.glu_layers[i](y)
            x = self.scaling(x + y)

        return x # Feature Transformer

In [18]:
# test Feature Transformer on normal data
sfl = SharedFeatureLayer(units=10)
ft = FeatureTransformer(units=10, shared_layer=sfl)
x = tf.random.normal((1, 7))
y = ft(x)
y

<tf.Tensor: shape=(1, 10), dtype=float32, numpy=
array([[-0.11730861,  0.12405972, -0.01578275,  0.04211384,  0.0752199 ,
        -0.0251449 , -0.0819028 ,  0.04296739,  0.01050517, -0.02078931]],
      dtype=float32)>

In [19]:
class AttentiveTransformer(Layer):
    def __init__(self, units, gamma=1.3, dense_activation="relu", *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.units = units
        self.gamma = tf.constant(gamma, dtype=tf.float32)
        self.activation = dense_activation
        self.built = False
        self.dense = Dense(self.units, activation=dense_activation)
        self.bn = BatchNormalization()
    
    def call(self, data):
        x = self.dense(data)
        x = self.bn(x)
        return x # Attentive
        

In [40]:
class TabNet(Model):
    def __init__(self, dim_features, dim_attention, dim_output, sparsity=0.0001, num_steps=5, gamma=1.5, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.num_step = num_steps
        self.dim_features = dim_features
        self.dim_attention = dim_attention # Currently keep it simple and just have equal attention and pre_output dimensions
        self.dim_pre_output = dim_attention
        self.dim_output = dim_output
        self.gamma = tf.constant(gamma)
        self.eps = tf.constant(1e-5)
        self.sparsity_coef = sparsity
        self.priors = []
        self.shared_layer = SharedFeatureLayer(units=self.dim_attention+self.dim_pre_output, depth=2, name="shared_feature_layer")
        self.feature_transformers = [FeatureTransformer(units=self.dim_attention+self.dim_pre_output, shared_layer=self.shared_layer, name=f"feat_{i}") for i in range(num_steps + 1)]
        self.attention_layers = [AttentiveTransformer(units=self.dim_features, name=f"attn_{i+1}") for i in range(num_steps)]
        self.norm_in = BatchNormalization(name="norm_in")
        self.output_dense = Dense(dim_output, name="output")
        self.attn_activation = _compute_2d_sparsemax
    
    def call(self, data):
        normed_data = self.norm_in(data)

        d0, a_i = tf.split(self.feature_transformers[0](normed_data), 2, axis=-1)
        decision = tf.zeros_like(d0)

        for i in range(self.num_step):
            candidate_mask = self.attention_layers[i](a_i)
            # print("before_priors", candidate_mask)
            for prior in self.priors:
                candidate_mask = candidate_mask*(self.gamma - prior)
            # print("after_priors", candidate_mask)
            candidate_mask = self.attn_activation(candidate_mask)

            if self.sparsity_coef > 0:
                # print(candidate_mask)
                decision_entropy = -tf.reduce_mean(tf.reduce_sum(candidate_mask, axis=-1))/self.num_step
                self.add_loss(lambda : self.sparsity_coef * decision_entropy)

            self.priors.append(candidate_mask)
            
            
            normed_features = normed_data * candidate_mask
            x = self.feature_transformers[i+1](normed_features)
            d_i, a_i = tf.split(x, 2, axis=-1)
            decision += relu(d_i)
        
        return self.output_dense(decision)


In [41]:
# Define data

BATCH_SIZE = 16384

# Load telematics data
telematics = pd.read_csv("./telematics_syn-032021.csv")
response = 'AMT_Claim'
outputs = ['NB_Claim']
# outputs = telematics.pop(["NB_Claim","AMT_Claim"])
# response = outputs['AMT_Claim']

# Split data into train and test
data_train, data_test = train_test_split(telematics, test_size=0.3, random_state=42)
data_val, data_test = train_test_split(data_test, test_size=0.33, random_state=42)

# Mean target encode categorical variables
aggregate_fn = np.mean
for col in ['Marital', 'Insured.sex', 'Car.use', 'Region', 'Territory']:
    encoding = data_train.groupby(col)[response].aggregate(aggregate_fn)
    data_train[col] = data_train[col].map(encoding)
    data_val[col] = data_val[col].map(encoding)
    data_test[col] = data_test[col].map(encoding)

# Split into X and y
X_train = data_train.drop(outputs, axis=1)
y_train = data_train[response]
X_val = data_val.drop(outputs, axis=1)
y_val = data_val[response]
X_test = data_test.drop(outputs, axis=1)
y_test = data_test[response]

# Make tensorflow datasets
train_dataset = tf.data.Dataset.from_tensor_slices((X_train.values, y_train.values))
train_dataset = train_dataset.shuffle(buffer_size=4048).batch(BATCH_SIZE)
val_dataset = tf.data.Dataset.from_tensor_slices((X_val.values, y_val.values))
val_dataset = val_dataset.batch(BATCH_SIZE)
test_dataset = tf.data.Dataset.from_tensor_slices((X_test.values, y_test.values))
test_dataset = test_dataset.batch(BATCH_SIZE)

In [42]:
tabnet = TabNet(dim_features=X_train.shape[1], dim_attention=64, dim_output=1)



tabnet.compile( 
    optimizer=tf.keras.optimizers.Adam(learning_rate=0.01), 
    loss=tf.keras.losses.MeanSquaredError(),
    run_eagerly=False,
    metrics=[tf.keras.metrics.RootMeanSquaredError()]
)
dummy = X_train.sample(1).values
tabnet.build(dummy.shape)


In [43]:
tabnet.summary()

Model: "tab_net_7"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
shared_feature_layer (Shared multiple                  90240     
_________________________________________________________________
feat_0 (FeatureTransformer)  multiple                  190336    
_________________________________________________________________
feat_1 (FeatureTransformer)  multiple                  190336    
_________________________________________________________________
feat_2 (FeatureTransformer)  multiple                  190336    
_________________________________________________________________
feat_3 (FeatureTransformer)  multiple                  190336    
_________________________________________________________________
feat_4 (FeatureTransformer)  multiple                  190336    
_________________________________________________________________
feat_5 (FeatureTransformer)  multiple                  19

In [44]:
early_stopping = tf.keras.callbacks.EarlyStopping(patience=6, restore_best_weights=True)
lr_on_plateau = tf.keras.callbacks.ReduceLROnPlateau(patience=3, factor=0.5, min_lr=1e-5, verbose=1)

history = tabnet.fit(train_dataset, epochs=100, validation_data=val_dataset, callbacks=[early_stopping, lr_on_plateau])

Epoch 1/100


TypeError: An op outside of the function building code is being passed
a "Graph" tensor. It is possible to have Graph tensors
leak out of the function building context by including a
tf.init_scope in your function building code.
For example, the following function will fail:
  @tf.function
  def has_init_scope():
    my_constant = tf.constant(1.)
    with tf.init_scope():
      added = my_constant * 2
The graph tensor has name: Reshape_2:0

In [None]:
# Plot history loss and RMSE for training and validation set; train solid line, validation dashed line
fig, (top_ax, bottom_ax) = plt.subplots(2, 1, figsize=(10, 10), sharex=True)

top_ax.plot(history.history['loss'], label='train_loss', c='b')
top_ax.plot(history.history['val_loss'], label='val_loss', linestyle='--', c='b')
second_ax = top_ax.twinx()
second_ax.plot(history.history['root_mean_squared_error'], label='train_rmse', c='orange')
second_ax.plot(history.history['val_root_mean_squared_error'], label='val_rmse', linestyle='--', c='orange')
top_ax.set_ylabel('loss')
second_ax.set_ylabel('RMSE')
top_ax.legend()
second_ax.legend()


bottom_ax.plot(history.history['lr'], label='lr', c='g')
bottom_ax.set_xlabel('Epoch')
bottom_ax.set_ylabel('Learning rate')




In [None]:
# Evaluate model on test set
tabnet.evaluate(test_dataset)