# Setup

In [1]:
import os

os.environ["KERAS_BACKEND"] = "tensorflow"

import tensorflow as tf
import pandas as pd
import keras
from keras.utils import FeatureSpace

In [2]:
!nvidia-smi

Tue Sep 10 01:57:04 2024       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 551.86                 Driver Version: 551.86         CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                     TCC/WDDM  | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  NVIDIA GeForce RTX 2080 ...  WDDM  |   00000000:09:00.0  On |                  N/A |
| 29%   46C    P0             57W /  250W |    2942MiB /   8192MiB |      2%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

# Preparing Data

In [3]:
file_url = "data/bank_customer_churn_dataset.csv"
df = pd.read_csv(file_url)

In [4]:
print(df.shape)

(10002, 14)


In [5]:
df.head()

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,1,15634602,Hargrave,619,France,Female,42.0,2,0.0,1,1.0,1.0,101348.88,1
1,2,15647311,Hill,608,Spain,Female,41.0,1,83807.86,1,0.0,1.0,112542.58,0
2,3,15619304,Onio,502,France,Female,42.0,8,159660.8,3,1.0,0.0,113931.57,1
3,4,15701354,Boni,699,France,Female,39.0,1,0.0,2,0.0,0.0,93826.63,0
4,5,15737888,Mitchell,850,Spain,Female,43.0,2,125510.82,1,,1.0,79084.1,0


In [6]:
df = df.dropna()
df = df.drop(['RowNumber', 'CustomerId', 'Surname'], axis=1)

In [7]:
df.head()

Unnamed: 0,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,619,France,Female,42.0,2,0.0,1,1.0,1.0,101348.88,1
1,608,Spain,Female,41.0,1,83807.86,1,0.0,1.0,112542.58,0
2,502,France,Female,42.0,8,159660.8,3,1.0,0.0,113931.57,1
3,699,France,Female,39.0,1,0.0,2,0.0,0.0,93826.63,0
5,645,Spain,Male,44.0,8,113755.78,2,1.0,0.0,149756.71,1


In [8]:
print(df.shape)

(9998, 11)


In [9]:
df.dtypes

CreditScore          int64
Geography           object
Gender              object
Age                float64
Tenure               int64
Balance            float64
NumOfProducts        int64
HasCrCard          float64
IsActiveMember     float64
EstimatedSalary    float64
Exited               int64
dtype: object

In [10]:
df = df.astype(dtype= {"HasCrCard":"int64",
                       "IsActiveMember":"int64"})

In [11]:
df.dtypes

CreditScore          int64
Geography           object
Gender              object
Age                float64
Tenure               int64
Balance            float64
NumOfProducts        int64
HasCrCard            int64
IsActiveMember       int64
EstimatedSalary    float64
Exited               int64
dtype: object

In [12]:
df.describe()

Unnamed: 0,CreditScore,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
count,9998.0,9998.0,9998.0,9998.0,9998.0,9998.0,9998.0,9998.0,9998.0
mean,650.529606,38.920287,5.013003,76481.490819,1.530206,0.705541,0.514803,100099.786455,0.203841
std,96.633003,10.487986,2.892152,62393.187035,0.581669,0.455822,0.499806,57510.939962,0.402872
min,350.0,18.0,0.0,0.0,1.0,0.0,0.0,11.58,0.0
25%,584.0,32.0,3.0,0.0,1.0,0.0,0.0,50983.75,0.0
50%,652.0,37.0,5.0,97173.29,1.0,1.0,1.0,100218.21,0.0
75%,718.0,44.0,7.0,127641.4175,2.0,1.0,1.0,149395.8825,0.0
max,850.0,92.0,10.0,250898.09,4.0,1.0,1.0,199992.48,1.0


In [13]:
df['Geography'].value_counts()

Geography
France     5012
Germany    2510
Spain      2476
Name: count, dtype: int64

In [14]:
df['Gender'].value_counts()

Gender
Male      5455
Female    4543
Name: count, dtype: int64

# Training/Validation Data Split
80-20 split

In [15]:
val_df = df.sample(frac=0.2, random_state=1337)
train_df = df.drop(val_df.index)

print(
    "Using %d samples for training and %d for validation"
    % (len(train_df), len(val_df))
)

Using 7998 samples for training and 2000 for validation


In [16]:
label_col_name = "Exited"

def dataframe_to_dataset(df):
    df = df.copy()
    labels = df.pop(label_col_name)
    ds = tf.data.Dataset.from_tensor_slices((dict(df), labels))
    ds = ds.shuffle(buffer_size=len(df))
    return ds

train_ds = dataframe_to_dataset(train_df)
val_ds = dataframe_to_dataset(val_df)

In [17]:
for x, y in train_ds.take(1):
    print("Input:", x)
    print("Target:", y)

Input: {'CreditScore': <tf.Tensor: shape=(), dtype=int64, numpy=738>, 'Geography': <tf.Tensor: shape=(), dtype=string, numpy=b'Germany'>, 'Gender': <tf.Tensor: shape=(), dtype=string, numpy=b'Female'>, 'Age': <tf.Tensor: shape=(), dtype=float64, numpy=29.0>, 'Tenure': <tf.Tensor: shape=(), dtype=int64, numpy=9>, 'Balance': <tf.Tensor: shape=(), dtype=float64, numpy=139106.19>, 'NumOfProducts': <tf.Tensor: shape=(), dtype=int64, numpy=1>, 'HasCrCard': <tf.Tensor: shape=(), dtype=int64, numpy=1>, 'IsActiveMember': <tf.Tensor: shape=(), dtype=int64, numpy=0>, 'EstimatedSalary': <tf.Tensor: shape=(), dtype=float64, numpy=141872.05>}
Target: tf.Tensor(1, shape=(), dtype=int64)


In [18]:
batch_size = 32

train_ds = train_ds.batch(batch_size)
val_ds = val_ds.batch(batch_size)

# FeatureSpace Configuration

In [19]:
feature_space = FeatureSpace(
    features={
        # Categorical features encoded as integers
        "Tenure": FeatureSpace.integer_categorical(num_oov_indices=0),
        "NumOfProducts": FeatureSpace.integer_categorical(num_oov_indices=0),
        "HasCrCard": FeatureSpace.integer_categorical(num_oov_indices=0),
        "IsActiveMember": FeatureSpace.integer_categorical(num_oov_indices=0),
        
        # Categorical feature encoded as string
        "Geography": FeatureSpace.string_categorical(num_oov_indices=0),
        "Gender": FeatureSpace.string_categorical(num_oov_indices=0),

        # Numerical features to discretize
        "Age": FeatureSpace.float_discretized(num_bins=30),
        "CreditScore": FeatureSpace.float_discretized(num_bins=50), # Sredit score ranges from 300 to 850

        # Numerical features to normalize
        "Balance": FeatureSpace.float_normalized(),
        "EstimatedSalary": FeatureSpace.float_normalized(),
    },
    output_mode="concat",
)

In [20]:
for x in train_ds.take(1):
    print(x)

({'CreditScore': <tf.Tensor: shape=(32,), dtype=int64, numpy=
array([430, 608, 704, 644, 707, 619, 581, 578, 647, 796, 714, 650, 615,
       567, 551, 598, 614, 749, 713, 546, 755, 752, 634, 720, 698, 681,
       757, 529, 749, 563, 562, 751], dtype=int64)>, 'Geography': <tf.Tensor: shape=(32,), dtype=string, numpy=
array([b'Germany', b'Spain', b'France', b'Spain', b'Spain', b'France',
       b'France', b'France', b'France', b'Germany', b'Spain', b'Germany',
       b'France', b'Germany', b'France', b'Spain', b'France', b'France',
       b'France', b'France', b'Germany', b'Spain', b'France', b'Spain',
       b'France', b'France', b'Germany', b'Spain', b'Spain', b'France',
       b'France', b'Germany'], dtype=object)>, 'Gender': <tf.Tensor: shape=(32,), dtype=string, numpy=
array([b'Female', b'Female', b'Male', b'Male', b'Female', b'Female',
       b'Female', b'Male', b'Female', b'Female', b'Male', b'Female',
       b'Male', b'Male', b'Female', b'Male', b'Male', b'Male', b'Female',
     

In [21]:
for x in train_ds.map(lambda x, _: x).take(1):
    print(x)

{'CreditScore': <tf.Tensor: shape=(32,), dtype=int64, numpy=
array([686, 667, 449, 609, 838, 726, 542, 678, 667, 587, 705, 554, 728,
       445, 609, 684, 584, 512, 689, 589, 656, 565, 575, 592, 542, 634,
       828, 554, 516, 850, 723, 556], dtype=int64)>, 'Geography': <tf.Tensor: shape=(32,), dtype=string, numpy=
array([b'Germany', b'Spain', b'France', b'Spain', b'Spain', b'Germany',
       b'France', b'Germany', b'France', b'France', b'France', b'France',
       b'Spain', b'France', b'Spain', b'France', b'France', b'France',
       b'France', b'France', b'Spain', b'France', b'France', b'France',
       b'France', b'Germany', b'France', b'Spain', b'France', b'Germany',
       b'France', b'France'], dtype=object)>, 'Gender': <tf.Tensor: shape=(32,), dtype=string, numpy=
array([b'Female', b'Female', b'Female', b'Male', b'Male', b'Female',
       b'Female', b'Male', b'Male', b'Male', b'Male', b'Female',
       b'Female', b'Female', b'Male', b'Female', b'Male', b'Female',
       b'Female

In [22]:
train_ds_with_no_labels = train_ds.map(lambda x, _: x)
feature_space.adapt(train_ds_with_no_labels)




In [23]:
for x, _ in train_ds.take(1):
    preprocessed_x = feature_space(x)
    print("preprocessed_x.shape:", preprocessed_x.shape)
    print("preprocessed_x.dtype:", preprocessed_x.dtype)

preprocessed_x.shape: (32, 106)
preprocessed_x.dtype: <dtype: 'float32'>


In [24]:
preprocessed_train_ds = train_ds.map(
    lambda x, y: (feature_space(x), y), num_parallel_calls=tf.data.AUTOTUNE
)
preprocessed_train_ds = preprocessed_train_ds.prefetch(tf.data.AUTOTUNE)

preprocessed_val_ds = val_ds.map(
    lambda x, y: (feature_space(x), y), num_parallel_calls=tf.data.AUTOTUNE
)
preprocessed_val_ds = preprocessed_val_ds.prefetch(tf.data.AUTOTUNE)

# Model Training

In [25]:
dict_inputs = feature_space.get_inputs()
encoded_features = feature_space.get_encoded_features()

x = keras.layers.Dense(64, activation="relu")(encoded_features)
x = keras.layers.Dense(32, activation="relu")(encoded_features)
x = keras.layers.Dropout(0.2)(x)
x = keras.layers.Dense(16, activation="relu")(encoded_features)
x = keras.layers.Dense(8, activation="relu")(encoded_features)
predictions = keras.layers.Dense(1, activation="sigmoid")(x)

learning_rate = 0.01
opt = keras.optimizers.Adam(learning_rate=learning_rate)
training_model = keras.Model(inputs=encoded_features, outputs=predictions)
training_model.compile(
    optimizer="adam", loss="binary_crossentropy", metrics=["accuracy"]
)

inference_model = keras.Model(inputs=dict_inputs, outputs=predictions)

In [26]:
training_model.fit(
    preprocessed_train_ds,
    epochs=20,
    validation_data=preprocessed_val_ds,
    verbose=2,
)

Epoch 1/20
250/250 - 1s - 4ms/step - accuracy: 0.7752 - loss: 0.5170 - val_accuracy: 0.8125 - val_loss: 0.4358
Epoch 2/20
250/250 - 0s - 1ms/step - accuracy: 0.8165 - loss: 0.4119 - val_accuracy: 0.8495 - val_loss: 0.3805
Epoch 3/20
250/250 - 0s - 1ms/step - accuracy: 0.8448 - loss: 0.3685 - val_accuracy: 0.8595 - val_loss: 0.3610
Epoch 4/20
250/250 - 0s - 1ms/step - accuracy: 0.8513 - loss: 0.3534 - val_accuracy: 0.8600 - val_loss: 0.3547
Epoch 5/20
250/250 - 0s - 1ms/step - accuracy: 0.8552 - loss: 0.3486 - val_accuracy: 0.8620 - val_loss: 0.3534
Epoch 6/20
250/250 - 0s - 1ms/step - accuracy: 0.8562 - loss: 0.3459 - val_accuracy: 0.8590 - val_loss: 0.3544
Epoch 7/20
250/250 - 0s - 1ms/step - accuracy: 0.8580 - loss: 0.3440 - val_accuracy: 0.8510 - val_loss: 0.3589
Epoch 8/20
250/250 - 0s - 1ms/step - accuracy: 0.8570 - loss: 0.3429 - val_accuracy: 0.8545 - val_loss: 0.3562
Epoch 9/20
250/250 - 0s - 1ms/step - accuracy: 0.8591 - loss: 0.3419 - val_accuracy: 0.8495 - val_loss: 0.3568
E

<keras.src.callbacks.history.History at 0x2192034eff0>

# Inference

In [27]:
sample = {
    "CreditScore": 789,
    "Geography": "Germany",	
    "Gender": "Male",
    "Age": 23,
    "Tenure": 2,
    "Balance": 50000,
    "NumOfProducts": 1,
    "HasCrCard": 1,
    "IsActiveMember": 1,
    "EstimatedSalary": 100000
}

input_dict = {name: tf.convert_to_tensor([value]) for name, value in sample.items()}
predictions = inference_model.predict(input_dict)

print(
    f"This particular bank customer had a {100 * predictions[0][0]:.2f}% probability "
    "of churning/exiting, as evaluated by our model."
)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 161ms/step
This particular bank customer had a 9.10% probability of churning/exiting, as evaluated by our model.
