In [2]:
import os

os.environ["KERAS_BACKEND"] = "tensorflow"

import keras
from keras.utils import FeatureSpace
import pandas as pd
import tensorflow as tf
from pathlib import Path
from zipfile import ZipFile

In [3]:
# Download the UCI Bank Marketing archive into the Keras cache.
data_url = "https://archive.ics.uci.edu/static/public/222/bank+marketing.zip"
data_zipped_path = keras.utils.get_file("bank_marketing.zip", data_url, extract=True)
# Directory holding the downloaded file; the code below expects a nested
# `bank-additional.zip` to be present there after the outer extraction.
keras_datasets_path = Path(data_zipped_path).parents[0]
# The handle is named `archive` (not `zip`) so the builtin `zip()` is not
# shadowed for the rest of the notebook.
with ZipFile(keras_datasets_path / "bank-additional.zip", "r") as archive:
    # Extract files
    archive.extractall(path=keras_datasets_path)

# The CSV is semicolon-separated.
dataframe = pd.read_csv(
    keras_datasets_path / "bank-additional" / "bank-additional.csv", sep=";"
)

Downloading data from https://archive.ics.uci.edu/static/public/222/bank+marketing.zip
 851968/Unknown [1m1s[0m 1us/step

In [4]:
# Drop `duration` to avoid target leakage. Reassigning instead of using
# `inplace=True`, together with `errors="ignore"`, keeps this cell idempotent:
# re-running it no longer raises a KeyError once the column is gone.
dataframe = dataframe.drop(columns="duration", errors="ignore")
# Create the new feature `previously_contacted`: 0 when `pdays` equals 999,
# 1 otherwise. The vectorised comparison replaces a per-row Python lambda.
dataframe["previously_contacted"] = (dataframe["pdays"] != 999).astype("int64")

In [5]:
# Quick look at the loaded table: its (rows, columns) shape and first rows.
print("Dataframe shape:", dataframe.shape)
print(dataframe.head(5))

Dataframe shape: (4119, 21)
   age          job  marital          education default  housing     loan  \
0   30  blue-collar  married           basic.9y      no      yes       no   
1   39     services   single        high.school      no       no       no   
2   25     services  married        high.school      no      yes       no   
3   38     services  married           basic.9y      no  unknown  unknown   
4   47       admin.  married  university.degree      no      yes       no   

     contact month day_of_week  ...  pdays  previous     poutcome  \
0   cellular   may         fri  ...    999         0  nonexistent   
1  telephone   may         fri  ...    999         0  nonexistent   
2  telephone   jun         wed  ...    999         0  nonexistent   
3  telephone   jun         fri  ...    999         0  nonexistent   
4   cellular   nov         mon  ...    999         0  nonexistent   

  emp.var.rate  cons.price.idx  cons.conf.idx  euribor3m  nr.employed   y  \
0         -1.8   

In [6]:
# Hold out a seeded 20% random sample for validation; every row that was not
# sampled stays in the training frame.
valid_dataframe = dataframe.sample(frac=0.2, random_state=0)
train_dataframe = dataframe.drop(index=valid_dataframe.index)

print(
    f"Using {len(train_dataframe)} samples for training and "
    f"{len(valid_dataframe)} for validation"
)

Using 3295 samples for training and 824 for validation


In [8]:
# Lookup layer used to turn the string target into an integer index.
label_lookup = keras.layers.StringLookup(
    # The order here is important since the first entry will be encoded as 0
    # (so "no" -> 0 and "yes" -> 1); no out-of-vocabulary slots are reserved.
    vocabulary=["no", "yes"],
    num_oov_indices=0,
)


def encode_label(x, y):
    """Pass the features `x` through unchanged and encode the label `y`.

    The label is converted by the notebook-level `label_lookup` layer.

    Returns:
        A `(x, encoded_y)` tuple.
    """
    return x, label_lookup(y)


def dataframe_to_dataset(dataframe):
    """Turn a pandas dataframe into a shuffled `tf.data.Dataset`.

    Args:
        dataframe: Frame that contains the target column `"y"`. It is copied
            first, so the caller's frame is left untouched.

    Returns:
        A dataset of `(features_dict, encoded_label)` pairs, where labels are
        encoded by `encode_label` and the shuffle buffer covers every row.
    """
    features = dataframe.copy()
    targets = features.pop("y")
    dataset = tf.data.Dataset.from_tensor_slices((dict(features), targets))
    encoded = dataset.map(encode_label, num_parallel_calls=tf.data.AUTOTUNE)
    return encoded.shuffle(buffer_size=len(features))


# Build the (unbatched) training and validation datasets from their frames.
train_ds, valid_ds = (
    dataframe_to_dataset(frame) for frame in (train_dataframe, valid_dataframe)
)

In [9]:
# Peek at a single (features, label) pair. A fresh dataset is built from the
# training frame here, so this cell does not depend on the current state of
# `train_ds`.
for x, y in dataframe_to_dataset(train_dataframe).take(1):
    print(f"Input: {x}")
    print(f"Target: {y}")

Input: {'age': <tf.Tensor: shape=(), dtype=int64, numpy=60>, 'job': <tf.Tensor: shape=(), dtype=string, numpy=b'admin.'>, 'marital': <tf.Tensor: shape=(), dtype=string, numpy=b'married'>, 'education': <tf.Tensor: shape=(), dtype=string, numpy=b'basic.9y'>, 'default': <tf.Tensor: shape=(), dtype=string, numpy=b'no'>, 'housing': <tf.Tensor: shape=(), dtype=string, numpy=b'yes'>, 'loan': <tf.Tensor: shape=(), dtype=string, numpy=b'no'>, 'contact': <tf.Tensor: shape=(), dtype=string, numpy=b'cellular'>, 'month': <tf.Tensor: shape=(), dtype=string, numpy=b'aug'>, 'day_of_week': <tf.Tensor: shape=(), dtype=string, numpy=b'thu'>, 'campaign': <tf.Tensor: shape=(), dtype=int64, numpy=1>, 'pdays': <tf.Tensor: shape=(), dtype=int64, numpy=999>, 'previous': <tf.Tensor: shape=(), dtype=int64, numpy=0>, 'poutcome': <tf.Tensor: shape=(), dtype=string, numpy=b'nonexistent'>, 'emp.var.rate': <tf.Tensor: shape=(), dtype=float64, numpy=1.4>, 'cons.price.idx': <tf.Tensor: shape=(), dtype=float64, numpy=93

In [10]:
# Features-only view of the training data (labels dropped), used for adapt().
train_ds_with_no_labels = train_ds.map(lambda features, _label: features)


def example_feature_space(dataset, feature_space, feature_names):
    """Adapt `feature_space` on `dataset` and print one before/after example.

    Args:
        dataset: Features-only dataset; must support `take(1)` and yield
            mappings indexable by feature name.
        feature_space: Object exposing `adapt(dataset)` and callable on a dict
            of inputs, returning a dict of preprocessed values.
        feature_names: Names of the features to pick from the sample.

    Returns:
        None. The raw and preprocessed values are printed.
    """
    feature_space.adapt(dataset)
    for sample in dataset.take(1):
        # Keep only the requested features from the sample.
        selected = {}
        for name in feature_names:
            selected[name] = sample[name]
        encoded = feature_space(selected)

        raw_view = [{name: value.numpy()} for name, value in selected.items()]
        print(f"Input: {raw_view}")
        encoded_view = [{name: value.numpy()} for name, value in encoded.items()]
        print(f"Preprocessed output: {encoded_view}")

In [11]:
# Demo: hash the integer `campaign` feature into 4 bins and one-hot encode
# the resulting bin.
feature_space = FeatureSpace(
    features={
        "campaign": FeatureSpace.integer_hashed(num_bins=4, output_mode="one_hot")
    },
    # Return a dict keyed by feature name (the helper below iterates over it).
    output_mode="dict",
)
example_feature_space(train_ds_with_no_labels, feature_space, ["campaign"])

Input: [{'campaign': 3}]
Preprocessed output: [{'campaign': array([0., 1., 0., 0.], dtype=float32)}]


In [12]:
# Demo: hash the string `education` feature into 3 bins and one-hot encode
# the resulting bin.
feature_space = FeatureSpace(
    features={
        "education": FeatureSpace.string_hashed(num_bins=3, output_mode="one_hot")
    },
    # Return a dict keyed by feature name (the helper below iterates over it).
    output_mode="dict",
)
example_feature_space(train_ds_with_no_labels, feature_space, ["education"])

Input: [{'education': b'professional.course'}]
Preprocessed output: [{'education': array([0., 0., 1.], dtype=float32)}]


In [13]:
# Demo: discretize the numeric `age` feature into 3 bins and one-hot encode
# the resulting bin.
feature_space = FeatureSpace(
    features={"age": FeatureSpace.float_discretized(num_bins=3, output_mode="one_hot")},
    # Return a dict keyed by feature name (the helper below iterates over it).
    output_mode="dict",
)
example_feature_space(train_ds_with_no_labels, feature_space, ["age"])

Input: [{'age': 48}]
Preprocessed output: [{'age': array([0., 0., 1.], dtype=float32)}]


In [14]:
# Demo: treat the string `default` feature as categorical, reserving one
# out-of-vocabulary index, and one-hot encode it.
feature_space = FeatureSpace(
    features={
        "default": FeatureSpace.string_categorical(
            num_oov_indices=1, output_mode="one_hot"
        )
    },
    # Return a dict keyed by feature name (the helper below iterates over it).
    output_mode="dict",
)
example_feature_space(train_ds_with_no_labels, feature_space, ["default"])

Input: [{'default': b'no'}]
Preprocessed output: [{'default': array([0., 1., 0., 0.], dtype=float32)}]


In [15]:
# Demo: treat the integer `previously_contacted` flag as categorical with no
# out-of-vocabulary index, and one-hot encode it.
feature_space = FeatureSpace(
    features={
        "previously_contacted": FeatureSpace.integer_categorical(
            num_oov_indices=0, output_mode="one_hot"
        )
    },
    # Return a dict keyed by feature name (the helper below iterates over it).
    output_mode="dict",
)
example_feature_space(train_ds_with_no_labels, feature_space, ["previously_contacted"])

Input: [{'previously_contacted': 0}]
Preprocessed output: [{'previously_contacted': array([1., 0.], dtype=float32)}]


In [None]:
# Demo: cross two features. `age` is hashed into 6 bins, `job` is categorical,
# and the ("age", "job") cross uses a crossing dimension of 8, one-hot encoded.
# NOTE(review): this cell has no execution count or recorded output in the
# notebook — confirm it runs before relying on it.
feature_space = FeatureSpace(
    features={
        "age": FeatureSpace.integer_hashed(num_bins=6, output_mode="one_hot"),
        "job": FeatureSpace.string_categorical(
            num_oov_indices=0, output_mode="one_hot"
        ),
    },
    crosses=[
        FeatureSpace.cross(
            feature_names=("age", "job"),
            crossing_dim=8,
            output_mode="one_hot",
        )
    ],
    # Return a dict keyed by feature name (the helper below iterates over it).
    output_mode="dict",
)
example_feature_space(train_ds_with_no_labels, feature_space, ["age", "job"])

In [16]:
# Demo: plug a custom preprocessing layer into a FeatureSpace. Here a
# TextVectorization layer in TF-IDF mode handles the string `education`
# feature through the generic `FeatureSpace.feature` entry point.
custom_layer = keras.layers.TextVectorization(output_mode="tf_idf")

feature_space = FeatureSpace(
    features={
        "education": FeatureSpace.feature(
            preprocessor=custom_layer, dtype="string", output_mode="float"
        )
    },
    # Return a dict keyed by feature name (the helper below iterates over it).
    output_mode="dict",
)
example_feature_space(train_ds_with_no_labels, feature_space, ["education"])

Input: [{'education': b'university.degree'}]
Preprocessed output: [{'education': array([0.       , 1.4574516, 0.       , 0.       , 0.       , 0.       ,
       0.       , 0.       , 0.       ], dtype=float32)}]


In [17]:
# Full preprocessing configuration used for training: one entry per input
# column, plus three feature crosses, all concatenated into a single vector.
feature_space = FeatureSpace(
    features={
        # Categorical features encoded as integers
        "previously_contacted": FeatureSpace.integer_categorical(num_oov_indices=0),
        # Categorical features encoded as string
        "marital": FeatureSpace.string_categorical(num_oov_indices=0),
        "education": FeatureSpace.string_categorical(num_oov_indices=0),
        "default": FeatureSpace.string_categorical(num_oov_indices=0),
        "housing": FeatureSpace.string_categorical(num_oov_indices=0),
        "loan": FeatureSpace.string_categorical(num_oov_indices=0),
        "contact": FeatureSpace.string_categorical(num_oov_indices=0),
        "month": FeatureSpace.string_categorical(num_oov_indices=0),
        "day_of_week": FeatureSpace.string_categorical(num_oov_indices=0),
        "poutcome": FeatureSpace.string_categorical(num_oov_indices=0),
        # Categorical features to hash and bin
        "job": FeatureSpace.string_hashed(num_bins=3),
        # Numerical features to hash and bin
        "pdays": FeatureSpace.integer_hashed(num_bins=4),
        # Numerical features to discretize into bins
        "age": FeatureSpace.float_discretized(num_bins=4),
        # Numerical features to normalize
        "campaign": FeatureSpace.float_normalized(),
        "previous": FeatureSpace.float_normalized(),
        "emp.var.rate": FeatureSpace.float_normalized(),
        "cons.price.idx": FeatureSpace.float_normalized(),
        "cons.conf.idx": FeatureSpace.float_normalized(),
        "euribor3m": FeatureSpace.float_normalized(),
        "nr.employed": FeatureSpace.float_normalized(),
    },
    # Specify feature cross with a custom crossing dim.
    crosses=[
        FeatureSpace.cross(feature_names=("age", "job"), crossing_dim=8),
        FeatureSpace.cross(feature_names=("housing", "loan"), crossing_dim=6),
        FeatureSpace.cross(
            feature_names=("poutcome", "previously_contacted"), crossing_dim=2
        ),
    ],
    # Concatenate every encoded feature and cross into one vector per sample.
    output_mode="concat",
)

In [18]:
# Build the batched datasets straight from the dataframes instead of
# re-batching `train_ds` / `valid_ds` in place. `ds = ds.batch(32)` is not
# idempotent: re-running that cell would batch the already-batched datasets
# again. Rebuilding from the frames gives the same result on every run.
train_ds = dataframe_to_dataset(train_dataframe).batch(32)
valid_ds = dataframe_to_dataset(valid_dataframe).batch(32)

# adapt() only needs the features, so drop the labels before adapting.
train_ds_with_no_labels = train_ds.map(lambda x, _: x)
feature_space.adapt(train_ds_with_no_labels)

In [19]:
# Sanity check: push one batch through the adapted feature space and print
# the shape of the encoded batch and its first row.
for x, _ in train_ds.take(1):
    preprocessed_x = feature_space(x)
    print(f"preprocessed_x shape: {preprocessed_x.shape}")
    print(f"preprocessed_x sample: \n{preprocessed_x[0]}")

preprocessed_x shape: (32, 77)
preprocessed_x sample: 
[ 1.          0.          0.          0.         -0.58789206  1.5033054
 -1.9100648   1.          0.          0.          0.          0.
  1.          0.          1.          0.          0.          0.
  1.          0.          0.          0.          0.          0.
  0.         -1.9028114  -1.4860481   0.          1.          0.
  0.          1.          0.          1.          0.          0.
  0.          1.          0.          0.          0.          1.
  0.          0.          0.          0.          0.          0.
  0.          0.         -1.2171593   0.          0.          1.
  0.          0.          1.          0.          1.4951228   1.
  0.          0.          0.          0.          1.          0.
  0.          0.          0.          0.          0.          0.
  0.          0.          1.          0.          1.        ]


In [20]:
# Persist the adapted FeatureSpace to disk; it is reloaded later in the
# notebook to build the inference model.
feature_space.save("myfeaturespace.keras")

In [21]:
# Apply the feature space inside the tf.data pipeline, so the model below
# receives already-encoded features; labels pass through unchanged.
preprocessed_train_ds = train_ds.map(
    lambda features, label: (feature_space(features), label),
    num_parallel_calls=tf.data.AUTOTUNE,
)
preprocessed_train_ds = preprocessed_train_ds.prefetch(tf.data.AUTOTUNE)

preprocessed_valid_ds = valid_ds.map(
    lambda features, label: (feature_space(features), label),
    num_parallel_calls=tf.data.AUTOTUNE,
)
preprocessed_valid_ds = preprocessed_valid_ds.prefetch(tf.data.AUTOTUNE)

In [22]:
# Symbolic tensor describing the feature space's encoded output; it is used
# as the input of the training model defined in the next cell.
encoded_features = feature_space.get_encoded_features()
print(encoded_features)

<KerasTensor shape=(None, 77), dtype=float32, sparse=False, name=keras_tensor_50>


In [23]:
# Small classifier on top of the encoded features: one hidden ReLU layer,
# dropout, and a single sigmoid unit for the binary target.
x = keras.layers.Dense(64, activation="relu")(encoded_features)
x = keras.layers.Dropout(0.5)(x)
output = keras.layers.Dense(1, activation="sigmoid")(x)

# The model takes already-encoded features as input, matching the
# preprocessed datasets it is trained on.
model = keras.Model(inputs=encoded_features, outputs=output)
model.compile(optimizer="adam", loss="binary_crossentropy", metrics=["accuracy"])

In [24]:
# Train on the preprocessed batches and validate on the held-out split.
# verbose=2 prints one summary line per epoch.
model.fit(
    preprocessed_train_ds, validation_data=preprocessed_valid_ds, epochs=20, verbose=2
)

Epoch 1/20
103/103 - 6s - 54ms/step - accuracy: 0.8580 - loss: 0.3838 - val_accuracy: 0.9138 - val_loss: 0.2610
Epoch 2/20
103/103 - 1s - 7ms/step - accuracy: 0.8920 - loss: 0.3136 - val_accuracy: 0.9126 - val_loss: 0.2608
Epoch 3/20
103/103 - 1s - 12ms/step - accuracy: 0.8962 - loss: 0.2973 - val_accuracy: 0.9041 - val_loss: 0.2632
Epoch 4/20
103/103 - 1s - 8ms/step - accuracy: 0.9017 - loss: 0.2928 - val_accuracy: 0.9053 - val_loss: 0.2636
Epoch 5/20
103/103 - 2s - 16ms/step - accuracy: 0.8971 - loss: 0.2984 - val_accuracy: 0.9078 - val_loss: 0.2660
Epoch 6/20
103/103 - 1s - 9ms/step - accuracy: 0.9002 - loss: 0.2865 - val_accuracy: 0.9078 - val_loss: 0.2673
Epoch 7/20
103/103 - 1s - 7ms/step - accuracy: 0.9041 - loss: 0.2839 - val_accuracy: 0.9078 - val_loss: 0.2631
Epoch 8/20
103/103 - 1s - 12ms/step - accuracy: 0.9020 - loss: 0.2816 - val_accuracy: 0.9102 - val_loss: 0.2623
Epoch 9/20
103/103 - 1s - 12ms/step - accuracy: 0.8992 - loss: 0.2806 - val_accuracy: 0.9102 - val_loss: 0.2

<keras.src.callbacks.history.History at 0x7d5c00114850>

In [25]:
# Reload the FeatureSpace that was saved to "myfeaturespace.keras" earlier.
loaded_feature_space = keras.saving.load_model("myfeaturespace.keras")

In [27]:
# Build an end-to-end inference model: raw feature dict -> reloaded feature
# space -> trained classifier.
dict_inputs = loaded_feature_space.get_inputs()
# NOTE: this rebinds `encoded_features`, which an earlier cell set from the
# original (non-reloaded) feature space.
encoded_features = loaded_feature_space.get_encoded_features()
print(encoded_features)

print(dict_inputs)

# Chain the trained model onto the reloaded feature space's encoded output.
outputs = model(encoded_features)
inference_model = keras.Model(inputs=dict_inputs, outputs=outputs)

# One raw (unprocessed) client record, keyed by the original column names.
sample = {
    "age": 60,
    "job": "blue-collar",
    "marital": "married",
    "education": "basic.9y",
    "default": "no",
    "housing": "yes",
    "loan": "no",
    "contact": "cellular",
    "month": "may",
    "day_of_week": "fri",
    "campaign": 2,
    "pdays": 999,
    "previous": 0,
    "poutcome": "nonexistent",
    "emp.var.rate": -1.8,
    "cons.price.idx": 92.893,
    "cons.conf.idx": -46.2,
    "euribor3m": 1.313,
    "nr.employed": 5099.1,
    "previously_contacted": 0,
}

# Wrap every value in a one-element list so each input is a batch of size 1.
input_dict = {
    name: keras.ops.convert_to_tensor([value]) for name, value in sample.items()
}
predictions = inference_model.predict(input_dict)

# predictions[0][0] is the sigmoid output for the single sample.
print(
    f"This particular client has a {100 * predictions[0][0]:.2f}% probability "
    "of subscribing a term deposit, as evaluated by our model."
)

<KerasTensor shape=(None, 77), dtype=float32, sparse=False, name=keras_tensor_93>
{'previously_contacted': <KerasTensor shape=(None, 1), dtype=int32, sparse=None, name=previously_contacted>, 'marital': <KerasTensor shape=(None, 1), dtype=string, sparse=None, name=marital>, 'education': <KerasTensor shape=(None, 1), dtype=string, sparse=None, name=education>, 'default': <KerasTensor shape=(None, 1), dtype=string, sparse=None, name=default>, 'housing': <KerasTensor shape=(None, 1), dtype=string, sparse=None, name=housing>, 'loan': <KerasTensor shape=(None, 1), dtype=string, sparse=None, name=loan>, 'contact': <KerasTensor shape=(None, 1), dtype=string, sparse=None, name=contact>, 'month': <KerasTensor shape=(None, 1), dtype=string, sparse=None, name=month>, 'day_of_week': <KerasTensor shape=(None, 1), dtype=string, sparse=None, name=day_of_week>, 'poutcome': <KerasTensor shape=(None, 1), dtype=string, sparse=None, name=poutcome>, 'job': <KerasTensor shape=(None, 1), dtype=string, sparse=