In [1]:
import tensorflow as tf

# Numerical Features Preprocessing

### 1] tf.keras.layers.Normalization

Normalizes the input data by subtracting the mean and dividing by the standard deviation.

$$\text{Normalized Value} = \frac{X - \mu}{\sigma}$$


In [18]:
# Three samples (rows), each with two numeric features (columns).
data = tf.constant(
    [
        [90.0, 75.0],
        [3.0, 4.0],
        [50.0, 25.0],
    ],
    dtype=tf.float32,
)

In [19]:
# Standardization layer; it has no statistics until adapt() is called on it.
normalization_layer = tf.keras.layers.Normalization()

In [20]:
# Adapt the layer to the data -- crucial step.
# 1] A Normalization layer needs to know the mean and variance of the data
#    in order to standardize it (subtract the mean and divide by the
#    standard deviation).
# 2] The adapt method computes these statistics.
normalization_layer.adapt(data)

In [21]:
# Apply the adapted layer to standardize the same data it was adapted on.
normalized_data = normalization_layer(data)

In [22]:
# Show the standardized values as a NumPy array.
normalized_data.numpy()

array([[ 1.1906134 ,  1.3543456 ],
       [-1.2562379 , -1.0297505 ],
       [ 0.06562433, -0.32459527]], dtype=float32)

### 2] tf.keras.layers.Discretization

For a given input $X$, if the bin boundaries are $b_1, b_2, \ldots, b_n$, the discretized value $D$ is:

$$
D = 
\begin{cases} 
0 & \text{if } X < b_1 \\
1 & \text{if } b_1 \leq X < b_2 \\
\vdots & \\
n & \text{if } X \geq b_n
\end{cases}
$$




In [23]:
# Five samples with two features each, all within [0.1, 0.9].
data = tf.constant(
    [
        [0.1, 0.3],
        [0.2, 0.4],
        [0.3, 0.6],
        [0.5, 0.9],
        [0.8, 0.7],
    ],
    dtype=tf.float32,
)

In [30]:
# Two boundaries produce three bins: (-inf, 0.2), [0.2, 0.5), [0.5, +inf),
# which are reported as the bin indices 0, 1 and 2 respectively.
discretized = tf.keras.layers.Discretization(bin_boundaries=[0.2, 0.5])

In [31]:
# Map every value to the index of the bin it falls into.
discrete_values = discretized(data)

In [32]:
# Show the bin indices as a NumPy array.
discrete_values.numpy()

array([[0, 1],
       [1, 1],
       [1, 2],
       [2, 2],
       [2, 2]], dtype=int64)

# Categorical Features Preprocessing

### 1] tf.keras.layers.CategoryEncoding

For a categorical variable $X$ with $n$ categories, the $i$-th component of the one-hot encoded vector $O$ is:

$$
O_i = \begin{cases} 
1 & \text{if } X = \text{category}_i \\
0 & \text{otherwise}
\end{cases}
$$


In [34]:
# Six integer category ids drawn from {0, 1, 2}.
integer_data = tf.constant([0, 1, 2, 1, 2, 0], dtype=tf.int32)

In [35]:
# num_tokens=3 because the data contains the values 0, 1 and 2.
onehot_int = tf.keras.layers.CategoryEncoding(
    num_tokens=3,
    output_mode='one_hot',
)

In [36]:
# One-hot encode each integer id.
encoded_data = onehot_int(integer_data)

In [37]:
# Show the one-hot rows as a NumPy array (one row per input id).
encoded_data.numpy()

array([[1., 0., 0.],
       [0., 1., 0.],
       [0., 0., 1.],
       [0., 1., 0.],
       [0., 0., 1.],
       [1., 0., 0.]], dtype=float32)

### 2] tf.keras.layers.Hashing

For a given input $X$, the hashed value $H$ is:

$$H = \text{hash}(X) \bmod k$$

where $k$ is the number of bins (`num_bins`).

In [54]:
# The higher the num_bins, the more unique hash values can be accommodated,
# potentially reducing collisions (different inputs hashing to the same bin).
data = tf.constant(
    ["apple", "banana", "cherry", "apple"],
    dtype=tf.string,
)

In [64]:
# Hash each string into one of four bins (indices 0..3).
hashing = tf.keras.layers.Hashing(num_bins=4)

In [65]:
# Replace every string with its bin index.
hashed_data = hashing(data)

In [66]:
# Show the bin indices; the two "apple" entries share the same index.
hashed_data.numpy()

array([1, 0, 3, 1], dtype=int64)

### 3] tf.keras.layers.StringLookup

In [68]:
# Four strings with one repeated value ("banana").
data = tf.constant(
    ["apple", "banana", "cherry", "banana"],
    dtype=tf.string,
)

In [69]:
# String-to-index lookup layer created without a preset vocabulary.
look = tf.keras.layers.StringLookup()

In [70]:
# Adapt the lookup layer on the string data before using it to encode.
look.adapt(data)

In [71]:
# Translate every string into its integer index.
encode = look(data)

In [72]:
# Show the integer indices; both "banana" entries map to the same index.
encode.numpy()

array([3, 1, 2, 1], dtype=int64)

### 4] tf.keras.layers.IntegerLookup

In [73]:
# Six integers with repeated values (1 and 6 each appear twice).
data = tf.constant([1, 2, 5, 6, 1, 6], dtype=tf.int32)

In [74]:
# Integer-to-index lookup layer created without a preset vocabulary.
int_look = tf.keras.layers.IntegerLookup()

In [75]:
# Adapt the lookup layer on the integer data before using it to encode.
int_look.adapt(data)

In [None]:
# Apply the adapted IntegerLookup layer to the integer data. A bare `encode`
# here would only re-display the stale StringLookup result from the previous
# section, so the result is stored under its own name instead.
int_encoded = int_look(data)
int_encoded.numpy()