In [1]:
import tensorflow as tf
from tensorflow import keras
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
text_dataset = tf.data.Dataset.from_tensor_slices(["foo", "bar", "baz"])

In [3]:
vectorizer=keras.layers.TextVectorization(standardize='lower_and_strip_punctuation',output_mode='int')

In [4]:
vectorizer.adapt(text_dataset)

In [5]:
# Wrap the adapted vectorizer in a model that takes one raw string per sample.
model = tf.keras.models.Sequential([
    tf.keras.Input(shape=(1,), dtype=tf.string),
    vectorizer,
])

In [6]:
input_data = [["foo qux bar"], ["qux baz"]]

In [7]:
model.predict(input_data)



array([[2, 1, 4],
       [1, 3, 0]], dtype=int64)

In [8]:
vectorizer.get_vocabulary()

['', '[UNK]', 'foo', 'baz', 'bar']

In [9]:
df=pd.read_csv('spam.csv')
df.head(4)

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...


In [10]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Category  5572 non-null   object
 1   Message   5572 non-null   object
dtypes: object(2)
memory usage: 87.2+ KB


In [11]:
vectorizer_layer=keras.layers.TextVectorization(standardize='lower_and_strip_punctuation',output_mode='multi_hot')

In [12]:
vectorizer_layer.adapt(df.Message)

In [13]:
vectorizer_layer(df.Message[0:1])

<tf.Tensor: shape=(1, 9660), dtype=float32, numpy=array([[0., 0., 0., ..., 0., 0., 0.]], dtype=float32)>

In [43]:
len(vectorizer_layer.get_vocabulary()),vectorizer_layer.get_config()

(9660,
 {'name': 'text_vectorization_1',
  'trainable': True,
  'dtype': 'string',
  'batch_input_shape': (None, None),
  'max_tokens': None,
  'standardize': 'lower_and_strip_punctuation',
  'split': 'whitespace',
  'ngrams': None,
  'output_mode': 'multi_hot',
  'output_sequence_length': None,
  'pad_to_max_tokens': False,
  'sparse': False,
  'ragged': False,
  'vocabulary': None,
  'idf_weights': None,
  'encoding': 'utf-8'})

In [14]:
def change_category(val):
    """Encode a category label as an integer: 1 for 'spam', 0 for anything else."""
    return 1 if val == 'spam' else 0

In [15]:
# Encode labels as 0/1 only while the column still holds the raw strings.
# Re-running this cell on already-encoded integers would otherwise push 0/1
# back through change_category and turn every label (spam included) into 0.
if not pd.api.types.is_numeric_dtype(df.Category):
    df.Category=df.Category.map(change_category)
df.Category[:5]

0    0
1    0
2    1
3    0
4    0
Name: Category, dtype: int64

In [16]:
X=df.drop(columns=['Category'])
y=df.Category

In [17]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2,random_state=3)

In [18]:
len(X_train),len(X_test)

(4457, 1115)

In [19]:
# Binary spam classifier: raw message string -> multi-hot vocabulary vector -> MLP.
model=keras.models.Sequential([
    # Messages are fed as raw strings, so the input is declared with a string
    # dtype, matching the other string-input models in this notebook.
    keras.Input(shape=(1,),dtype='string'),
    vectorizer_layer,
    keras.layers.Dense(128,activation='relu'),
    keras.layers.Dense(64,activation='relu'),
    keras.layers.Dense(32,activation='relu'),
    # A single output unit needs sigmoid: softmax normalises over the unit axis,
    # so with one unit it always outputs 1.0 and the model cannot learn.
    keras.layers.Dense(1,activation='sigmoid'),
])
model.compile(optimizer='adam',loss='binary_crossentropy',metrics=['accuracy'])

In [20]:
model.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 text_vectorization_1 (TextV  (None, 9660)             0         
 ectorization)                                                   
                                                                 
 dense (Dense)               (None, 128)               1236608   
                                                                 
 dense_1 (Dense)             (None, 64)                8256      
                                                                 
 dense_2 (Dense)             (None, 32)                2080      
                                                                 
 dense_3 (Dense)             (None, 1)                 33        
                                                                 
Total params: 1,246,977
Trainable params: 1,246,977
Non-trainable params: 0
____________________________________________

In [21]:
def preprocess(x):
    """Run `x` through the notebook-level `vectorizer_layer` and return its output."""
    vectorized = vectorizer_layer(x)
    return vectorized

def forward_pass(x):
    """Stack freshly created Dense layers on `x` and return the final output.

    Three ReLU layers (128, 64 and 32 units, in that order) are followed by a
    single sigmoid unit. New layer objects are instantiated on every call.
    """
    hidden = x
    for units in (128, 64, 32):
        hidden = keras.layers.Dense(units, activation='relu')(hidden)
    return keras.layers.Dense(1, activation='sigmoid')(hidden)

In [22]:
# Functional-API model: one string per sample in, output of forward_pass out.
inputs=keras.Input(shape=(1,),dtype='string')
# preprocess applies vectorizer_layer; forward_pass adds the Dense stack on top.
outputs=forward_pass(preprocess(inputs))
# Rebinds `model`, replacing the Sequential model built earlier in the notebook.
model=keras.Model(inputs,outputs)

In [23]:
model.compile(optimizer='adam',loss='binary_crossentropy',metrics=['accuracy'])

In [24]:
model.summary()

Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_3 (InputLayer)        [(None, 1)]               0         
                                                                 
 text_vectorization_1 (TextV  (None, 9660)             0         
 ectorization)                                                   
                                                                 
 dense_4 (Dense)             (None, 128)               1236608   
                                                                 
 dense_5 (Dense)             (None, 64)                8256      
                                                                 
 dense_6 (Dense)             (None, 32)                2080      
                                                                 
 dense_7 (Dense)             (None, 1)                 33        
                                                             

In [25]:
model.fit(X_train,y_train,epochs=5,batch_size=64)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x241a7d523e0>

In [26]:
model.evaluate(X_test,y_test)



[0.05065804347395897, 0.9847533702850342]

In [27]:
y_pred=model.predict(X_test)



In [28]:
# Threshold each prediction at 0.5 to turn it into a hard 0/1 label.
y_pre = [1 if prob > 0.5 else 0 for prob in y_pred]

In [29]:
from sklearn.metrics import confusion_matrix,classification_report

In [30]:
cm=confusion_matrix(y_test,y_pre)
cm

array([[956,   4],
       [ 13, 142]], dtype=int64)

In [31]:
print(classification_report(y_test,y_pre))

              precision    recall  f1-score   support

           0       0.99      1.00      0.99       960
           1       0.97      0.92      0.94       155

    accuracy                           0.98      1115
   macro avg       0.98      0.96      0.97      1115
weighted avg       0.98      0.98      0.98      1115



In [32]:
adapt_data = np.array([[0., 7., 4.],[2., 9., 6.],[0., 7., 4.],[2., 9., 6.]], dtype='float32')
input_data = np.array([[0., 7., 4.]], dtype='float32')

In [33]:
layer = tf.keras.layers.Normalization(axis=-1)

In [34]:
layer.adapt(adapt_data)

In [35]:
layer(input_data)

<tf.Tensor: shape=(1, 3), dtype=float32, numpy=array([[-1., -1., -1.]], dtype=float32)>

In [36]:
msg=df.Message[2]

In [37]:
input_data=np.array([[msg]])

In [38]:
input_data

array([["Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's"]],
      dtype='<U155')

In [39]:
model.predict(input_data)



array([[0.9999968]], dtype=float32)

In [40]:
df.Category[2]

1