In [1]:
import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_text as text

In [2]:
import pandas as pd

df = pd.read_csv("datasets/spam.csv")
df.head()

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [3]:
df.groupby('Category').describe()

Unnamed: 0_level_0,Message,Message,Message,Message
Unnamed: 0_level_1,count,unique,top,freq
Category,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
ham,4825,4516,"Sorry, I'll call later",30
spam,747,641,Please call our customer service representativ...,4


In [6]:
# Downsampling
df_spam = df[df["Category"] == "spam"]
df_spam.shape

(747, 2)

In [8]:
df_ham = df[df["Category"] == "ham"].sample(747)
df_ham.shape

(747, 2)

In [9]:
df_balance = pd.concat([df_spam, df_ham])
df_balance.shape

(1494, 2)

In [10]:
df_balance.Category.value_counts()

Category
spam    747
ham     747
Name: count, dtype: int64

In [12]:
df_balance.sample(3)

Unnamed: 0,Category,Message
2338,ham,Tell your friends what you plan to do on Valen...
4520,ham,Just got part Nottingham - 3 hrs 63miles. Good...
5164,spam,Congrats 2 mobile 3G Videophones R yours. call...


In [14]:
df_balance['spam'] = df_balance['Category'].apply(lambda x: 1 if x == "spam" else 0)
df_balance.sample(5)

Unnamed: 0,Category,Message,spam
5378,spam,Free entry to the gr8prizes wkly comp 4 a chan...,1
433,spam,Congrats! Nokia 3650 video camera phone is you...,1
3711,ham,ARE YOU IN TOWN? THIS IS V. IMPORTANT,0
2204,ham,soon you will have the real thing princess! Do...,0
1142,spam,I don't know u and u don't know me. Send CHAT ...,1


In [43]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    df_balance['Message'],
    df_balance['spam'],
    test_size=0.2,
    stratify=df_balance["spam"]
)

In [45]:
X_train.shape, y_train.shape

((1195,), (1195,))

In [46]:
X_train.head()

168     Hi frnd, which is best way to avoid missunders...
420     Send a logo 2 ur lover - 2 names joined by a h...
2365    Ok then no need to tell me anything i am going...
2940          Are you ok. What happen to behave like this
2480    Sppok up ur mob with a Halloween collection of...
Name: Message, dtype: object

In [47]:
preprocess_url = "https://kaggle.com/models/tensorflow/bert/TensorFlow2/en-uncased-preprocess/3"
encoder_url = "https://www.kaggle.com/models/tensorflow/bert/TensorFlow2/en-uncased-l-12-h-768-a-12/4"

In [48]:
bert_preprocess = hub.KerasLayer(preprocess_url)
bert_encoder = hub.KerasLayer(encoder_url)

In [49]:
def get_sentence_embedding(sentences):
    text_preprocessed = bert_preprocess(sentences)
    bert_result = bert_encoder(text_preprocessed)
    return bert_result['pooled_output']

get_sentence_embedding([
    "What do U want for Xmas?",
    "Hiya, had a good day?"
])

<tf.Tensor: shape=(2, 768), dtype=float32, numpy=
array([[-0.9283166 , -0.52188164, -0.90034574, ..., -0.7631455 ,
        -0.7521075 ,  0.94395846],
       [-0.8889644 , -0.41560078, -0.89829564, ..., -0.7373032 ,
        -0.7248404 ,  0.89979535]], dtype=float32)>

In [50]:
e = get_sentence_embedding([
    "banana",
    "grapes",
    "mango", 
    "jeff bezos", 
    "elon musk",
    "bill gates"
])

In [51]:
from sklearn.metrics.pairwise import cosine_similarity

cosine_similarity([e[4]], [e[3]])

array([[0.98720354]], dtype=float32)

**Sequential Vs Functional Model**

https://medium.com/@yashjhawi/tensorflow-keras-sequential-api-vs-functional-api-eaed7c96902b

So far we have built sequential model. But below we will build functional model. More information on these two is here: https://becominghuman.ai/sequential-vs-functional-model-in-keras-20684f766057

In [52]:
class Preprocess(tf.keras.layers.Layer):
    def call(self, inputs):
        return bert_preprocess(inputs)

class Encoder(tf.keras.layers.Layer):
    def call(self, inputs):
        return bert_encoder(inputs)

preprocess_model = Preprocess()
encoder_model = Encoder()
        
# BERT Layer
text_input = tf.keras.layers.Input(shape=(), dtype=tf.string, name="text")
preprocessed_text = preprocess_model(text_input)
outputs = encoder_model(preprocessed_text)

# Neural Network Layer
dropout = tf.keras.layers.Dropout(0.1, name='dropout')(outputs['pooled_output'])
last = tf.keras.layers.Dense(1, activation='sigmoid', name='output')(dropout)

# construct final model
model = tf.keras.Model(inputs=[text_input], outputs=[last])

In [53]:
model.summary()

In [54]:
METRICS = [
    tf.keras.metrics.BinaryAccuracy(name='accuracy'),
    tf.keras.metrics.Precision(name='precision'),
    tf.keras.metrics.Recall(name="recall")
]

model.compile(optimizer='adam',
             loss='binary_crossentropy',
             metrics=METRICS)

In [55]:
X_train = X_train.astype(str).to_numpy()

model.fit(X_train, y_train, epochs=10)

Epoch 1/10


Expected: ['text']
Received: inputs=Tensor(shape=(None,))


[1m38/38[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m64s[0m 2s/step - accuracy: 0.5990 - loss: 0.6631 - precision: 0.6128 - recall: 0.6752
Epoch 2/10
[1m38/38[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m63s[0m 2s/step - accuracy: 0.7250 - loss: 0.5305 - precision: 0.7902 - recall: 0.6589
Epoch 3/10
[1m38/38[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m62s[0m 2s/step - accuracy: 0.8462 - loss: 0.4399 - precision: 0.8247 - recall: 0.8897
Epoch 4/10
[1m38/38[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m62s[0m 2s/step - accuracy: 0.8618 - loss: 0.4032 - precision: 0.8538 - recall: 0.8767
Epoch 5/10
[1m38/38[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m61s[0m 2s/step - accuracy: 0.8910 - loss: 0.3549 - precision: 0.8779 - recall: 0.9171
Epoch 6/10
[1m38/38[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m61s[0m 2s/step - accuracy: 0.8724 - loss: 0.3497 - precision: 0.8391 - recall: 0.9107
Epoch 7/10
[1m38/38[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m61s[0m 2s/st

<keras.src.callbacks.history.History at 0x309dfd520>

In [40]:
X_test = X_test.astype(str).to_numpy()

model.evaluate(X_test, y_test)

3327    Huh so fast... Dat means u havent finished pai...
810       Ugh I don't wanna get out of bed. It's so warm.
870     What do U want for Xmas? How about 100 free te...
4746    Camera - You are awarded a SiPix Digital Camer...
2174    Hiya, had a good day? Have you spoken to since...
Name: Message, dtype: object

In [42]:
X_train.shape, y_train.shape

((5572,), (1195,))