In [2]:
import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_text as text

In [6]:
import pandas as pd

# Load the labelled SMS corpus; "spam.csv" is resolved relative to the working directory.
df = pd.read_csv("spam.csv")
# Peek at the first rows to check the Category / Message columns.
df.head(n=5)

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [7]:
# Per-label summary of the messages (count, unique, most frequent text, its frequency).
df.groupby(by='Category').describe()

Unnamed: 0_level_0,Message,Message,Message,Message
Unnamed: 0_level_1,count,unique,top,freq
Category,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
ham,4825,4516,"Sorry, I'll call later",30
spam,747,641,Please call our customer service representativ...,4


In [8]:
# Class balance check: number of rows per label.
df.loc[:, "Category"].value_counts()

Category
ham     4825
spam     747
Name: count, dtype: int64

In [9]:
# Separate the two labels so the classes can be balanced independently.
spam_mask = df["Category"] == 'spam'
ham_mask = df["Category"] == 'ham'
df_spam = df.loc[spam_mask]
df_ham = df.loc[ham_mask]
print(df_ham.shape, df_spam.shape)


(4825, 2) (747, 2)


In [10]:
# Undersample the majority (ham) class down to the number of spam rows.
# A fixed seed keeps the chosen subset -- and therefore every downstream split
# and metric -- identical across "Restart Kernel -> Run All".
df_ham_downsampled = df_ham.sample(n=df_spam.shape[0], random_state=42)

In [11]:
# Stack the spam rows and the downsampled ham rows into one balanced frame,
# then confirm both labels have the same number of rows.
balanced_parts = [df_spam, df_ham_downsampled]
df_balanced = pd.concat(balanced_parts)
df_balanced["Category"].value_counts()

Category
spam    747
ham     747
Name: count, dtype: int64

In [12]:
# Numeric target column: 1 when the label is "spam", 0 for anything else.
df_balanced["spam"] = df_balanced['Category'].apply(lambda label: int(label == "spam"))
# Random rows as a spot check of the new column.
df_balanced.sample(n=5)

Unnamed: 0,Category,Message,spam
4976,ham,Yar... I tot u knew dis would happen long ago ...,0
4016,spam,You will be receiving this week's Triple Echo ...,1
2420,spam,SMS SERVICES For your inclusive text credits p...,1
3382,spam,"complimentary 4 STAR Ibiza Holiday or £10,000 ...",1
1826,ham,Wat makes some people dearer is not just de ha...,0


In [13]:
from sklearn.model_selection import train_test_split
# Stratified split so train and test keep the same spam/ham ratio
# (test_size is left at the scikit-learn default).
# random_state pins the shuffle so the metrics reported below are reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    df_balanced["Message"],
    df_balanced['spam'],
    stratify=df_balanced['spam'],
    random_state=42,
)

In [14]:
# Show a few training messages.
x_train.head(n=4)

1475    Friendship is not a game to play, It is not a ...
3091    Am going to take bath ill place the key in win...
2430    Guess who am I?This is the first time I create...
4860    Hey, a guy I know is breathing down my neck to...
Name: Message, dtype: object

In [4]:
# TF-Hub handles for the BERT text preprocessor and the encoder it feeds.
BERT_PREPROCESS_URL = "https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3"
BERT_ENCODER_URL = "https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/4"

bert_preprocess = hub.KerasLayer(BERT_PREPROCESS_URL)
bert_encoder = hub.KerasLayer(BERT_ENCODER_URL)

In [16]:
def get_sentence_embeding(sentences):
    """Return the BERT ``pooled_output`` embedding for each input sentence.

    ``sentences`` is passed through ``bert_preprocess`` and the result is fed
    to ``bert_encoder``; only the encoder's ``'pooled_output'`` entry is returned.
    """
    encoder_inputs = bert_preprocess(sentences)
    encoder_outputs = bert_encoder(encoder_inputs)
    return encoder_outputs['pooled_output']

In [18]:
# Embed three fruit names and three people's names to compare the vectors.
sample_phrases = [
    'banana',
    'grapes',
    'mango',
    'jeff bezos',
    'elon musk',
    'bill gates',
]
e = get_sentence_embeding(sample_phrases)
e

<tf.Tensor: shape=(6, 768), dtype=float32, numpy=
array([[-0.7606918 , -0.1421939 ,  0.49604574, ...,  0.42165285,
        -0.5322141 ,  0.80312175],
       [-0.86023235, -0.21242955,  0.49156868, ...,  0.39797997,
        -0.60506296,  0.8447167 ],
       [-0.7128861 , -0.154639  ,  0.38401723, ...,  0.3527877 ,
        -0.5099134 ,  0.734741  ],
       [-0.82533467, -0.3555055 , -0.5906963 , ..., -0.01613665,
        -0.61417556,  0.8723029 ],
       [-0.75041366, -0.26812616, -0.26689747, ...,  0.02839359,
        -0.59380996,  0.797499  ],
       [-0.78544384, -0.29949707,  0.41027373, ...,  0.52225375,
        -0.49573553,  0.81507534]], dtype=float32)>

In [20]:
from sklearn.metrics.pairwise import cosine_similarity

# Cosine similarity between the first two embeddings in `e` ('banana' vs 'grapes').
# Each vector is wrapped in a list so cosine_similarity receives 2-D inputs.
cosine_similarity([e[0]], [e[1]])

array([[0.9911088]], dtype=float32)

In [21]:
# Functional-API classifier:
# raw string -> BERT preprocessing -> BERT encoder -> dropout -> one sigmoid unit.
text_input = tf.keras.layers.Input(shape=(), dtype=tf.string, name="text")
prepoccesed_text = bert_preprocess(text_input)
outputs = bert_encoder(prepoccesed_text)

# Classification head on top of the encoder's pooled output.
pooled_output = outputs['pooled_output']
l1 = tf.keras.layers.Dropout(0.1, name='dropout')(pooled_output)
l2 = tf.keras.layers.Dense(units=1, activation='sigmoid', name='output')(l1)

model = tf.keras.Model(inputs=[text_input], outputs=[l2])
model.summary()



Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 text (InputLayer)              [(None,)]            0           []                               
                                                                                                  
 keras_layer (KerasLayer)       {'input_word_ids':   0           ['text[0][0]']                   
                                (None, 128),                                                      
                                 'input_mask': (Non                                               
                                e, 128),                                                          
                                 'input_type_ids':                                                
                                (None, 128)}                                                  

In [22]:
# Metrics reported during training: accuracy, precision and recall.
METRICS = [
    tf.keras.metrics.BinaryAccuracy(name='accuracy'),
    tf.keras.metrics.Precision(name='precision'),
    tf.keras.metrics.Recall(name='recall'),
]

model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=METRICS,
)

In [24]:
# Train on the raw message strings for 10 epochs.
model.fit(x=x_train, y=y_train, epochs=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x22d9304d6f0>

In [25]:
# Predicted values for the held-out messages, flattened to a 1-D array.
y_predicted = model.predict(x_test).flatten()




In [27]:
import numpy as np

# Turn the predicted values into hard 0/1 labels with a 0.5 cut-off.
above_cutoff = y_predicted > 0.5
y_predicted = np.where(above_cutoff, 1, 0)
y_predicted

array([1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0,
       0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0,
       0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 0, 0, 1, 0, 0,
       0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0,
       1, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 1, 1, 1, 0,
       1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 1, 0, 1,
       1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1,
       0, 0, 1, 0, 1, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 1, 0,
       0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 1, 0, 1, 1, 1, 0, 1, 0,
       1, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1,
       0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1,
       1, 0, 1, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0,
       1, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1,
       1, 1, 0, 0, 1, 1, 1, 0, 0, 1, 1, 0, 0, 0, 1,

In [28]:
from sklearn.metrics import confusion_matrix, classification_report

# Confusion matrix of the true test labels against the thresholded predictions.
cm = confusion_matrix(y_true=y_test, y_pred=y_predicted)
cm


array([[173,  14],
       [ 12, 175]], dtype=int64)

In [None]:
from matplotlib import pyplot as plt
import seaborn as sb

# confusion_matrix puts the true labels on rows and the predictions on columns,
# so label both axes; the figure should be readable without the code.
sb.heatmap(cm, annot=True, fmt='d')
plt.xlabel('Predicted')
plt.ylabel('Truth');  # trailing ';' hides the Text(...) repr in the cell output


In [30]:
# Save the model weights to an HDF5 file in the working directory.
model.save_weights(filepath="model_1_weights.h5")

In [33]:
# Hand-written sample messages used for a quick prediction check.
reviews = [
    'Enter a chance to win $5000, hurry up, offer valid until march 31, 2021',
    'You are awarded a SiPix Digital Camera! call 09061221061 from landline. Delivery within 28days. T Cs Box177. M221BP. 2yr warranty. 150ppm. 16 . p pÂ£3.99',
    'it to 80488. Your 500 free text messages are valid until 31 December 2005.',
    'Hey Sam, Are you coming for a cricket game tomorrow',
    "Why don't you wait 'til at least wednesday to see if you get your .",
]

# One predicted value per message.
model.predict(x=reviews)



array([[0.7315312 ],
       [0.83868253],
       [0.77738756],
       [0.22029448],
       [0.12724479]], dtype=float32)

In [36]:
def predict_spam_probability(texts):
    """Run ``model.predict`` on ``texts`` and return the result flattened to 1-D."""
    return model.predict(texts).flatten()

In [37]:
predictions = predict_spam_probability(reviews)

# The loop variable is deliberately not named `text`: that name is bound to the
# `tensorflow_text` module imported at the top of the notebook, and rebinding it
# here would silently replace the module for every later cell.
for message, probability in zip(reviews, predictions):
    print(f'The probability is {probability} for this string: {message}')

The probability is 0.7315312027931213 for this string: Enter a chance to win $5000, hurry up, offer valid until march 31, 2021
The probability is 0.8386825323104858 for this string: You are awarded a SiPix Digital Camera! call 09061221061 from landline. Delivery within 28days. T Cs Box177. M221BP. 2yr warranty. 150ppm. 16 . p pÂ£3.99
The probability is 0.7773875594139099 for this string: it to 80488. Your 500 free text messages are valid until 31 December 2005.
The probability is 0.22029447555541992 for this string: Hey Sam, Are you coming for a cricket game tomorrow
The probability is 0.12724478542804718 for this string: Why don't you wait 'til at least wednesday to see if you get your .


In [3]:
# Record the TensorFlow version this notebook was run with.
tf_version = tf.__version__
print(tf_version)

2.10.1
