In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_text as text

In [2]:
# Read the tab-separated SMS file. Column names are supplied explicitly via
# `names`, so the first line of the file is parsed as a data row.
data = pd.read_csv(
    'SMSSpamCollection.tsv',
    sep='\t',
    names=['target', 'text'],
)
data.head()

Unnamed: 0,target,text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [3]:
# Class balance of the label column (how many rows carry each label).
data['target'].value_counts()

ham     4825
spam     747
Name: target, dtype: int64

In [4]:
# Encode the label column as integers: 1 for 'spam', 0 for everything else.
# The membership test also accepts an already-encoded 1, so re-running this
# cell leaves the labels intact. The previous `x == 'spam'` check turned every
# row into 0 on a second run, because the column no longer held strings.
data['target'] = data['target'].isin(['spam', 1]).astype('int64')

In [5]:
# Preview the first rows again to confirm the label column is now numeric.
data.head(n=5)

Unnamed: 0,target,text
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."


In [6]:
# Split messages and labels into train and test parts using train_test_split's
# default test fraction.
# - random_state pins the shuffle so the split (and everything trained on it)
#   is reproducible after a kernel restart.
# - stratify keeps the ham/spam ratio the same in both parts; the classes are
#   imbalanced, so an unstratified split can skew the spam share of the test set.
RANDOM_STATE = 42
X_train, X_test, y_train, y_test = train_test_split(
    data['text'],
    data['target'],
    random_state=RANDOM_STATE,
    stratify=data['target'],
)

In [7]:
# Display the training messages returned by train_test_split (the 'text' column).
X_train

3566    We know TAJ MAHAL as symbol of love. But the o...
4363    I can't right this second, gotta hit people up...
28      I'm back &amp; we're packing the car now, I'll...
1067         Once free call me sir. I am waiting for you.
3468    URGENT! We are trying to contact U. Todays dra...
                              ...                        
538     Or maybe my fat fingers just press all these b...
1393    Thk shld b can... Ya, i wana go 4 lessons... H...
1270             Tee hee. Off to lecture, cheery bye bye.
154     As per your request 'Melle Melle (Oru Minnamin...
2410    Aww that's the first time u said u missed me w...
Name: text, Length: 4179, dtype: object

In [8]:
# Display the training labels returned by train_test_split (the 'target' column).
y_train

3566    0
4363    0
28      0
1067    0
3468    1
       ..
538     0
1393    0
1270    0
154     0
2410    0
Name: target, Length: 4179, dtype: int64

In [15]:
# TF Hub preprocessing layer for uncased English BERT; it is applied directly
# to the raw string input when the model graph is built.
bert_preprocess = hub.KerasLayer(
    handle='https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
)

In [17]:
# TF Hub BERT encoder (uncased English, L-12 / H-768 / A-12 per the handle).
# `trainable` is not passed, so hub.KerasLayer's default applies.
bert_encoder = hub.KerasLayer(
    handle='https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/4',
)

In [18]:
# Functional-API graph: raw string -> BERT preprocessing -> BERT encoder
# -> dropout -> single sigmoid unit.

# Scalar string input: each example is one message.
text_input = tf.keras.layers.Input(shape=(), dtype=tf.string, name='text')

# Preprocess the text and run it through the encoder.
preprocessed_text = bert_preprocess(text_input)
outputs = bert_encoder(preprocessed_text)

# Classification head on the encoder's 'pooled_output' entry.
dropout_layer = tf.keras.layers.Dropout(rate=0.3, name='dropout')
l1 = dropout_layer(outputs['pooled_output'])

output_layer = tf.keras.layers.Dense(units=1, activation='sigmoid', name='output')
l2 = output_layer(l1)

In [19]:
# Wrap the graph into a Keras model: string input in, sigmoid output out.
model = tf.keras.Model(
    inputs=[text_input],
    outputs=[l2],
)

In [20]:
# Binary classification setup: Adam optimizer, binary cross-entropy loss,
# and accuracy reported during training.
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

In [21]:
# Print the layer-by-layer summary (output shapes, parameter counts, connections).
model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 text (InputLayer)              [(None,)]            0           []                               
                                                                                                  
 keras_layer_2 (KerasLayer)     {'input_mask': (Non  0           ['text[0][0]']                   
                                e, 128),                                                          
                                 'input_word_ids':                                                
                                (None, 128),                                                      
                                 'input_type_ids':                                                
                                (None, 128)}                                                  

In [23]:
# Train on the training split for 3 epochs in mini-batches of 32.
# The returned History object is the cell's output.
model.fit(
    x=X_train,
    y=y_train,
    epochs=3,
    batch_size=32,
)

Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x1d2ce16b2e0>

In [24]:
# Score the held-out messages. The model ends in a single sigmoid unit, so
# each row of the result holds one value between 0 and 1.
y_predicted = model.predict(x=X_test)
y_predicted



array([[0.00787523],
       [0.11363889],
       [0.0120696 ],
       ...,
       [0.14111687],
       [0.1730822 ],
       [0.05307255]], dtype=float32)

In [25]:
# Collapse the column of predictions into a 1-D array (one score per test
# message). Note that this rebinds `y_predicted` from the previous cell.
y_predicted = y_predicted.flatten()
y_predicted

array([0.00787523, 0.11363889, 0.0120696 , ..., 0.14111687, 0.1730822 ,
       0.05307255], dtype=float32)

In [27]:
# MasoudKaviani.ir
# dataset: https://drive.google.com/file/d/1-1hCBHrF1mUvtk7sMlOvzgSgh3T52IMp/view