In [73]:
import pandas as pd
import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_text as text  # imported so the TF Text ops used by the BERT preprocessing model are registered
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split

In [27]:
# Tab-separated ham/spam message dataset, fetched straight from GitHub.
SPAM_DATASET_URL = 'https://raw.githubusercontent.com/diazoniclabs/Machine-Learning-using-sklearn/master/Datasets/spam.tsv'
df = pd.read_csv(SPAM_DATASET_URL, delimiter='\t')
df

Unnamed: 0,label,message,length,punct
0,ham,"Go until jurong point, crazy.. Available only ...",111,9
1,ham,Ok lar... Joking wif u oni...,29,6
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,155,6
3,ham,U dun say so early hor... U c already then say...,49,6
4,ham,"Nah I don't think he goes to usf, he lives aro...",61,2
...,...,...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...,160,8
5568,ham,Will ü b going to esplanade fr home?,36,1
5569,ham,"Pity, * was in mood for that. So...any other s...",57,7
5570,ham,The guy did some bitching but I acted like i'd...,125,1


In [32]:
# Class distribution of the raw data: ham rows heavily outnumber spam rows.
df['label'].value_counts()

ham     4825
spam     747
Name: label, dtype: int64

In [46]:
# Keep only the rows whose label is 'spam'.
is_spam = df['label'] == 'spam'
df_spam = df.loc[is_spam]

In [47]:
# Keep only the rows whose label is 'ham'.
is_ham = df['label'] == 'ham'
df_ham = df.loc[is_ham]

In [48]:
# Downsample ham to the spam row count so both classes end up the same size.
# A fixed random_state makes the sampled subset (and every result derived
# from it) reproducible across kernel restarts.
df_ham = df_ham.sample(n=df_spam.shape[0], random_state=42)

In [50]:
# Stack the spam rows on top of the ham rows to form the balanced dataset.
balanced_parts = [df_spam, df_ham]
df_balanced = pd.concat(balanced_parts)

In [51]:
# Sanity check: both classes should now have the same number of rows.
df_balanced['label'].value_counts()

spam    747
ham     747
Name: label, dtype: int64

In [53]:
# Binary target column: 1 where the label is 'spam', 0 otherwise.
# Computed with a vectorized comparison rather than a per-row Python
# callback; the resulting column is int64 either way.
df_balanced['spam'] = (df_balanced['label'] == 'spam').astype('int64')

In [54]:
# Inspect the balanced frame, including the numeric 'spam' target column.
df_balanced

Unnamed: 0,label,message,length,punct,spam
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,155,6,1
5,spam,FreeMsg Hey there darling it's been 3 week's n...,147,8,1
8,spam,WINNER!! As a valued network customer you have...,157,6,1
9,spam,Had your mobile 11 months or more? U R entitle...,154,2,1
11,spam,"SIX chances to win CASH! From 100 to 20,000 po...",136,8,1
...,...,...,...,...,...
3625,ham,No message..no responce..what happend?,38,5,0
3602,ham,"Jay told me already, will do",28,1,0
1693,ham,Weightloss! No more girl friends. Make loads o...,99,4,0
582,ham,Ok anyway no need to change with what you said,46,0,0


In [57]:
# Split messages and targets into train and test sets. Stratifying on the
# target keeps the spam/ham ratio the same in both splits, and random_state
# pins the shuffle so the split is reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    df_balanced['message'],
    df_balanced['spam'],
    stratify=df_balanced['spam'],
    random_state=42,
)

In [58]:
# Peek at the first few training messages.
X_train.head()

5041    Natalie (20/F) is inviting you to be her frien...
1146    Thank you, winner notified by sms. Good Luck! ...
4679    It is a good thing I'm now getting the connect...
3620    8007 25p 4 Alfie Moon's Children in Need song ...
223                                Sorry, I'll call later
Name: message, dtype: object

In [59]:
# Model handles for the uncased English BERT encoder (L-12, H-768, A-12)
# and the text preprocessing model that goes with it.
BERT_ENCODER_HANDLE = 'https://www.kaggle.com/models/tensorflow/bert/frameworks/TensorFlow2/variations/bert-en-uncased-l-12-h-768-a-12/versions/2'
BERT_PREPROCESS_HANDLE = 'https://kaggle.com/models/tensorflow/bert/frameworks/TensorFlow2/variations/en-uncased-preprocess/versions/3'

bert_encoder = hub.KerasLayer(BERT_ENCODER_HANDLE)
bert_preprocess = hub.KerasLayer(BERT_PREPROCESS_HANDLE)

In [60]:
def get_sentence_embeding(sentence):
  """Return the BERT pooled-output embedding for a batch of sentences.

  Args:
    sentence: the raw text batch handed to ``bert_preprocess`` (for example
      a list of strings).

  Returns:
    The ``'pooled_output'`` entry of the encoder result, one row per input
    sentence.
  """
  encoder_outputs = bert_encoder(bert_preprocess(sentence))
  return encoder_outputs['pooled_output']

# Smoke test: embed three example sentences in a single batch.
demo_sentences = [
    "Bumper Sale! get 100% Off on clothes",
    'What is Going On?',
    'You are doing Great',
]
get_sentence_embeding(demo_sentences)

<tf.Tensor: shape=(3, 768), dtype=float32, numpy=
array([[ 0.13529672, -0.15651326,  0.97483724, ..., -0.25793895,
         0.05686133,  0.98966426],
       [ 0.33626324,  0.25967667, -0.19960463, ..., -0.99718386,
         0.25375572,  0.9982445 ],
       [ 0.32748172, -0.23830533,  0.7944568 , ..., -0.9047256 ,
         0.5093959 ,  0.9707309 ]], dtype=float32)>

In [61]:
# Embed a few fruit names followed by a few person names so their vectors
# can be compared with each other afterwards.
fruit_names = ['apple', 'peach', 'banana', 'grapes', 'mango']
person_names = ['Sunder Pichai', 'Elon Musk', 'Dwayne Jhonson', 'CarryMinati']
e = get_sentence_embeding(fruit_names + person_names)

In [62]:
# Show the embedding matrix: one 768-dimensional row per input phrase.
e

<tf.Tensor: shape=(9, 768), dtype=float32, numpy=
array([[ 0.25694773, -0.02905397,  0.7929298 , ..., -0.20954858,
         0.33580613,  0.99979347],
       [ 0.08301385, -0.15217376,  0.8889554 , ...,  0.21246079,
         0.29683742,  0.99936295],
       [ 0.26732698, -0.19118258,  0.08356223, ..., -0.61833537,
         0.00336395,  0.9997838 ],
       ...,
       [-0.11955273, -0.5513144 ,  0.9718947 , ..., -0.881357  ,
         0.66398317,  0.97707385],
       [-0.12595616, -0.47890082,  0.9668572 , ..., -0.76727194,
         0.35574046,  0.99006367],
       [ 0.15493621, -0.12604938,  0.6508674 , ..., -0.9761009 ,
         0.15501283,  0.9994375 ]], dtype=float32)>

In [71]:
# Cosine similarity between the embeddings at index 7 ('Dwayne Jhonson')
# and index 5 ('Sunder Pichai') of the phrase list embedded into `e`.
# Each vector is wrapped in a list so it is passed as a one-row 2-D input.
cosine_similarity([e[7]], [e[5]])

array([[0.83597165]], dtype=float32)

In [76]:
# Bert Layers
text_input = tf.keras.layers.Input(shape=(), dtype=tf.string, name='text')
preprocessed_text = bert_preprocess(text_input)
outputs = bert_encoder(preprocessed_text)

# Neural Network Layers
l = tf.keras.layers.Dropout(0.1, name='dropout')(outputs['pooled_output'])
print(l)
l = tf.keras.layers.Dense(1, activation='sigmoid', name='output')(l)
print(l)

KerasTensor(type_spec=TensorSpec(shape=(None, 768), dtype=tf.float32, name=None), name='dropout/Identity:0', description="created by layer 'dropout'")
KerasTensor(type_spec=TensorSpec(shape=(None, 1), dtype=tf.float32, name=None), name='output/Sigmoid:0', description="created by layer 'output'")


In [77]:
# Assemble the end-to-end model: raw text in, sigmoid score out.
model = tf.keras.Model(
    inputs=[text_input],
    outputs=[l],
)

In [78]:
# Print the layer-by-layer structure of the assembled model.
model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 text (InputLayer)           [(None,)]                    0         []                            
                                                                                                  
 keras_layer_4 (KerasLayer)  {'input_type_ids': (None,    0         ['text[0][0]']                
                             128),                                                                
                              'input_mask': (None, 128)                                           
                             , 'input_word_ids': (None,                                           
                              128)}                                                               
                                                                                              

In [80]:
# Metrics tracked during training: accuracy plus precision and recall.
accuracy_metric = tf.keras.metrics.BinaryAccuracy(name='accuracy')
precision_metric = tf.keras.metrics.Precision(name='precision')
recall_metric = tf.keras.metrics.Recall(name='recall')
METRICS = [accuracy_metric, precision_metric, recall_metric]

# Binary cross-entropy matches the single sigmoid output unit.
model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=METRICS,
)

In [None]:
# Train on the training split for 10 epochs.
model.fit(x=X_train, y=y_train, epochs=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10

In [17]:
# Handles for the BERT text preprocessing model and the BERT encoder model.
preprocess_url = 'https://kaggle.com/models/tensorflow/bert/frameworks/TensorFlow2/variations/en-uncased-preprocess/versions/3'
encoder_url = 'https://www.kaggle.com/models/tensorflow/bert/frameworks/TensorFlow2/variations/bert-en-uncased-l-12-h-768-a-12/versions/2'

In [9]:
# NOTE: despite its name, `bert_model` holds the *preprocessing* layer
# (it is loaded from preprocess_url), not the encoder.
bert_model = hub.KerasLayer(preprocess_url)

In [None]:
# Run two example sentences through the preprocessing layer and show the
# result. The list is named `sample_sentences` rather than `text` so that
# the `tensorflow_text as text` module alias from the import cell is not
# overwritten by a plain Python list.
sample_sentences = ['What is Going On?','You are doing Great']
text_preprocess = bert_model(sample_sentences)
text_preprocess

In [20]:
# Load the encoder layer and feed it the preprocessed example sentences.
enc_bert_model = hub.KerasLayer(encoder_url)
# `result` is a dict of encoder outputs (inspected in the following cells).
result = enc_bert_model(text_preprocess)

In [21]:
# List which outputs the encoder returned.
result.keys()

dict_keys(['encoder_outputs', 'sequence_output', 'default', 'pooled_output'])

In [None]:
# Inspect the 'sequence_output' entry of the encoder result.
result['sequence_output']