In [None]:
!pip install tensorflow-text

In [None]:
import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_text as text

In [None]:
import pandas as pd

df = pd.read_csv("/content/main.csv")

In [None]:
df.head(5)

Unnamed: 0,label,title
0,non-clickbait,"Masuk Radar Pilwalkot Medan, Menantu Jokowi Be..."
1,non-clickbait,Malaysia Sudutkan RI: Isu Kabut Asap hingga In...
2,clickbait,Viral! Driver Ojol di Bekasi Antar Pesanan Mak...
3,non-clickbait,"Kemensos Salurkan Rp 7,3 M bagi Korban Kerusuh..."
4,non-clickbait,"Terkait Mayat Bayi Mengenaskan di Tangerang, S..."


In [None]:
df.groupby("label").describe()

Unnamed: 0_level_0,title,title,title,title
Unnamed: 0_level_1,count,unique,top,freq
label,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
clickbait,6290,6285,"Clustering, Jurus BRI Berdayakan Nasabah Ultra...",2
non-clickbait,8710,8693,"Jadwal MU vs Leicester City di Liga Inggris, S...",2


In [None]:
df_cb = df[df["label"] == "clickbait"]
df_cb.shape

(6290, 2)

In [None]:
df_ncb = df[df["label"] == "non-clickbait"]
df_ncb.shape

(8710, 2)

In [None]:
df_ncb_downsampled = df_ncb.sample(df_cb.shape[0])
df_ncb_downsampled.shape

(6290, 2)

In [None]:
df_balanced = pd.concat([df_cb, df_ncb_downsampled])

In [None]:
df_balanced["label"].value_counts()

non-clickbait    6290
clickbait        6290
Name: label, dtype: int64

In [None]:
df_balanced["clickbait"] = df_balanced["label"].apply(lambda x:1 if x=="clickbait" else 0)

In [None]:
df_balanced.head()

Unnamed: 0,label,title,clickbait
2,clickbait,Viral! Driver Ojol di Bekasi Antar Pesanan Mak...,1
11,clickbait,"Ada Motor Nyangkut di Atas Bambu di Sleman, Ko...",1
13,clickbait,Pesan Gamblang Poyuono Menolak Revisi UU KPK,1
37,clickbait,Detik-detik Lima Kendaraan Alami Kecelakaan Be...,1
38,clickbait,Kocak! Maling di Rumah Mewah Jakut Terekam CCT...,1


In [None]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(df_balanced['title'],df_balanced['clickbait'], stratify=df_balanced['clickbait'])

In [None]:
X_train.head(4)

10176    Ribuan Rumah Bakal Tergusur Proyek Double Trac...
9528     Menjajal Jaringan Ngebut Smartfren di Kawasan ...
7264     JK : Sound System Masjid kadang Mengganggu, Ak...
13962    Laudya Cynthia Bella Mewek Tanpa Cincin Kawin ...
Name: title, dtype: object

In [None]:
bert_preprocess = hub.KerasLayer("https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3")
bert_encoder = hub.KerasLayer("https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/4")

In [None]:
def get_sentence_embeding(sentences):
    preprocessed_text = bert_preprocess(sentences)
    return bert_encoder(preprocessed_text)['pooled_output']

get_sentence_embeding([
    "jokowi presiden kita", 
    "apa kabar kasus ahok?"]
)

<tf.Tensor: shape=(2, 768), dtype=float32, numpy=
array([[-0.84585255, -0.30646417, -0.5881597 , ..., -0.35923603,
        -0.5986587 ,  0.8222759 ],
       [-0.85944283, -0.502674  , -0.9666674 , ..., -0.91528577,
        -0.71611214,  0.8117752 ]], dtype=float32)>

In [None]:
coba = get_sentence_embeding(["jokowi", "prabowo","indonesia", 
                              "medan", "madura", "pisang"])

In [None]:
from sklearn.metrics.pairwise import cosine_similarity
cosine_similarity([coba[0]],[coba[1]])

array([[0.981042]], dtype=float32)

In [None]:
# Bert layers
text_input = tf.keras.layers.Input(shape=(), dtype=tf.string, name='text')
preprocessed_text = bert_preprocess(text_input)
outputs = bert_encoder(preprocessed_text)

# Neural network layers
l = tf.keras.layers.Dropout(0.1, name="dropout")(outputs['pooled_output'])
l = tf.keras.layers.Dense(1, activation='sigmoid', name="output")(l)

# Use inputs and outputs to construct a final model
model = tf.keras.Model(inputs=[text_input], outputs = [l])

In [None]:
model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 text (InputLayer)              [(None,)]            0           []                               
                                                                                                  
 keras_layer (KerasLayer)       {'input_type_ids':   0           ['text[0][0]']                   
                                (None, 128),                                                      
                                 'input_mask': (Non                                               
                                e, 128),                                                          
                                 'input_word_ids':                                                
                                (None, 128)}                                                  

In [None]:
len(X_train)

9435

In [None]:
METRICS = [
      tf.keras.metrics.BinaryAccuracy(name='accuracy'),
      tf.keras.metrics.Precision(name='precision'),
      tf.keras.metrics.Recall(name='recall')
]

model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=METRICS)

In [None]:
model.fit(X_train, y_train, epochs=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7f8eed5abe90>

In [None]:
model.evaluate(X_test, y_test)



[0.6149373054504395,
 0.6696343421936035,
 0.7401078939437866,
 0.5232040882110596]