# Transformer & Bert

In [None]:
import numpy as np
import pandas as pd
from tqdm import tqdm

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [None]:
df = pd.read_csv("/content/clickbait.csv")
df.head()

Unnamed: 0,headline,clickbait
0,Should I Get Bings,1
1,Which TV Female Friend Group Do You Belong In,1
2,"The New ""Star Wars: The Force Awakens"" Trailer...",1
3,"This Vine Of New York On ""Celebrity Big Brothe...",1
4,A Couple Did A Stunning Photo Shoot With Their...,1


In [None]:
df.shape

(32000, 2)

In [None]:
df['clickbait'].value_counts()

0    16001
1    15999
Name: clickbait, dtype: int64

In [None]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32000 entries, 0 to 31999
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   headline   32000 non-null  object
 1   clickbait  32000 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 500.1+ KB


Memeriksa jumlah maksimal kata yang dapat hadir dalam sebuah headline, ini akan membantu kita dalam proses padding nantinya.

In [None]:
df['headline'].apply(lambda x:len(str(x).split())).max()

26

## **DATA PREPARATION**

In [None]:
from keras.preprocessing import sequence, text

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
X_train, X_temp, y_train, y_temp = train_test_split(df.headline, df.clickbait,
                                                    stratify=df.clickbait.values,
                                                    random_state = 42,
                                                    test_size = 0.2,
                                                    shuffle=True)

X_test, X_val, y_test, y_val = train_test_split(X_temp, y_temp,
                                                stratify=y_temp.values,
                                                random_state = 42,
                                                test_size = 0.5,
                                                shuffle=True)

# Menampilkan shape dari setiap subset
print("Shape X_train:", X_train.shape)
print("Shape y_train:", y_train.shape)
print("Shape X_val:", X_val.shape)
print("Shape y_val:", y_val.shape)
print("Shape X_test:", X_test.shape)
print("Shape y_test:", y_test.shape)

Shape X_train: (25600,)
Shape y_train: (25600,)
Shape X_val: (3200,)
Shape y_val: (3200,)
Shape X_test: (3200,)
Shape y_test: (3200,)


**Tokenization** is the process of converting text into a sequence of numbers or tokens. In the context of its use in an RNN model, each word in a sentence is represented as a one-hot vector with a dimension equal to the number of words in the vocabulary + 1. The Keras Tokenizer is employed to create a dictionary of unique words in the corpus, sorting them based on their frequency of occurrence, and then assigning a numerical index to each word. This process results in numerical representations for each word in the sentence.

**Padding** is the process of adding zero values to a sequence of words so that all sequences have the same length. This is useful because RNN models require inputs with uniform length. Padding is performed to make the lengths of all sequences the same, regardless of the original length of the sentences.

In [None]:
# Using Keras Tokenizer.
token = text.Tokenizer(num_words=None)
max_len = 30

# Fit the tokenizer on the training and validation data.
token.fit_on_texts(list(X_train) + list(X_val))
X_train_seq = token.texts_to_sequences(X_train)
X_valid_seq = token.texts_to_sequences(X_val)
X_test_seq = token.texts_to_sequences(X_test)

# Zero pad the sequences to ensure uniform length.
X_train_pad = sequence.pad_sequences(X_train_seq, maxlen=max_len, padding='pre')
X_valid_pad = sequence.pad_sequences(X_valid_seq, maxlen=max_len, padding='pre')
X_test_pad = sequence.pad_sequences(X_test_seq, maxlen=max_len, padding='pre')

# Obtain a dictionary mapping unique words in the dataset to numerical indices.
word_index = token.word_index

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import SimpleRNN, Embedding, Dense, Activation, Dropout, BatchNormalization

## **Simple RNN**

In [None]:
RNN_model = Sequential()
RNN_model.add(Embedding(len(word_index) + 1,
                        300,
                        input_length=max_len))
RNN_model.add(SimpleRNN(50))
RNN_model.add(Dense(1, activation='sigmoid'))
RNN_model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

RNN_model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 30, 300)           6903900   
                                                                 
 simple_rnn (SimpleRNN)      (None, 50)                17550     
                                                                 
 dense (Dense)               (None, 1)                 51        
                                                                 
Total params: 6921501 (26.40 MB)
Trainable params: 6921501 (26.40 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


The Sequential model is defined as a sequence of layers to be used in the model. The first layer is the Embedding layer, which converts the one-hot vector representation of words into a 300-dimensional embedding vector.

In [None]:
RNN_model.fit(X_train_pad, y_train, epochs=5, batch_size=64, validation_data=(X_valid_pad, y_val))

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.src.callbacks.History at 0x7defb20d4f10>

**Model Comment**

This model may be experiencing overfitting as evidenced by a training accuracy of 1 while the validation accuracy does not improve, and the validation loss continues to rise while the training loss keeps decreasing.

To address this, adjusting hyperparameters, especially the number of neurons in the SimpleRNN layer, may be beneficial. The model might be too complex for this dataset. Additionally, introducing batch normalization and fine-tuning dropout could enhance model performance.

In [None]:
y_pred = RNN_model.predict(X_test_pad)



In [None]:
threshold = 0.5
y_pred_binary = (y_pred > threshold).astype(int)
y_pred_binary = y_pred_binary.flatten()
y_pred_binary

array([1, 0, 0, ..., 0, 0, 0])

In [None]:
y_test_binary = y_test.values
y_test_binary

array([1, 0, 0, ..., 0, 0, 0])

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix

In [None]:
# Calculating metrics
accuracy = accuracy_score(y_test_binary, y_pred_binary)
precision = precision_score(y_test_binary, y_pred_binary)
recall = recall_score(y_test_binary, y_pred_binary)
f1 = f1_score(y_test_binary, y_pred_binary)
roc_auc = roc_auc_score(y_test_binary, y_pred)

In [None]:
# Calculating confusion matrix
conf_matrix = confusion_matrix(y_test_binary, y_pred_binary)

# Displaying results
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)
print("ROC AUC Score:", roc_auc)
print("\nConfusion Matrix:\n", conf_matrix)

Accuracy: 0.9721875
Precision: 0.9713038053649408
Recall: 0.973125
F1 Score: 0.9722135497970652
ROC AUC Score: 0.996615234375

Confusion Matrix:
 [[1554   46]
 [  43 1557]]


## **LSTM's**

Because tokenization and padding have already been performed on the text, there is no need to do it again for LSTM.

In [None]:
from keras.layers import LSTM

In [None]:
embedding_dim = 300
embedding_matrix = np.random.random((len(word_index) + 1, embedding_dim))

In [None]:
embedding_matrix.shape

(23013, 300)

In [None]:
LSTM_model = Sequential()
LSTM_model.add(Embedding(len(word_index) + 1,
                         300,
                         weights=[embedding_matrix],
                         input_length=max_len,
                         trainable=False))
LSTM_model.add(LSTM(50, dropout=0.3, recurrent_dropout=0.3))
LSTM_model.add(Dense(1, activation='sigmoid'))
LSTM_model.compile(loss='binary_crossentropy', optimizer='adam',metrics=['accuracy'])

LSTM_model.summary()



Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_2 (Embedding)     (None, 30, 300)           6903900   
                                                                 
 lstm (LSTM)                 (None, 50)                70200     
                                                                 
 dense_1 (Dense)             (None, 1)                 51        
                                                                 
Total params: 6974151 (26.60 MB)
Trainable params: 70251 (274.42 KB)
Non-trainable params: 6903900 (26.34 MB)
_________________________________________________________________


Setting trainable=False on the embedding layer means that the weights of that embedding layer will not be updated or adjusted during the model training process. This is done to prevent overfitting and reduce computational load.

In [None]:
LSTM_model.fit(X_train_pad, y_train, epochs=5, batch_size=64, validation_data=(X_valid_pad, y_val))

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.src.callbacks.History at 0x7def38b76920>

**Model Comment**

The model performs well as it consistently improves on the validation data. By the 5th epoch, there is a notable decrease in loss and a continuous increase in accuracy, indicating that the model has room for further development. This can be achieved by adding more epochs, increasing dropout, and adjusting hyperparameters, such as the number of units/neurons in the LSTM layer.

In [None]:
y_pred = LSTM_model.predict(X_test_pad)



In [None]:
threshold = 0.5
y_pred_binary = (y_pred > threshold).astype(int)
y_pred_binary = y_pred_binary.flatten()
y_pred_binary

array([1, 0, 0, ..., 0, 0, 0])

In [None]:
y_test_binary = y_test.values
y_test_binary

array([1, 0, 0, ..., 0, 0, 0])

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix

In [None]:
# Calculate metrics
accuracy = accuracy_score(y_test_binary, y_pred_binary)
precision = precision_score(y_test_binary, y_pred_binary)
recall = recall_score(y_test_binary, y_pred_binary)
f1 = f1_score(y_test_binary, y_pred_binary)
roc_auc = roc_auc_score(y_test_binary, y_pred)

In [None]:
# Calculating confusion matrix
conf_matrix = confusion_matrix(y_test_binary, y_pred_binary)

# Display results
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)
print("ROC AUC Score:", roc_auc)
print("\nConfusion Matrix:\n", conf_matrix)

Accuracy: 0.9159375
Precision: 0.9530292716133424
Recall: 0.875
F1 Score: 0.9123492994460737
ROC AUC Score: 0.975738671875

Confusion Matrix:
 [[1531   69]
 [ 200 1400]]


## **GRU's**

In [None]:
from keras.layers import SpatialDropout1D, GRU

In [None]:
GRU_model = Sequential()
GRU_model.add(Embedding(len(word_index) + 1,
                        300,
                        weights=[embedding_matrix],
                        input_length=max_len,
                        trainable=False))
GRU_model.add(SpatialDropout1D(0.3))
GRU_model.add(GRU(300))
GRU_model.add(Dense(1, activation='sigmoid'))

GRU_model.compile(loss='binary_crossentropy', optimizer='adam',metrics=['accuracy'])

GRU_model.summary()

Model: "sequential_4"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_4 (Embedding)     (None, 30, 300)           6903900   
                                                                 
 spatial_dropout1d_1 (Spati  (None, 30, 300)           0         
 alDropout1D)                                                    
                                                                 
 gru (GRU)                   (None, 300)               541800    
                                                                 
 dense_2 (Dense)             (None, 1)                 301       
                                                                 
Total params: 7446001 (28.40 MB)
Trainable params: 542101 (2.07 MB)
Non-trainable params: 6903900 (26.34 MB)
_________________________________________________________________


In [None]:
GRU_model.fit(X_train_pad, y_train, epochs=5, batch_size=64, validation_data=(X_valid_pad, y_val))

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.src.callbacks.History at 0x7def63975150>

**Model Comment**

The model exhibits good performance as it consistently improves on the validation data. By the 5th epoch, there is a noticeable decrease in loss and a continuous increase in accuracy, suggesting that the model can be further developed. This can be achieved by adding more epochs, increasing dropout, and adjusting hyperparameters, such as the number of units/neurons in the GRU layer.

In [None]:
y_pred = GRU_model.predict(X_test_pad)



In [None]:
threshold = 0.5
y_pred_binary = (y_pred > threshold).astype(int)
y_pred_binary = y_pred_binary.flatten()
y_pred_binary

array([1, 0, 0, ..., 0, 0, 0])

In [None]:
y_test_binary = y_test.values
y_test_binary

array([1, 0, 0, ..., 0, 0, 0])

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix

In [None]:
# Calculate metrics
accuracy = accuracy_score(y_test_binary, y_pred_binary)
precision = precision_score(y_test_binary, y_pred_binary)
recall = recall_score(y_test_binary, y_pred_binary)
f1 = f1_score(y_test_binary, y_pred_binary)
roc_auc = roc_auc_score(y_test_binary, y_pred)

In [None]:
# Calculating confusion matrix
conf_matrix = confusion_matrix(y_test_binary, y_pred_binary)

# Display results
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)
print("ROC AUC Score:", roc_auc)
print("\nConfusion Matrix:\n", conf_matrix)

Accuracy: 0.9425
Precision: 0.9481012658227848
Recall: 0.93625
F1 Score: 0.9421383647798742
ROC AUC Score: 0.986199609375

Confusion Matrix:
 [[1518   82]
 [ 102 1498]]


## **Bi-Directional LSTM**

In [None]:
from tensorflow.keras.layers import Bidirectional

In [None]:
BDRNN_model = Sequential()
BDRNN_model.add(Embedding(len(word_index) + 1,
                          300,
                          weights=[embedding_matrix],
                          input_length=max_len,
                          trainable=False))
BDRNN_model.add(Bidirectional(LSTM(300, dropout=0.3, recurrent_dropout=0.3)))

BDRNN_model.add(Dense(1,activation='sigmoid'))
BDRNN_model.compile(loss='binary_crossentropy', optimizer='adam',metrics=['accuracy'])


BDRNN_model.summary()



Model: "sequential_5"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_5 (Embedding)     (None, 30, 300)           6903900   
                                                                 
 bidirectional (Bidirection  (None, 600)               1442400   
 al)                                                             
                                                                 
 dense_3 (Dense)             (None, 1)                 601       
                                                                 
Total params: 8346901 (31.84 MB)
Trainable params: 1443001 (5.50 MB)
Non-trainable params: 6903900 (26.34 MB)
_________________________________________________________________


Regular dropout operates on connections between units within a specific timestep, while recurrent dropout operates on recurrent connections between timesteps. Combining both can help reduce overfitting and improve the generalization of recurrent models.

In [None]:
BDRNN_model.fit(X_train_pad, y_train, epochs=5, batch_size=64, validation_data=(X_valid_pad, y_val))

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.src.callbacks.History at 0x7def3dfe9b10>

In [None]:
y_pred = GRU_model.predict(X_test_pad)



In [None]:
threshold = 0.5
y_pred_binary = (y_pred > threshold).astype(int)
y_pred_binary = y_pred_binary.flatten()
y_pred_binary

array([1, 0, 0, ..., 0, 0, 0])

In [None]:
y_test_binary = y_test.values
y_test_binary

array([1, 0, 0, ..., 0, 0, 0])

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix

In [None]:
# Calculate metrics
accuracy = accuracy_score(y_test_binary, y_pred_binary)
precision = precision_score(y_test_binary, y_pred_binary)
recall = recall_score(y_test_binary, y_pred_binary)
f1 = f1_score(y_test_binary, y_pred_binary)
roc_auc = roc_auc_score(y_test_binary, y_pred)

In [None]:
# Calculating confusion matrix
conf_matrix = confusion_matrix(y_test_binary, y_pred_binary)

# Display results
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)
print("ROC AUC Score:", roc_auc)
print("\nConfusion Matrix:\n", conf_matrix)

Accuracy: 0.9425
Precision: 0.9481012658227848
Recall: 0.93625
F1 Score: 0.9421383647798742
ROC AUC Score: 0.986199609375

Confusion Matrix:
 [[1518   82]
 [ 102 1498]]


## **Transformer & BERT**

In [None]:
import os
import tensorflow as tf
from tensorflow.keras.layers import Dense, Input
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import ModelCheckpoint
import transformers

from tokenizers import BertWordPieceTokenizer

In [None]:
def fast_encode(texts, tokenizer, chunk_size=256, maxlen=512):
    """
    Encoder for encoding the text into sequence of integers for BERT Input
    """
    tokenizer.enable_truncation(max_length=maxlen)
    tokenizer.enable_padding(length=maxlen)
    all_ids = []

    for i in tqdm(range(0, len(texts), chunk_size)):
        text_chunk = texts[i:i+chunk_size].tolist()
        encs = tokenizer.encode_batch(text_chunk)
        all_ids.extend([enc.ids for enc in encs])

    return np.array(all_ids)

In [None]:
X_train, X_temp, y_train, y_temp = train_test_split(df.headline, df.clickbait,
                                                    stratify=df.clickbait.values,
                                                    random_state = 42,
                                                    test_size = 0.2,
                                                    shuffle=True)

X_test, X_val, y_test, y_val = train_test_split(X_temp, y_temp,
                                                stratify=y_temp.values,
                                                random_state = 42,
                                                test_size = 0.5,
                                                shuffle=True)

print("Shape X_train:", X_train.shape)
print("Shape y_train:", y_train.shape)
print("Shape X_val:", X_val.shape)
print("Shape y_val:", y_val.shape)
print("Shape X_test:", X_test.shape)
print("Shape y_test:", y_test.shape)

Shape X_train: (25600,)
Shape y_train: (25600,)
Shape X_val: (3200,)
Shape y_val: (3200,)
Shape X_test: (3200,)
Shape y_test: (3200,)


**Tokenization**

In [None]:
# First load the real tokenizer
tokenizer = transformers.DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
# Save the loaded tokenizer locally
tokenizer.save_pretrained('.')
# Reload it with the huggingface tokenizers library
fast_tokenizer = BertWordPieceTokenizer('vocab.txt', lowercase=False)
fast_tokenizer

Tokenizer(vocabulary_size=30522, model=BertWordPiece, unk_token=[UNK], sep_token=[SEP], cls_token=[CLS], pad_token=[PAD], mask_token=[MASK], clean_text=True, handle_chinese_chars=True, strip_accents=None, lowercase=False, wordpieces_prefix=##)

In [None]:
#IMP DATA FOR CONFIG
AUTO = tf.data.experimental.AUTOTUNE


# Configuration
EPOCHS = 3
BATCH_SIZE = 64
MAX_LEN = 30

In [None]:
# Using fast_encode on text data
X_train_encoded = fast_encode(X_train.astype(str), fast_tokenizer, maxlen=MAX_LEN)
X_val_encoded = fast_encode(X_val.astype(str), fast_tokenizer, maxlen=MAX_LEN)
X_test_encoded = fast_encode(X_test.astype(str), fast_tokenizer, maxlen=MAX_LEN)

y_train = y_train.values
y_test = y_test.values
y_val = y_val.values

100%|██████████| 100/100 [00:00<00:00, 102.23it/s]
100%|██████████| 13/13 [00:00<00:00, 101.90it/s]
100%|██████████| 13/13 [00:00<00:00, 97.89it/s]


In [None]:
train_dataset = (
    tf.data.Dataset
    .from_tensor_slices((X_train_encoded, y_train))
    .repeat()
    .shuffle(2048)
    .batch(BATCH_SIZE)
    .prefetch(AUTO)
)

valid_dataset = (
    tf.data.Dataset
    .from_tensor_slices((X_val_encoded, y_val))
    .batch(BATCH_SIZE)
    .cache()
    .prefetch(AUTO)
)

test_dataset = (
    tf.data.Dataset
    .from_tensor_slices(X_test_encoded)
    .batch(BATCH_SIZE)
)

In [None]:
def build_model(transformer, max_len=512):
    """
    function for training the BERT model
    """
    input_word_ids = Input(shape=(max_len,), dtype=tf.int32, name="input_word_ids")
    sequence_output = transformer(input_word_ids)[0]
    cls_token = sequence_output[:, 0, :]
    out = Dense(1, activation='sigmoid')(cls_token)

    model = Model(inputs=input_word_ids, outputs=out)
    model.compile(Adam(lr=1e-5), loss='binary_crossentropy', metrics=['accuracy'])

    return model

In [None]:
transformer_layer = (
    transformers.TFDistilBertModel.from_pretrained('distilbert-base-uncased')
)
model = build_model(transformer_layer, max_len=MAX_LEN)
model.summary()

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFDistilBertModel: ['vocab_transform.weight', 'vocab_layer_norm.bias', 'vocab_layer_norm.weight', 'vocab_projector.bias', 'vocab_transform.bias']
- This IS expected if you are initializing TFDistilBertModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFDistilBertModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFDistilBertModel for predictions without further training.


Model: "model_4"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_word_ids (InputLayer  [(None, 30)]              0         
 )                                                               
                                                                 
 tf_distil_bert_model_4 (TF  TFBaseModelOutput(last_   66362880  
 DistilBertModel)            hidden_state=(None, 30,             
                              768),                              
                              hidden_states=None, at             
                             tentions=None)                      
                                                                 
 tf.__operators__.getitem_4  (None, 768)               0         
  (SlicingOpLambda)                                              
                                                                 
 dense_8 (Dense)             (None, 1)                 769 

In [None]:
n_steps = X_train_encoded.shape[0] // BATCH_SIZE
train_history = model.fit(
    train_dataset,
    steps_per_epoch=n_steps,
    validation_data=valid_dataset,
    epochs=EPOCHS
)

Epoch 1/3
Epoch 2/3
Epoch 3/3


In [None]:
n_steps = X_val_encoded.shape[0] // BATCH_SIZE
train_history_2 = model.fit(
    valid_dataset.repeat(),
    steps_per_epoch=n_steps,
    epochs=EPOCHS*2
)

Epoch 1/6
Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6


In [None]:
y_pred = model.predict(test_dataset, verbose=1)




In [None]:
y_pred.flatten()

array([0.45395052, 0.45395052, 0.45395052, ..., 0.45395052, 0.45395052,
       0.45395052], dtype=float32)

In [None]:
# Getting unique values
unique_values = np.unique(y_pred)

print("Unique Values in the Array:", unique_values)

Nilai Unik dalam Array: [0.45395052]
