In [1]:
# GET the data
## Memuat Data
import pandas as pd

dataset = pd.read_csv("Capres2014-1.1.csv", usecols=["Isi_Tweet", "Sentimen"])

In [2]:
# EXPLORE the data
## Menampilkan lima data terakhir
dataset.tail()

Unnamed: 0,Isi_Tweet,Sentimen
1880,Jangan kabur dari tanggung jawab dengan kemasa...,1
1881,@echo_hadiwibowo mana berani pmrntah..m G da y...,1
1882,@IndonesiaCapres ANAK MEDAN DUKUNG CAPRES JK D...,1
1883,"RT @idoidonajib: Jelek! ""@fallenokta: Apa komp...",1
1884,"Langsung deh ngadu ke capres Hatta Rajasa, bia...",1


In [3]:
## Mengecek Imbalanced Data
dataset['Sentimen'].value_counts()

 1    1117
-1     768
Name: Sentimen, dtype: int64

In [4]:
## Mengganti {-1,1} menjadi {0,1}
dataset['Sentimen'] = dataset['Sentimen'].replace(-1,0)

In [5]:
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory

#Setting indonesian stopwords
stop_words = set(stopwords.words('indonesian'))

#Stemming indonesian words
stemmer_factory = StemmerFactory()
stemmer = stemmer_factory.create_stemmer()

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\DELL\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [6]:
# MODEL the data
## Pra Pengolahan - Cleaning
import numpy as np
import re
import string

def clean_text(tweet):
    
    # Convert to lower case
    tweet = tweet.lower()
    # remove unicode characters
    tweet = tweet.encode('ascii', 'ignore').decode()
    # Clean www.* or https?://*
    tweet = re.sub('((www\.[^\s]+)|(https?://[^\s]+))','',tweet)
    # Clean @username
    tweet = re.sub('@[^\s]+','',tweet)
    #Remove additional white spaces
    tweet = re.sub('[\s]+', ' ', tweet)
    #Replace #word with word
    tweet = re.sub(r'#([^\s]+)', r'\1', tweet)
    #Remove punctuations
    tweet = tweet.translate(str.maketrans('', '', string.punctuation))
    #Remove common Indonesian stop words and stemming Indonesian words
    tweet_tokens = tweet.split()
    #Removing Stop Words
    filtered_words = [word for word in tweet_tokens if word not in stop_words]
    #Stemming Word
    stemmed_words = [stemmer.stem(word) for word in filtered_words]
    tweet = ' '.join(stemmed_words)
    #trim
    tweet = tweet.strip('\'"')
    
    return tweet

dataset["Isi_Tweet"] = dataset['Isi_Tweet'].map(lambda x: clean_text(x))
dataset = dataset[dataset['Isi_Tweet'].apply(lambda x: len(x.split()) >=1)]
dataset.shape

(1885, 2)

In [32]:
print(dataset)


(1885, 2)


In [8]:
## Pra Pengolahan - Splitting
from sklearn.model_selection import train_test_split

train_data, test_data, train_labels, test_labels = train_test_split(
    dataset['Isi_Tweet'], dataset['Sentimen'], test_size=0.2, random_state=42)

In [9]:
!pip install transformers



In [34]:
!pip show transformers

Name: transformers
Version: 4.26.1
Summary: State-of-the-art Machine Learning for JAX, PyTorch and TensorFlow
Home-page: https://github.com/huggingface/transformers
Author: The Hugging Face team (past and future) with the help of all our contributors (https://github.com/huggingface/transformers/graphs/contributors)
Author-email: transformers@huggingface.co
License: Apache
Location: c:\users\dell\anaconda3\lib\site-packages
Requires: filelock, huggingface-hub, numpy, packaging, pyyaml, regex, requests, tokenizers, tqdm
Required-by: 


In [65]:
from tensorflow import keras
from transformers import AutoTokenizer, TFAutoModel
import IPython

bert_tokenizer = AutoTokenizer.from_pretrained("indobenchmark/indobert-base-p2")
def tokenisasi(teks):
    encode_dict = bert_tokenizer(teks,
                                   add_special_tokens = True,
                                   max_length = 128, 
                                   padding = 'max_length',
                                   truncation = True,
                                   return_attention_mask = True,
                                   return_tensors = 'tf',)

    tokenID = encode_dict['input_ids']
    attention_mask = encode_dict['attention_mask']
    return tokenID, attention_mask

def create_input(data):
    tokenID, input_mask = [], []
    for teks in data:
        token, mask = tokenisasi(teks)
        tokenID.append(token)
        input_mask.append(mask)
    
    return [np.asarray(tokenID, dtype=np.int32).reshape(-1, 128), 
            np.asarray(input_mask, dtype=np.int32).reshape(-1, 128)]

bert_model = TFAutoModel.from_pretrained("indobenchmark/indobert-base-p2", trainable=False)

def bert(hp):
    
    #Input layer
    input_token = keras.layers.Input(shape=(128,), dtype=np.int32,
                                        name="input_token")
    input_mask = keras.layers.Input(shape=(128,), dtype=np.int32,
                                   name="input_mask")

    #Embedding
    bert_embedding = bert_model([input_token, input_mask])[0]
    
    
    # Attention mechanism
    num_heads = hp.Int('num_heads', min_value=2, max_value=8, step=2)
    attention = keras.layers.MultiHeadAttention(num_heads=num_heads, key_dim=128)(bert_embedding, bert_embedding, bert_embedding)
    add_attention = keras.layers.Add()([bert_embedding, attention])
    layer_norm1 = keras.layers.LayerNormalization(epsilon=1e-6)(add_attention)
    
    #Dropout Layer
    dropout_rate = 0.2
    dropout_layer = keras.layers.Dropout(dropout_rate)(layer_norm1)

    #Output layer
    output = keras.layers.Dense(1, activation='sigmoid',
                                kernel_regularizer=keras.regularizers.l2(hp.Choice('kernel_dense', values = [0.01, 0.001])))(dropout_layer)
    
    
    #Adjust Learning Rates
    learning_rate = 1e-3
    lr_schedule = keras.optimizers.schedules.ExponentialDecay(
        learning_rate,
        decay_steps=1000,
        decay_rate=0.95,
        staircase=True
    )
    
    #Model Compiler
    model = keras.models.Model(inputs=[input_token, input_mask], outputs=output)

    model.compile(optimizer = keras.optimizers.Adam(lr_schedule),
                  loss ='binary_crossentropy',
                  metrics=['accuracy'])
   
    return model

class ClearTrainingOutput(keras.callbacks.Callback):
    def on_train_end(*args, **kwargs):
        IPython.display.clear_output(wait = True)

early_stop = keras.callbacks.EarlyStopping(monitor='val_loss', patience=5)

Some layers from the model checkpoint at indobenchmark/indobert-base-p2 were not used when initializing TFBertModel: ['nsp___cls', 'mlm___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertModel were initialized from the model checkpoint at indobenchmark/indobert-base-p2.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.


In [11]:
!pip install keras-tuner



In [None]:
from keras_tuner.tuners import BayesianOptimization

bert_train_data = create_input(train_data)
bert_test_data = create_input(test_data)

tuner = BayesianOptimization(bert,
                             objective = 'val_accuracy', 
                             max_trials = 10,
                             directory = '/content/Hasil',
                             project_name = 'Sentiment-BERT',
                             overwrite = True)

tuner.search(bert_train_data, train_labels,
             batch_size=256, epochs = 50,
             validation_data=(bert_test_data, test_labels),
             callbacks=[early_stop, ClearTrainingOutput()])

# Mendapatkan model terbaik
model = tuner.get_best_models()[0]

Trial 3 Complete [01h 38m 39s]
val_accuracy: 0.8201259970664978

Best val_accuracy So Far: 0.8433355689048767
Total elapsed time: 04h 14m 50s

Search: Running Trial #4

Value             |Best Value So Far |Hyperparameter
6                 |8                 |num_heads
0.01              |0.001             |kernel_dense

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50

In [None]:
## Evaluasi Model

test_loss, test_acc = model.evaluate(bert_test_data, test_labels)
print('Test accuracy:', test_acc)

In [None]:
## Penyimpanan dan Memuat Kembali Model
model.save('Data/model_mlp_sentiment.h5')

model = keras.models.load_model('Data/model_mlp_sentiment.h5')