In [28]:
import pandas as pd
import nltk
import re

from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory 

from sklearn.model_selection import train_test_split

import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import torch

from transformers import  BertTokenizer, TFBertForSequenceClassification, TFAutoModel, BertConfig, AutoModel, AutoTokenizer, TFBertModel,  TFAutoModelForSequenceClassification

In [2]:
df = pd.read_csv(r'dataset\balance.csv')
df.head()

Unnamed: 0,Rating,Description
0,5,Klenteng Ban Hin Kiong merupakan Klenteng tert...
1,5,Airnya sejuk. Tempatnya bebas plastik. Bagi ya...
2,5,Tiap minggu pasti kesini buat foto2 doangüòÅ kar...
3,5,"Pernah kesana pergi liat bunker jepang, naik p..."
4,5,"Mengikuti Talkshow "" Menyingkap Pesona Wastra ..."


In [3]:
# ubah rating
mapping = {1:0,2:0,3:1,4:2,5:2}

df['Rating'] = df['Rating'].replace(mapping)
df['Rating'].unique()

array([2, 1, 0], dtype=int64)

In [4]:
# preprocess text

factory = StemmerFactory()
stemmer = factory.create_stemmer()

stopword_factory = StopWordRemoverFactory()
stopwords = stopword_factory.get_stop_words()

def clean_text(text):
    text = text.dropna()

def preprocess_text(text):
    # pastikan dalam bentuk string
    text = str(text)

    # ubah menjadi lower case
    text = text.lower()

    # Menghapus handles
    text = re.sub(r'\@\w+|\#','', text)

    # menghapus angka dari text
    text = re.sub(r"\d+", "", text)

    # Menghapus tanda baca
    text = re.sub(r'[^\w\s]', '', text)

    # menghapus spasi awal dan akhir kalimat
    words = text.split()

    # menghapus stopwords dan stemming
    processed_word = []
    for word in words:
        if word not in stopwords:
            stemmed_word = stemmer.stem(word)
            processed_word.append(stemmed_word)

    return ' '.join(processed_word)



In [5]:
# df['preprocessed_text'] = df['Description'].apply(preprocess_text)
# df.head()

In [6]:
# df.to_csv("dataset\preprocess.csv", index=False)
df = pd.read_csv(r'dataset\preprocess.csv')
df.head()

Unnamed: 0,Rating,Description,preprocessed_text
0,2,Klenteng Ban Hin Kiong merupakan Klenteng tert...,klenteng ban hin kiong rupa klenteng tua manad...
1,2,Airnya sejuk. Tempatnya bebas plastik. Bagi ya...,air sejuk tempat bebas plastik kesana pergi ke...
2,2,Tiap minggu pasti kesini buat foto2 doangüòÅ kar...,tiap minggu kesini buat foto doang emang bagus...
3,2,"Pernah kesana pergi liat bunker jepang, naik p...",pernah kesana pergi liat bunker jepang naik pe...
4,2,"Mengikuti Talkshow "" Menyingkap Pesona Wastra ...",ikut talkshow singkap pesona wastra indonesia ...


In [7]:
df = df.dropna()
df.isna().sum()

Rating               0
Description          0
preprocessed_text    0
dtype: int64

Tokenisasi and padding

In [8]:
df = df[['Rating', 'preprocessed_text']]
df.rename(columns={"Rating" : "label", "preprocessed_text" : "text"}, inplace=True)
df.head()

Unnamed: 0,label,text
0,2,klenteng ban hin kiong rupa klenteng tua manad...
1,2,air sejuk tempat bebas plastik kesana pergi ke...
2,2,tiap minggu kesini buat foto doang emang bagus...
3,2,pernah kesana pergi liat bunker jepang naik pe...
4,2,ikut talkshow singkap pesona wastra indonesia ...


In [9]:
X = df['text']
y = df['label']

X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.15, random_state=42)

print(f"Train size {X_train.shape}")
print(f"Test size {X_test.shape}")

Train size (634,)
Test size (112,)


tokenizer manual vs bert

In [10]:
# membuat tokenizer
# tokenizer = Tokenizer(num_words=10000)
# tokenizer.fit_on_texts(X_train)
# X_train_sequence = tokenizer.texts_to_sequences(X_train)
# X_test_sequence = tokenizer.texts_to_sequences(X_test)

In [29]:
config = BertConfig.from_pretrained("indolem/indobert-base-uncased", output_attentions=True, from_pt = True)
tokenizer = BertTokenizer.from_pretrained("indolem/indobert-base-uncased", from_pt = True)
model = TFBertModel.from_pretrained("indolem/indobert-base-uncased", from_pt=True)

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertModel: ['cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.bias', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing TFBertModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFBertModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predict

In [30]:
# Tokenisasi Data
train_encodings = tokenizer(X_train.tolist(), truncation=True, padding=True)
test_encodings = tokenizer(X_test.tolist(), truncation=True, padding=True)

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


In [31]:
# # membuat padding
# max_length = max([len(seq) for seq in X_train_sequence])
# X_train_padded = pad_sequences(X_train_sequence, maxlen=max_length, padding='post', truncating='post')
# X_test_padded = pad_sequences(X_test_sequence, maxlen=max_length, padding='post', truncating='post')


# print(f"X_train_padded shape : {X_train_padded.shape}")
# print(f"X_test_padded shape : {X_test_padded.shape}")

In [42]:
# Mempersiapkan Data untuk Model
batch_size = 2
train_dataset = tf.data.Dataset.from_tensor_slices((
    dict(train_encodings),
    y_train.tolist()
))
test_dataset = tf.data.Dataset.from_tensor_slices((
    dict(test_encodings),
    y_test.tolist()
))

train_dataset = train_dataset.shuffle(len(X_train)).batch(batch_size).prefetch(tf.data.experimental.AUTOTUNE)
test_dataset = test_dataset.batch(batch_size).prefetch(tf.data.experimental.AUTOTUNE)

Model dengan embedding layer

In [43]:
# model = tf.keras.Sequential([
#     tf.keras.layers.Embedding(input_dim=10000, output_dim=32, input_length=max_length),
#     tf.keras.layers.Conv1D(128, 5, activation='relu'),
#     tf.keras.layers.MaxPooling1D(5),
#     tf.keras.layers.Conv1D(64, 5, activation='relu'),
#     tf.keras.layers.GlobalMaxPooling1D(),
#     tf.keras.layers.Dense(64, activation='relu'),
#     tf.keras.layers.Dropout(0.5),
#     tf.keras.layers.Dense(3, activation='softmax')
# ])

In [44]:
# model = tf.keras.Sequential([
#     tf.keras.layers.Embedding(input_dim=10000, output_dim=16, input_length=max_length),
#     tf.keras.layers.GlobalAveragePooling1D(),
#     tf.keras.layers.Dense(16, activation='relu'),
#     tf.keras.layers.Dropout(0.5),
#     tf.keras.layers.Dense(3, activation='softmax')
# ])

compile model

In [45]:
# Membangun dan Melatih Model
optimizer = tf.keras.optimizers.Adam(learning_rate=2e-5)
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
model.compile(optimizer=optimizer, loss=loss, metrics=['accuracy'])

train model

In [46]:
model.fit(train_dataset,
          epochs=3,
          validation_data=test_dataset)

Epoch 1/3


The parameters `output_attentions`, `output_hidden_states` and `use_cache` cannot be updated when calling a model.They have to be set to True/False in the config object (i.e.: `config=XConfig.from_pretrained('name', output_attentions=True)`).
The parameter `return_dict` cannot be set in graph mode and will always be set to `True`.


StagingError: in user code:

    File "c:\ProgramData\anaconda3\envs\capstone\lib\site-packages\keras\engine\training.py", line 1160, in train_function  *
        return step_function(self, iterator)
    File "c:\ProgramData\anaconda3\envs\capstone\lib\site-packages\keras\engine\training.py", line 1146, in step_function  **
        outputs = model.distribute_strategy.run(run_step, args=(data,))
    File "c:\ProgramData\anaconda3\envs\capstone\lib\site-packages\keras\engine\training.py", line 1135, in run_step  **
        outputs = model.train_step(data)
    File "c:\ProgramData\anaconda3\envs\capstone\lib\site-packages\keras\engine\training.py", line 993, in train_step
        y_pred = self(x, training=True)
    File "c:\ProgramData\anaconda3\envs\capstone\lib\site-packages\keras\utils\traceback_utils.py", line 70, in error_handler
        raise e.with_traceback(filtered_tb) from None

    StagingError: Exception encountered when calling layer "tf_bert_model_2" "                 f"(type TFBertModel).
    
    in user code:
    
        File "c:\ProgramData\anaconda3\envs\capstone\lib\site-packages\transformers\models\bert\modeling_tf_bert.py", line 859, in call  *
            outputs = self.bert(
        File "c:\ProgramData\anaconda3\envs\capstone\lib\site-packages\keras\utils\traceback_utils.py", line 70, in error_handler  **
            raise e.with_traceback(filtered_tb) from None
    
        StagingError: Exception encountered when calling layer "bert" "                 f"(type TFBertMainLayer).
        
        in user code:
        
            File "c:\ProgramData\anaconda3\envs\capstone\lib\site-packages\transformers\models\bert\modeling_tf_bert.py", line 622, in call  *
                embedding_output = self.embeddings(
            File "c:\ProgramData\anaconda3\envs\capstone\lib\site-packages\keras\utils\traceback_utils.py", line 70, in error_handler  **
                raise e.with_traceback(filtered_tb) from None
        
            StagingError: Exception encountered when calling layer "embeddings" "                 f"(type TFBertEmbeddings).
            
            in user code:
            
                File "c:\ProgramData\anaconda3\envs\capstone\lib\site-packages\transformers\models\bert\modeling_tf_bert.py", line 191, in call  *
                    return self._embedding(input_ids, position_ids, token_type_ids, inputs_embeds, training=training)
                File "c:\ProgramData\anaconda3\envs\capstone\lib\site-packages\transformers\models\bert\modeling_tf_bert.py", line 206, in _embedding  *
                    seq_length = input_shape[1]
            
                IndexError: list index out of range
            
            
            Call arguments received by layer "embeddings" "                 f"(type TFBertEmbeddings):
              ‚Ä¢ input_ids=tf.Tensor(shape=(361,), dtype=int32)
              ‚Ä¢ position_ids=None
              ‚Ä¢ token_type_ids=tf.Tensor(shape=(361,), dtype=int32)
              ‚Ä¢ inputs_embeds=None
              ‚Ä¢ mode=embedding
              ‚Ä¢ training=True
        
        
        Call arguments received by layer "bert" "                 f"(type TFBertMainLayer):
          ‚Ä¢ input_ids=tf.Tensor(shape=(361,), dtype=int32)
          ‚Ä¢ attention_mask=tf.Tensor(shape=(361,), dtype=int32)
          ‚Ä¢ token_type_ids=tf.Tensor(shape=(361,), dtype=int32)
          ‚Ä¢ position_ids=None
          ‚Ä¢ head_mask=None
          ‚Ä¢ inputs_embeds=None
          ‚Ä¢ output_attentions=False
          ‚Ä¢ output_hidden_states=False
          ‚Ä¢ return_dict=True
          ‚Ä¢ training=True
          ‚Ä¢ kwargs=<class 'inspect._empty'>
    
    
    Call arguments received by layer "tf_bert_model_2" "                 f"(type TFBertModel):
      ‚Ä¢ input_ids={'input_ids': 'tf.Tensor(shape=(361,), dtype=int32)', 'token_type_ids': 'tf.Tensor(shape=(361,), dtype=int32)', 'attention_mask': 'tf.Tensor(shape=(361,), dtype=int32)'}
      ‚Ä¢ attention_mask=None
      ‚Ä¢ token_type_ids=None
      ‚Ä¢ position_ids=None
      ‚Ä¢ head_mask=None
      ‚Ä¢ inputs_embeds=None
      ‚Ä¢ output_attentions=None
      ‚Ä¢ output_hidden_states=None
      ‚Ä¢ return_dict=None
      ‚Ä¢ training=True
      ‚Ä¢ kwargs=<class 'inspect._empty'>


In [None]:
# Evaluasi model pada test set
# loss, accuracy = model.evaluate(X_test_padded, y_test)
# print(f"Test Accuracy: {accuracy:.4f}")