In [158]:
import pandas as pd
import numpy as np
import tensorflow as tf
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.optimizers import Adam

In [159]:
# Read the cleaned review table and shuffle all rows with a fixed seed so the
# row order is identical on every run.
# NOTE(review): the saved output of this cell is a pandas ParserError
# ("Calling read(nbytes) on source failed") -- confirm the CSV exists and is
# readable before relying on the cells that follow.
data = pd.read_csv('data/review_clean.csv').sample(frac=1, random_state=42)
data

ParserError: Error tokenizing data. C error: Calling read(nbytes) on source failed. Try engine='python'.

In [None]:
# Pull the review text (inputs) and the sentiment labels (targets) out of the
# frame as NumPy arrays.
x, y = (data[column].values for column in ('combined_review', 'sentiment_label'))

In [None]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible between runs.
review_train, review_test, label_train, label_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Preview the training reviews (the array repr abbreviates the middle entries).
review_train

array(['soul place staff',
       'everything amazing staff fab hotel room nice cosy bedding pillow fantastic',
       'small issue room way hot found difficult manoeuvre window get fresh air room eventually addressed pretty noisy window open price pay middle vibrant city minor issue discussed addressed satisfaction location hotel brilliant heart gothic quarter opposite barcelona cathedral make great option explore amazing city room comfortable reasonably spacious included nice toiletry requested twin bed travelling adult daughter problem comfortable breakfast really good lot choice delicious pastry doughnut',
       ...,
       'hall elevator area first impression need replacement stained worn food option expensive breakfast none nice room modern decor amenity',
       'bathroom little basic paper tissue minimum soap shampoo easy get metro station right front hotel friendly helpful staff',
       'one boat day city room comfortable shower good secure parking site nice'],
      dtype=object)

In [None]:
# Preview the held-out test reviews.
review_test

array(['small car park breakfast room size lobby',
       'wherever expected something importantly little thing never expected found delighted lastly staff amazing',
       'location great tower bridge area getting theatre etc difficult felt sorry cab driver journey horrendous fault hotel great location score determined planning stay room lovely exec double large bed great shower comfy spa better expected including steam room long pool treatment also available',
       ...,
       'bed soft course matter personal preference pity hotel description say staff speaks italian lucky receptionist perhaps colleague minor imperfection excellent location quiet street u bahn stop round corner caf shopping dining opportunity neighbourhood clean practically furnished room big enough lot storage space personal thing',
       'comfy bed nice breakfast great location close underground lovely staff smiley obliging',
       'nothing fabulous staff excellent room good location'],
      dtype=object)

In [None]:
# Preview the training labels while they are still plain sentiment strings.
label_train

array(['Very Positive', 'Very Positive', 'Positive', ..., 'Positive',
       'Positive', 'Positive'], dtype=object)

In [None]:
# Preview the test labels while they are still plain sentiment strings.
label_test

array(['Positive', 'Very Positive', 'Positive', ..., 'Positive',
       'Positive', 'Very Positive'], dtype=object)

In [None]:
# One-hot encode both splits against ONE shared, sorted list of sentiment
# classes. Encoding each split independently lets a class that is missing from
# one split silently change that split's column count/order, so the same column
# index would no longer mean the same class in train and test.
# (The previous `columns=` argument only applies to DataFrame input and had no
# effect on these 1-D label arrays, so it is dropped.)
sentiment_classes = pd.Categorical(np.concatenate([label_train, label_test])).categories

# A Categorical with explicit categories yields one column per category, in
# category order, even for categories that do not occur in the given split.
label_train = pd.get_dummies(pd.Categorical(label_train, categories=sentiment_classes))
label_test = pd.get_dummies(pd.Categorical(label_test, categories=sentiment_classes))


In [None]:
# Convert the one-hot encoded training labels to a NumPy array and show the
# resulting matrix (one row per review, one column per sentiment class).
label_train = np.array(label_train)
label_train

array([[0, 0, 0, 0, 1],
       [0, 0, 0, 0, 1],
       [0, 0, 1, 0, 0],
       ...,
       [0, 0, 1, 0, 0],
       [0, 0, 1, 0, 0],
       [0, 0, 1, 0, 0]], dtype=uint8)

In [None]:
# Convert the one-hot encoded test labels to a NumPy array and show the
# resulting matrix (one row per review, one column per sentiment class).
label_test = np.array(label_test)
label_test

array([[0, 0, 1, 0, 0],
       [0, 0, 0, 0, 1],
       [0, 0, 1, 0, 0],
       ...,
       [0, 0, 1, 0, 0],
       [0, 0, 1, 0, 0],
       [0, 0, 0, 0, 1]], dtype=uint8)

In [None]:
# Re-display the training reviews right before tokenization; the label
# encoding cells in between only rebind the label variables, not this array.
review_train

array(['soul place staff',
       'everything amazing staff fab hotel room nice cosy bedding pillow fantastic',
       'small issue room way hot found difficult manoeuvre window get fresh air room eventually addressed pretty noisy window open price pay middle vibrant city minor issue discussed addressed satisfaction location hotel brilliant heart gothic quarter opposite barcelona cathedral make great option explore amazing city room comfortable reasonably spacious included nice toiletry requested twin bed travelling adult daughter problem comfortable breakfast really good lot choice delicious pastry doughnut',
       ...,
       'hall elevator area first impression need replacement stained worn food option expensive breakfast none nice room modern decor amenity',
       'bathroom little basic paper tissue minimum soap shampoo easy get metro station right front hotel friendly helpful staff',
       'one boat day city room comfortable shower good secure parking site nice'],
      dtype=object)

In [None]:
# Re-display the test reviews right before tokenization; the label encoding
# cells in between only rebind the label variables, not this array.
review_test

array(['small car park breakfast room size lobby',
       'wherever expected something importantly little thing never expected found delighted lastly staff amazing',
       'location great tower bridge area getting theatre etc difficult felt sorry cab driver journey horrendous fault hotel great location score determined planning stay room lovely exec double large bed great shower comfy spa better expected including steam room long pool treatment also available',
       ...,
       'bed soft course matter personal preference pity hotel description say staff speaks italian lucky receptionist perhaps colleague minor imperfection excellent location quiet street u bahn stop round corner caf shopping dining opportunity neighbourhood clean practically furnished room big enough lot storage space personal thing',
       'comfy bed nice breakfast great location close underground lovely staff smiley obliging',
       'nothing fabulous staff excellent room good location'],
      dtype=object)

In [None]:
# Fit the vocabulary on the training reviews only, so no test text influences
# the word index. `num_words=10000` caps the vocabulary used when encoding and
# words outside it are mapped to the '<OOV>' token.
tokenizer = Tokenizer(oov_token='<OOV>', num_words=10000)
tokenizer.fit_on_texts(review_train)

# Encode both splits as integer sequences with the fitted tokenizer
# (train first, then test).
train_sequences, test_sequences = (
    tokenizer.texts_to_sequences(reviews)
    for reviews in (review_train, review_test)
)

In [None]:
# Bring every encoded review to exactly `maxlen` tokens: longer sequences are
# cut at the end, shorter ones are padded at the end.
maxlen = 100
pad_options = dict(maxlen=maxlen, padding='post', truncating='post')

# Both splits share the same options so their shapes match what the model
# expects.
train_padded = pad_sequences(train_sequences, **pad_options)
test_padded = pad_sequences(test_sequences, **pad_options)

In [None]:
# Sentiment classifier: embedding -> two stacked LSTMs -> L2-regularised dense
# head -> 5-unit softmax (one unit per one-hot label column).
# The bare name `regularizers` is never imported in this notebook, so the L2
# penalty is referenced through `tf.keras.regularizers` to avoid a NameError.
model = tf.keras.Sequential([
    # input_dim is kept equal to the tokenizer's num_words (10000).
    tf.keras.layers.Embedding(input_dim=10000, output_dim=32, input_length=maxlen),
    # The first LSTM returns the full sequence so the second LSTM can consume it.
    tf.keras.layers.LSTM(128, return_sequences=True),
    tf.keras.layers.LSTM(64),
    tf.keras.layers.Dropout(0.5),
    # NOTE(review): the second LSTM already emits one vector per sample, so this
    # Flatten looks like a no-op; kept to leave the layer stack unchanged.
    tf.keras.layers.Flatten(),

    tf.keras.layers.Dense(128, activation='relu',
                          kernel_regularizer=tf.keras.regularizers.l2(0.001)),
    tf.keras.layers.Dropout(0.5),
    tf.keras.layers.Dense(64, activation='relu',
                          kernel_regularizer=tf.keras.regularizers.l2(0.001)),
    tf.keras.layers.Dropout(0.5),
    tf.keras.layers.Dense(5, activation='softmax')
])

In [None]:
# Multi-class setup: one-hot targets -> categorical cross-entropy.
# `lr` is a deprecated alias that triggers a warning on this Keras version and
# is rejected by newer ones; `learning_rate` is the supported argument name.
# `Adam` is the optimizer class already imported at the top of the notebook.
model.compile(optimizer=Adam(learning_rate=0.001),
              loss='categorical_crossentropy',
              metrics=['accuracy'])

  super().__init__(name, **kwargs)


In [None]:
# Print the layer-by-layer summary of the compiled model.
# NOTE(review): the saved output of this cell lists Bidirectional layers, which
# the model cell in this notebook does not define -- the output appears to be
# from an earlier model version; re-run to refresh it.
model.summary()

Model: "sequential_7"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_7 (Embedding)     (None, 100, 32)           320000    
                                                                 
 bidirectional (Bidirectiona  (None, 100, 256)         164864    
 l)                                                              
                                                                 
 bidirectional_1 (Bidirectio  (None, 128)              164352    
 nal)                                                            
                                                                 
 dense_15 (Dense)            (None, 128)               16512     
                                                                 
 dropout_4 (Dropout)         (None, 128)               0         
                                                                 
 dense_16 (Dense)            (None, 64)               

In [None]:

class myCallback(tf.keras.callbacks.Callback):
    """Stop training as soon as the training accuracy reaches 90%."""

    def on_epoch_end(self, epoch, logs=None):
        """Check the epoch's training accuracy and request an early stop.

        Args:
            epoch: Index of the epoch that just finished (unused).
            logs: Metric dict passed by Keras; may be None or lack 'accuracy'.
        """
        # Guard against a missing dict / missing metric: comparing None with a
        # float would raise TypeError and abort training.
        accuracy = (logs or {}).get('accuracy')
        if accuracy is not None and accuracy >= 0.9:
            print("\nAkurasi telah mencapai >= 90%")
            self.model.stop_training = True


callbacks = myCallback()


In [None]:
# Train for up to 100 epochs in batches of 128; the held-out split is scored
# after every epoch and the accuracy callback may stop the run early.
fit_options = dict(
    epochs=100,
    batch_size=128,
    validation_data=(test_padded, label_test),
    callbacks=[callbacks],
)
model.fit(train_padded, label_train, **fit_options)

Epoch 1/100
 126/3221 [>.............................] - ETA: 45:10 - loss: 1.1308 - accuracy: 0.4837

KeyboardInterrupt: 