In [40]:
import pandas as pd
import tensorflow as tf
from sklearn.metrics import f1_score
from sklearn.metrics import classification_report

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Dense, LSTM, Bidirectional

In [41]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [42]:
cd drive/MyDrive/SIAP/dataset/clean

[Errno 2] No such file or directory: 'drive/MyDrive/SIAP/dataset/clean'
/content/drive/MyDrive/SIAP/dataset/clean


In [43]:
df_train = pd.read_csv('train_clean_removed_emoticons.csv')
df_validation = pd.read_csv('validation_clean_removed_emoticons.csv')

X_train = df_train['Review Text']
y_train = df_train['Rating']
X_validation = df_validation['Review Text']
y_validation = df_validation['Rating']

In [44]:
len(X_train)

18113

In [45]:
#Hyper parameters


vocab_size = 10000
embedding_dim = 16
max_length = 100
trunc_type='post'
padding_type='post'
oov_tok = "<OOV>"
training_size = 18113


In [46]:
# convert df to list

sentences_training = df_train['Review Text'].astype(str).str.lower().values.tolist()
labels_training = df_train['Rating'].values.tolist()

sentences_validation = df_validation['Review Text'].astype(str).str.lower().values.tolist()
labels_validation = df_validation['Rating'].values.tolist()


In [47]:
training_sentences = sentences_training
validation_sentences = sentences_validation
training_labels = labels_training
validation_labels = labels_validation


In [48]:
tokenizer = Tokenizer(num_words=vocab_size, oov_token=oov_tok)
tokenizer.fit_on_texts(training_sentences)

word_index = tokenizer.word_index

training_sequences = tokenizer.texts_to_sequences(training_sentences)
training_padded = pad_sequences(training_sequences, maxlen=max_length, padding=padding_type, truncating=trunc_type)

validation_sequences = tokenizer.texts_to_sequences(validation_sentences)
validation_padded = pad_sequences(validation_sequences, maxlen=max_length, padding=padding_type, truncating=trunc_type)

In [49]:
# Need this block to get it to work with TensorFlow 2.x
import numpy as np
training_padded = np.array(training_padded)
training_labels = np.array(training_labels)
validation_padded = np.array(validation_padded)
validation_labels = np.array(validation_labels)

In [50]:
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(vocab_size, embedding_dim, input_length=max_length),
    #tf.keras.layers.GlobalAveragePooling1D(),
    tf.keras.layers.Bidirectional(LSTM(16, return_sequences=True)),
    tf.keras.layers.Bidirectional(LSTM(16)),
    tf.keras.layers.Dropout(0.2),
    tf.keras.layers.Dense(24, activation='relu'),
    tf.keras.layers.Dropout(0.2),
    tf.keras.layers.Dense(1, activation='sigmoid')
])

model.compile(loss='binary_crossentropy',optimizer='adam',metrics=['accuracy'])

print(model.summary())

Model: "sequential_3"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_3 (Embedding)     (None, 100, 16)           160000    
                                                                 
 bidirectional_6 (Bidirectio  (None, 100, 32)          4224      
 nal)                                                            
                                                                 
 bidirectional_7 (Bidirectio  (None, 32)               6272      
 nal)                                                            
                                                                 
 dropout_6 (Dropout)         (None, 32)                0         
                                                                 
 dense_6 (Dense)             (None, 24)                792       
                                                                 
 dropout_7 (Dropout)         (None, 24)               

In [51]:
num_epochs = 30
history = model.fit(training_padded, training_labels, epochs=num_epochs, validation_data=(validation_padded, validation_labels), verbose=2)

Epoch 1/30
567/567 - 55s - loss: -4.7652e+02 - accuracy: 0.0362 - val_loss: -1.2852e+03 - val_accuracy: 0.0363 - 55s/epoch - 97ms/step
Epoch 2/30
567/567 - 46s - loss: -2.7832e+03 - accuracy: 0.0363 - val_loss: -4.6568e+03 - val_accuracy: 0.0363 - 46s/epoch - 82ms/step
Epoch 3/30
567/567 - 46s - loss: -7.2929e+03 - accuracy: 0.0363 - val_loss: -1.0228e+04 - val_accuracy: 0.0363 - 46s/epoch - 82ms/step
Epoch 4/30


KeyboardInterrupt: ignored