In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os 
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers, regularizers, optimizers
from keras.models import Sequential
from keras.layers import Activation, Dense, Flatten, Dropout, LSTM, Softmax, Bidirectional, LayerNormalization, BatchNormalization
from keras.callbacks import EarlyStopping
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score

print("GPUs Available: ", tf.config.list_physical_devices('GPU'))

# Load Data

In [None]:
train_path = "data/word_embed/train"
valid_path = "data/word_embed/valid"
test_path = "data/word_embed/test"


def load_pickled_frames(directory, columns):
    """Read every pickle file in ``directory`` into a single DataFrame.

    Parameters
    ----------
    directory : str
        Folder whose entries are all passed to ``pd.read_pickle``.
    columns : list of str
        Column names used for the empty frame returned when the folder
        contains no files.

    Returns
    -------
    pd.DataFrame
        All loaded frames stacked row-wise with a fresh 0..n-1 index.
    """
    # NOTE(review): pd.read_pickle can execute arbitrary code while loading;
    # only point these paths at files you trust.
    # Sorting the listing makes the row order reproducible across runs
    # (os.listdir returns entries in arbitrary order).
    frames = [
        pd.read_pickle(os.path.join(directory, name))
        for name in sorted(os.listdir(directory))
    ]
    if not frames:
        return pd.DataFrame(columns=columns)
    # One concat instead of DataFrame.append inside the loop: append was
    # removed in pandas 2.0 and re-copied the growing frame on every file.
    return pd.concat(frames, ignore_index=True)


# load train data
print("loading train data...")
train_df = load_pickled_frames(train_path, ['text', 'stars', 'word_embed'])

# load valid data
print("loading valid data...")
valid_df = load_pickled_frames(valid_path, ['text', 'stars', 'word_embed'])

# load test data
print("loading test data...")
test_df = load_pickled_frames(test_path, ['text', 'word_embed'])

# convert to numpy array
# Labels are shifted by -1 before one-hot encoding so that star values
# map onto class indices 0..4 (num_classes=5).
train_X = np.array(train_df['word_embed'].tolist())
train_y = np.array(train_df['stars'].tolist())
train_y = tf.keras.utils.to_categorical(train_y - 1, num_classes=5)

valid_X = np.array(valid_df['word_embed'].tolist())
valid_y = np.array(valid_df['stars'].tolist())
valid_y = tf.keras.utils.to_categorical(valid_y - 1, num_classes=5)

test_X = np.array(test_df['word_embed'].tolist())

In [None]:
# Report the dimensions of every feature/label array, one
# "<name>.shape=(...)" line per array.
for array_name, array_value in (
    ("train_X", train_X),
    ("train_y", train_y),
    ("valid_X", valid_X),
    ("valid_y", valid_y),
    ("test_X", test_X),
):
    print(f"{array_name}.shape={array_value.shape!r}")

In [None]:
# Overlayed histograms of the star labels in the train and validation splits.
# The one-hot labels are decoded with argmax and shifted by +1 so the x-axis
# shows star values rather than zero-based class indices.
num_star_classes = train_y.shape[1]
# Fixed edges at k + 0.5 give one bin per star value for both splits,
# independent of which values actually occur in each split.
star_bin_edges = np.arange(num_star_classes + 1) + 0.5

plt.hist(np.argmax(train_y, axis=1) + 1, bins=star_bin_edges, color='blue', alpha=0.5, label='train')
plt.hist(np.argmax(valid_y, axis=1) + 1, bins=star_bin_edges, color='red', alpha=0.5, label='valid')
plt.title('Distribution of Stars')
plt.xlabel('stars')
plt.ylabel('number of reviews')
plt.xticks(np.arange(1, num_star_classes + 1))

plt.legend(loc='upper left')
plt.show()

# Build Model

In [None]:
# Sequence classifier: bidirectional LSTM over the embedded sequence,
# layer-normalised, flattened, then a small dense head ending in a
# 5-way softmax (one unit per star class).
# All building blocks come from tf.keras so that the model, the optimizer
# and the callbacks belong to the same Keras implementation.
model = tf.keras.Sequential()
# NOTE(review): activation='relu' on the LSTM is kept from the original; per
# the Keras LSTM docs the fast cuDNN kernel requires the default 'tanh' —
# confirm that 'relu' is intentional.
model.add(layers.Bidirectional(layers.LSTM(128, return_sequences=True, activation='relu'), input_shape=(train_X.shape[1], train_X.shape[2])))
model.add(layers.LayerNormalization())
model.add(layers.Flatten())
model.add(layers.Dense(64, activation='relu'))
model.add(layers.LayerNormalization())
model.add(layers.Dense(5, activation='softmax'))

model.compile(loss='categorical_crossentropy',
              optimizer=optimizers.Adam(learning_rate=0.0005),
              metrics=['accuracy'])
# summary() prints the table itself and returns None, so it is not wrapped
# in print() (which would add a stray "None" line).
model.summary()

# Train Model

In [None]:
ACCURACY_THRESHOLD = 0.99
class accuryThreasholdCallback(tf.keras.callbacks.Callback):
    """Stop training once train and validation accuracy both exceed ACCURACY_THRESHOLD."""

    def on_epoch_end(self, epoch, logs=None):
        """Check the epoch's metrics and request a stop when both pass the threshold.

        Args:
            epoch: Index of the epoch that just finished (unused).
            logs: Metric dict for the epoch; the 'accuracy' and
                'val_accuracy' entries are read from it.
        """
        logs = logs or {}
        accuracy = logs.get('accuracy')
        val_accuracy = logs.get('val_accuracy')
        # Either metric can be absent (e.g. no validation data was given);
        # comparing None with a float would raise TypeError.
        if accuracy is None or val_accuracy is None:
            return
        if accuracy > ACCURACY_THRESHOLD and val_accuracy > ACCURACY_THRESHOLD:
            print("\nReached %2.2f%% accuracy, so stopping training!!" %(ACCURACY_THRESHOLD*100))
            self.model.stop_training = True

# Two stopping rules run side by side: the accuracy-threshold callback and
# early stopping on validation loss.
accuracy_threashold_monitor = accuryThreasholdCallback()
# tf.keras.callbacks is used so both callbacks come from the same Keras
# implementation. restore_best_weights=True makes early stopping roll the
# model back to the epoch with the lowest val_loss instead of keeping the
# weights from `patience` epochs later.
early_stopping_monitor = tf.keras.callbacks.EarlyStopping(
    monitor='val_loss',
    patience=3,
    restore_best_weights=True,
)

history = model.fit(
    train_X,
    train_y,
    epochs=15,
    batch_size=32,
    validation_data=(valid_X, valid_y),
    callbacks=[accuracy_threashold_monitor, early_stopping_monitor],
)

# Evaluate Model

In [None]:
# Side-by-side training curves: accuracy in the left panel, loss in the
# right panel, each with the train and validation series from `history`.
fig, ax = plt.subplots(1, 2, figsize=(12, 4))
for panel, metric in zip(ax, ('accuracy', 'loss')):
    panel.plot(history.history[metric])
    panel.plot(history.history['val_' + metric])
    panel.set_ylabel(metric)
    panel.set_xlabel('epoch')
    panel.legend(['train', 'val'], loc='upper right')
fig.suptitle('Model training history')
plt.show()

In [None]:
# Confusion matrix of the model on the validation split.
valid_pred_y = model.predict(valid_X)

# Decode one-hot labels / softmax outputs into zero-based class indices.
y_test_not_onehot = np.argmax(valid_y, axis=1)
y_pred_not_onehot = np.argmax(valid_pred_y, axis=1)
plt.figure(figsize=(5, 5))
# Passing the full label set keeps the matrix square even when some class
# is never predicted or never present.
class_ids = np.arange(valid_y.shape[1])
# confusion_matrix expects (y_true, y_pred): rows are ground truth and
# columns are predictions, which matches the axis labels set below.
matrix_confusion = confusion_matrix(y_test_not_onehot, y_pred_not_onehot, labels=class_ids)
sns.heatmap(matrix_confusion, square=True, annot=False, cmap='Blues', fmt='d',
            xticklabels=class_ids + 1, yticklabels=class_ids + 1)

# Annotate each cell as "count / number of true samples in that row".
row_totals = matrix_confusion.sum(axis=1)
for i in range(matrix_confusion.shape[0]):
    for j in range(matrix_confusion.shape[1]):
        plt.text(j+0.5, i+0.5, f'{matrix_confusion[i, j]}/{row_totals[i]}', 
                 horizontalalignment='center', verticalalignment='center', fontsize=7)

plt.xlabel('predictions')
plt.ylabel('ground truth')
plt.show()

In [None]:
# Per-class precision / recall / F1 on the validation split, computed from
# the class indices decoded out of the one-hot labels and model outputs.
valid_y_numerical, valid_pred_y_numerical = (
    np.argmax(scores, axis=1) for scores in (valid_y, valid_pred_y)
)

validation_report = classification_report(valid_y_numerical, valid_pred_y_numerical)
print(validation_report)

In [None]:
# Spot-check the model on one random training review and one random test
# review. The review text is read straight from train_df / test_df, whose
# rows are in the same order as train_X / test_X (both arrays were built
# from the frames' 'word_embed' column).

# training
print("@@@@@@ model prediction on training set @@@@@@")
index  = np.random.choice(train_X.shape[0])

print(f"text: {train_df['text'].iloc[index]}")

# +1 converts the zero-based class index back to a star value.
train_stars = np.argmax(train_y[index], axis=0) + 1
print(f"ground truth: {train_stars}")

# Slicing with index:index + 1 keeps the leading batch axis that predict() expects.
train_stars_pred = model.predict(train_X[index:index + 1], verbose=0)
train_stars_pred = np.argmax(train_stars_pred) + 1
print(f"model pred: {train_stars_pred}")

print("\n")
# testing
print("@@@@@@ model prediction on testing set @@@@@@")
index = np.random.choice(test_X.shape[0])

print(f"text: {test_df['text'].iloc[index]}")

test_stars_pred = model.predict(test_X[index:index + 1], verbose=0)
test_stars_pred = np.argmax(test_stars_pred) + 1
print(f"model pred: {test_stars_pred}")

In [None]:
# model.save(f"weights/model.h5")

# Prediction on Test Data

In [None]:
# Class-probability predictions for the validation and test features
# (validation first, then test), with the progress bar enabled.
valid_pred, test_pred = (
    model.predict(features, verbose=1) for features in (valid_X, test_X)
)

In [None]:
valid_pred_df = pd.DataFrame(valid_df)
valid_pred_df['stars'] = np.argmax(valid_pred, axis=1) + 1
print(valid_pred_df[["text", "stars"]].head())

test_pred_df = pd.DataFrame(test_df)
test_pred_df['stars'] = np.argmax(test_pred, axis=1) + 1
print(test_pred_df[["review_id", "text", "stars"]].head())

In [None]:
valid_pred_df[["text", "stars"]].to_csv("data/valid_pred.csv", index=False)
test_df[["review_id", "text", "stars"]].to_csv("data/pred.csv", index=False)