# Resturant Review Sentiment - LSTM and Neural Network Ensemble
### Matthew Newton
* To better capture the meaning in the text data, a Long Short-Term Memory (LSTM) network can be used and can be more efficient to train than transformers.
* They can also be trained alongside another neural network that captures the categorical and numerical features, creating an ensemble model.
* A pre-trained embedding layer using GloVe is defined followed by an LSTM layer, the output of this layer is concatenated with the categorical and numerical features and parsed through a fully-connected layer, a softmax function is used to create the output layer of the network.

In [1]:
import pandas as pd
import pickle
import numpy as np

import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Input, Embedding, LSTM, Dense, Dropout, Concatenate
from tensorflow.keras import regularizers
from tensorflow.keras.models import Model

from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

In [2]:
df_review = pd.read_pickle("./cleaned_data/reviews_cleaned_nltk.pickle")

In [3]:
# Split training data into train data, cross validation and test data
df_review = df_review.dropna()
df_review = df_review[:2500]
df_review['review'] = df_review['title'] + " " + df_review['text']
features = ['review', 'type', 'priceInterval', 'date', 'review_length', 'rest_rating']
X_train, X_cv, y_train, y_cv = train_test_split(df_review[features], df_review['rating'], test_size = 0.30, random_state = 0)
X_cv, X_test, y_cv, y_test = train_test_split(X_cv[features], y_cv, test_size = 0.50, random_state = 0)

In [4]:
y_train = np.array(y_train) - 1
y_cv = np.array(y_cv) - 1
y_test = np.array(y_test) - 1

In [5]:
# Generic code to detect GPUs
gpus = tf.config.list_physical_devices('GPU')
if gpus:
    try:
        tf.config.set_visible_devices(gpus[0], 'GPU')
        for gpu in gpus:
            tf.config.experimental.set_memory_growth(gpu, True)
        print("GPU setup successful.")
    except RuntimeError as e:
        print(e)
else:
    print("No GPU detected.")

GPU setup successful.


In [7]:
# Preprocess text (review and title)
MAX_VOCAB_SIZE = 5000  # Limit the size of the vocabulary for efficiency
MAX_SEQUENCE_LENGTH = 200  # Limit review length

# Tokenize text
tokenizer = Tokenizer(num_words=MAX_VOCAB_SIZE)
tokenizer.fit_on_texts(X_train['review'])

train_sequences = tokenizer.texts_to_sequences(X_train['review'])
cv_sequences = tokenizer.texts_to_sequences(X_cv['review'])
test_sequences = tokenizer.texts_to_sequences(X_test['review'])

# Pad sequences to ensure consistent length
train_padded = pad_sequences(train_sequences, maxlen=MAX_SEQUENCE_LENGTH)
cv_padded = pad_sequences(cv_sequences, maxlen=MAX_SEQUENCE_LENGTH)
test_padded = pad_sequences(test_sequences, maxlen=MAX_SEQUENCE_LENGTH)

In [8]:
# One-hot encode categorical metadata features (type etc.)
ohe_type = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
restaurant_types_train = ohe_type.fit_transform(X_train[['type']])
restaurant_types_cv = ohe_type.transform(X_cv[['type']])
restaurant_types_test = ohe_type.transform(X_test[['type']])

ohe_price = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
restaurant_price_train = ohe_price.fit_transform(X_train[['priceInterval']])
restaurant_price_cv = ohe_price.transform(X_cv[['priceInterval']])
restaurant_price_test = ohe_price.transform(X_test[['priceInterval']])

# Normalize price feature
scaler_date = StandardScaler()
restaurant_date_train = scaler_date.fit_transform(X_train[['date']])
restaurant_date_cv = scaler_date.transform(X_cv[['date']])
restaurant_date_test = scaler_date.transform(X_test[['date']])

scaler_length = StandardScaler()
restaurant_length_train = scaler_length.fit_transform(X_train[['review_length']])
restaurant_length_cv = scaler_length.transform(X_cv[['review_length']])
restaurant_length_test = scaler_length.transform(X_test[['review_length']])

scaler_rating = StandardScaler()
restaurant_rating_train = scaler_rating.fit_transform(X_train[['rest_rating']])
restaurant_rating_cv = scaler_rating.transform(X_cv[['rest_rating']])
restaurant_rating_test = scaler_rating.transform(X_test[['rest_rating']])

# Concatenate metadata features
train_metadata = pd.concat([pd.DataFrame(restaurant_types_train),
                            pd.DataFrame(restaurant_price_train),
                            pd.DataFrame(restaurant_date_train),
                            pd.DataFrame(restaurant_length_train),
                            pd.DataFrame(restaurant_rating_train)
                           ], axis=1)
cv_metadata = pd.concat([pd.DataFrame(restaurant_types_cv),
                            pd.DataFrame(restaurant_price_cv),
                            pd.DataFrame(restaurant_date_cv),
                            pd.DataFrame(restaurant_length_cv),
                            pd.DataFrame(restaurant_rating_cv)
                           ], axis=1)
test_metadata = pd.concat([pd.DataFrame(restaurant_types_test),
                            pd.DataFrame(restaurant_price_test),
                            pd.DataFrame(restaurant_date_test),
                            pd.DataFrame(restaurant_length_test),
                            pd.DataFrame(restaurant_rating_test)
                           ], axis=1)

In [9]:
def load_glove_embeddings(glove_file_path, embedding_dim, word_index):
    embeddings_index = {}
    with open(glove_file_path, 'r', encoding='utf-8') as f:
        for line in f:
            values = line.split()
            word = values[0]
            coefs = np.asarray(values[1:], dtype='float32')
            embeddings_index[word] = coefs

    # Create the embedding matrix
    embedding_matrix = np.zeros((len(word_index) + 1, embedding_dim))
    for word, i in word_index.items():
        if i > len(word_index):
            continue
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector

    return embedding_matrix

In [10]:
word_index = tokenizer.word_index
glove_file_path = './glove/glove.6B.100d.txt'
embedding_dim = 100
embedding_matrix = load_glove_embeddings(glove_file_path, embedding_dim, word_index)

In [40]:
# Text input (LSTM branch)
TEXT_INPUT_DIM = len(word_index)+1 #MAX_VOCAB_SIZE
EMBEDDING_DIM = 100  # Embedding dimension 
LSTM_UNITS = 64  # Number of LSTM units
NUM_HIDDEN = 64

# Input for the review text
text_input = Input(shape=(MAX_SEQUENCE_LENGTH,))
# Embedding layer to learn word representations
embedding_layer = Embedding(input_dim=TEXT_INPUT_DIM,
                            output_dim=EMBEDDING_DIM,
                            weights=[embedding_matrix],
                            input_length=MAX_SEQUENCE_LENGTH,
                            trainable=False)(text_input)
# LSTM layer to process the text input
lstm_layer = LSTM(LSTM_UNITS, return_sequences=False)(embedding_layer)
lstm_layer = Dropout(0.5)(lstm_layer)

# Metadata input (Dense branch)
metadata_input = Input(shape=(train_metadata.shape[1],))
metadata_dense = Dense(NUM_HIDDEN, activation='leaky_relu', kernel_regularizer=regularizers.l2(0.1))(metadata_input) 
metadata_dense = Dropout(0.5)(metadata_dense)
metadata_dense = Dense(NUM_HIDDEN, activation='leaky_relu', kernel_regularizer=regularizers.l2(0.1))(metadata_dense) 
metadata_dense = Dropout(0.5)(metadata_dense)

# Concatenate the LSTM output and metadata dense layers
concatenated = Concatenate()([lstm_layer, metadata_dense])

# Add fully connected layers
dense_layer = Dense(NUM_HIDDEN, activation='leaky_relu', kernel_regularizer=regularizers.l2(0.1))(concatenated)
dense_layer = Dropout(0.5)(dense_layer)
dense_layer = Dense(NUM_HIDDEN, activation='leaky_relu', kernel_regularizer=regularizers.l2(0.1))(dense_layer)
dense_layer = Dropout(0.5)(dense_layer)
output = Dense(5, activation='softmax')(dense_layer)

# Model definition
model = Model(inputs=[text_input, metadata_input], outputs=output)

# Compile the model
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
model.summary()




In [19]:
# To fix errors
from scipy.sparse import csr_matrix

# Convert csr_matrix to dense array
if isinstance(train_metadata, csr_matrix):
    train_metadata = train_metadata.toarray()

# Similarly for test_metadata
if isinstance(cv_metadata, csr_matrix):
    cv_metadata = cv_metadata.toarray()

# Similarly for test_metadata
if isinstance(test_metadata, csr_matrix):
    test_metadata = test_metadata.toarray()

In [20]:
# Convert to numpy and check sizes
train_metadata = train_metadata.to_numpy()
cv_metadata = cv_metadata.to_numpy()
test_metadata = test_metadata.to_numpy()
print(type(train_padded))       # Should be <class 'numpy.ndarray'>
print(type(train_metadata))     # Should be <class 'numpy.ndarray'>
print(type(y_train))            # Should be <class 'numpy.ndarray'>
print(train_padded.shape)       # Should be (num_samples, MAX_SEQUENCE_LENGTH)
print(train_metadata.shape)     # Should be (num_samples, num_metadata_features)
print(y_train.shape)

AttributeError: 'numpy.ndarray' object has no attribute 'to_numpy'

In [41]:
# Train model
history = model.fit(
    [train_padded, train_metadata], y_train,
    validation_data=([cv_padded, cv_metadata], y_cv),
    epochs=50,
    batch_size=64
)

Epoch 1/50
[1m342/342[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 112ms/step - accuracy: 0.5217 - loss: 31.5466 - val_accuracy: 0.6563 - val_loss: 0.8516
Epoch 2/50
[1m342/342[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m37s[0m 108ms/step - accuracy: 0.6534 - loss: 0.8511 - val_accuracy: 0.6817 - val_loss: 0.7899
Epoch 3/50
[1m342/342[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 108ms/step - accuracy: 0.6827 - loss: 0.7974 - val_accuracy: 0.6902 - val_loss: 0.7751
Epoch 4/50
[1m342/342[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 109ms/step - accuracy: 0.6989 - loss: 0.7619 - val_accuracy: 0.6941 - val_loss: 0.7592
Epoch 5/50
[1m342/342[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 109ms/step - accuracy: 0.7059 - loss: 0.7478 - val_accuracy: 0.7075 - val_loss: 0.7362
Epoch 6/50
[1m342/342[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 109ms/step - accuracy: 0.7105 - loss: 0.7319 - val_accuracy: 0.7047 - val_loss: 0.7449
Epoch 7/5

In [42]:
# Predict on the test set
y_train_pred_proba = model.predict([train_padded, train_metadata])
y_train_pred = np.argmax(y_train_pred_proba, axis=1)

# Evaluation metrics
print("Accuracy:", accuracy_score(y_train, y_train_pred))
print("\nClassification Report:\n", classification_report(y_train, y_train_pred))
print("\nConfusion Matrix:\n", confusion_matrix(y_train, y_train_pred))

[1m5469/5469[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m28s[0m 5ms/step
Accuracy: 0.9603828571428571

Classification Report:
               precision    recall  f1-score   support

           0       0.90      0.98      0.94     12291
           1       0.93      0.85      0.89     10900
           2       0.97      0.92      0.94     19591
           3       0.96      0.94      0.95     46842
           4       0.97      0.99      0.98     85376

    accuracy                           0.96    175000
   macro avg       0.95      0.94      0.94    175000
weighted avg       0.96      0.96      0.96    175000


Confusion Matrix:
 [[11995   264     7     5    20]
 [ 1216  9263   396    21     4]
 [   16   448 17985  1138     4]
 [    7     7   237 44022  2569]
 [   23     1     1   549 84802]]


In [43]:
# Predict on the test set
y_pred_proba = model.predict([test_padded, test_metadata])
y_pred = np.argmax(y_pred_proba, axis=1)  # For multi-class classification

print(y_pred)
print(y_test)

# Evaluation metrics
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))

[1m1172/1172[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 5ms/step
[4 4 4 ... 3 4 4]
[4 4 4 ... 3 4 4]
Accuracy: 0.6646133333333333

Classification Report:
               precision    recall  f1-score   support

           0       0.68      0.74      0.71      2651
           1       0.44      0.40      0.42      2390
           2       0.55      0.48      0.51      4113
           3       0.54      0.50      0.52     10070
           4       0.77      0.82      0.79     18276

    accuracy                           0.66     37500
   macro avg       0.59      0.59      0.59     37500
weighted avg       0.66      0.66      0.66     37500


Confusion Matrix:
 [[ 1962   472   165    32    20]
 [  692   956   597   115    30]
 [  195   655  1986  1042   235]
 [   32    65   739  5031  4203]
 [   19    22   133  3114 14988]]
