In [29]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.layers import MultiHeadAttention, LayerNormalization, Dropout, Layer
from tensorflow.keras.layers import Embedding, Input, GlobalAveragePooling1D, Dense
from tensorflow.keras.models import Sequential, Model
import numpy as np
import pandas as pd
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
import warnings
warnings.filterwarnings("ignore", category=np.VisibleDeprecationWarning)


df = pd.read_csv('./data/unprocessed/tripadvisor_hotel_reviews.csv')

# Tokenize the reviews
tokenizer = Tokenizer()
tokenizer.fit_on_texts(df['Review'])
vocab_size = len(tokenizer.word_index) + 1

# Convert text to sequences
sequences = tokenizer.texts_to_sequences(df['Review'])

# Pad sequences to ensure uniform length
max_seq_length = max(len(seq) for seq in sequences)
padded_sequences = pad_sequences(sequences, maxlen=max_seq_length, padding='post')


padded_sequences = padded_sequences[:, :100]  # limit the size to 100 tokens for now

max_seq_length = 100

print(padded_sequences.shape)

X_train, X_test, y_train, y_test = train_test_split(padded_sequences, df['Rating'], test_size=0.2, random_state=22)

# make scores 0 to 4 instead of 1 to 5
y_train -= 1
y_test -= 1

print(X_train[:2])

class TransformerBlock(Layer):
    def __init__(self, embed_dim, num_heads, ff_dim, rate=0.1):
        super(TransformerBlock, self).__init__()
        self.att = MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim)
        self.ffn = Sequential(
            [Dense(ff_dim, activation="relu"), 
             Dense(embed_dim),]
        )
        self.layernorm1 = LayerNormalization(epsilon=1e-6)
        self.layernorm2 = LayerNormalization(epsilon=1e-6)
        self.dropout1 = Dropout(rate)
        self.dropout2 = Dropout(rate)

    def call(self, inputs, training):
        attn_output = self.att(inputs, inputs)
        attn_output = self.dropout1(attn_output, training=training)
        out1 = self.layernorm1(inputs + attn_output)
        ffn_output = self.ffn(out1)
        ffn_output = self.dropout2(ffn_output, training=training)
        return self.layernorm2(out1 + ffn_output)
    

class TokenAndPositionEmbedding(Layer):
    def __init__(self, maxlen, vocab_size, embed_dim):
        super(TokenAndPositionEmbedding, self).__init__()
        self.token_emb = Embedding(input_dim=vocab_size, output_dim=embed_dim)
        self.pos_emb = Embedding(input_dim=maxlen, output_dim=embed_dim)

    def call(self, x):
        maxlen = tf.shape(x)[-1]
        positions = tf.range(start=0, limit=maxlen, delta=1)
        positions = self.pos_emb(positions)
        x = self.token_emb(x)
        return x + positions



embed_dim = 32  # Embedding size for each token
num_heads = 2  # Number of attention heads
ff_dim = 32  # Hidden layer size in feed forward network inside transformer

inputs = Input(shape=(max_seq_length,))
embedding_layer = TokenAndPositionEmbedding(max_seq_length, vocab_size, embed_dim)
x = embedding_layer(inputs)
transformer_block = TransformerBlock(embed_dim, num_heads, ff_dim)
x = transformer_block(x)
x = GlobalAveragePooling1D()(x)
x = Dropout(0.1)(x)
x = Dense(20, activation="relu")(x)
x = Dropout(0.1)(x)
outputs = Dense(5, activation="softmax")(x)

model = Model(inputs=inputs, outputs=outputs)

model.compile(optimizer="adam", loss="sparse_categorical_crossentropy", metrics=["accuracy"])

history = model.fit(X_train, y_train, 
                    batch_size=10, epochs=5, 
                    validation_data=(X_test, y_test)
                   )



(20491, 100)
[[  217   215    52   228   412  1612    11    26     8   114    10   403
   4327  1497   728    10    54  2047  8400     2    27    11  3632     2
     66   722   256   688   401     6  3514   297   926   960  1083   100
   2207    16    23  4647  3040    10   650  4507    99    87  5425    99
    271  2113 17985  3919  4327    34 18387  1984  1591   462   531  1590
    390   351    11 51415  4565    39   212  8310   950 51416   960   470
    623    34    62  4776  2175   934 15879   676  1224  4776    10  1022
   2257   471    13   720  2328   139    27   208   218     2    51     3
      6   271   150  2315]
 [ 2622  2674 16577    10    36    17     8 16577   710   313   859   180
   3599   240  4830  1468    24  5941    39  3999  2515   524   260  1024
     60  1094  3112   489     1    53     2   227   318    13  2509  2302
     36    79     9     5 17646  4237    26   706    22   191    20     7
     27    31   198     5  1086    21   117   229     5    54 38957   35

In [30]:
results = model.evaluate(X_test, y_test, verbose=2)

for name, value in zip(model.metrics_names, results):
    print("%s: %.3f" % (name, value))

129/129 - 1s - loss: 1.6199 - accuracy: 0.5753 - 524ms/epoch - 4ms/step
loss: 1.620
accuracy: 0.575


The 7th, 8th and 9th mock reviews were taken from recent trip advisor reviews (not in the dataset), and ratings were 2, 1, 3 respectively.

The following block of code will predict the ratings in the mock_reviews and print out a list of the predicted ratings.

In [38]:
mock_reviews = pd.Series(
    [ 
        "The staff was friendly!",
        "The service was horrible!",
        "The hotel is dirty!",
        "The location is convenient.",
        "The elevator was broken",
        "exciting views.",
        "I travel for work in wind energy field. This is the closest motel. Worth driving 20 miles further for a clean comfortable room. This is the 3rd stay. Due to having to be at the windfarm early, we stayed. This room was in the older building. Uncomfortable beds, dingy bedding. I had to go buy Lysol to spray. If you have allergies, stay away. Shower curtain has mold all along bottom. I guess they don't know what bleach is. I don't usually post negative reviews, but felt need to. Also, overcharged. 79.00 for 30.00 room. The newer building was not much better.",
        "Do not eat at the buffet. We went on NYE there was barely any food and what was there was old. It consisted of ham, a couple pieces of friend chicken, white rice , and corn on cob that I couldn’t even eat it tasted so bad. I scraped the bottom of the lettuce bowl for a salad and got one chicken breast on NYE. We went back the next morning for breakfast(as this is the only place to eat on site) and never got our food after waiting for 30 minutes with only 4 tables full in the place. All the other tables waited longer and got cold food and runny pancakes. I saw two tables walk out during our wait. I watched a burrito sit in the window the entire time we were there. The waitress told us we could refill our own drinks as she was so busy. There were 3 people working at the time and again 4 tables only of customers.",
        "We stayed for a 3 day weekend in May. Hotel is decent for the area and the price. People were friendly. Big refrigerator, microwave and coffee maker in the room. Bed was comfortable and the shower was good. \
        No real closet - just a rod in the room to hang things on. No safe so I had to take my laptop with me everywhere I went. Do not like just leaving it in the room. \
        Casino hours very weird. They open table games at 4pm. We were there on Thursday night around 8:30pm. Only 2 tables open - blackjack and 3 card poker. \
        My husband arrived around 9:30 to join me. At 10pm the dealer tells us they are closing the tables since it is a week night. So early! On Friday we didn't\
        get there until around 10:15. The night before the pit boss told us the tables stay open on Friday and Saturday until sometime between midnight and 2am\
         depending on how busy they are so we thought we would be able to play for awhile. At 10:45pm they shut the tables down. So weird! Also - sign on the wall\
           says if you win a jackpot on the slot machines that is $1200 or more you have to have your drivers license AND social security card. Who carries their social security card with them!",
        "The best hotel I have ever seen!"
    ]
)

x_mock = pad_sequences(tokenizer.texts_to_sequences(mock_reviews), maxlen=max_seq_length, padding='post')
result_list = model.predict(x_mock)
numerical_result = []
for result in result_list:
    rating = 0
    for i in range(5):
        rating += result[i] * (i + 1)
    numerical_result.append(rating)
print(numerical_result)



[3.667477898299694, 1.007367304426225, 1.0028702747229659, 3.0528145637363195, 1.0128547379540578, 3.849524668585218, 1.9782704779809137, 1.9600387723257882, 2.015419053393657, 4.127005843445659]
