# Trip Advisor model

## Steps

1. Import Trip Advisor data
2. Tokenize the data (create a word index that represents words as numbers)
3. Use an oov token to include words not seen before
4. Pad the sentences to have similar length


In [1]:
import pandas as pd

In [2]:
# Important Variables
vocab_size = 10000
trunc_type ="post"
padding_type = "post"
oov_tok = "<OOV>"
embedding_dim = 16

In [3]:
df = pd.read_csv('tripadvisor_hotel_reviews.csv')
df.head()

Unnamed: 0,Review,Rating
0,nice hotel expensive parking got good deal sta...,4
1,ok nothing special charge diamond member hilto...,2
2,nice rooms not 4* experience hotel monaco seat...,3
3,"unique, great stay, wonderful time hotel monac...",5
4,"great stay great stay, went seahawk game aweso...",5


In [4]:
from sklearn.model_selection import train_test_split

In [5]:
sentences = list(df['Review'])
labels = list(df['Rating'])

X_train, X_test, y_train, y_test = train_test_split(sentences, labels, random_state=0)

print(X_train[:5])
print(y_train[-5:])

['fabulous hotel mum just returned 4 night stay hotel 1898 fabulous, recently decorated rooms immaculate decor fantastic really beautiful hotel, location perfect nicer end la ramblas need close treat return tranquility hotel busy day sight seeing.the staff attentive polite spoke perfect english, price drinks bar expect little hard swallow having gorgeous course meal local resturant price gin tonics.although roof pool undergoing refurbishment stay basement pool surrounding facilities adequate,  ', "romantic international ambience spent honeymoon melia caribe 23-30. plane landed torrential downpour soaked skin steps plane, rained 7 days just hot gorgeous, truly loved resort food people, management helpful needed courteous friendly, nightly shows fun casino, pools incredible beach beautiful, just short stroll resort swim deserted stretches beach wanted, took outback tour must-do tourists, islanders live visit mountains macou beach enjoy lunch siesta hammocks, buy souviniers tour rum 2/bot

In [6]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [7]:
# Tokenize the words (bag of words) with an oov token
tokenizer = Tokenizer(num_words=vocab_size, oov_token=oov_tok)
tokenizer.fit_on_texts(sentences)
word_index = tokenizer.word_index

In [8]:
import numpy as np

In [9]:
sequences_training = tokenizer.texts_to_sequences(X_train)
max_length = int(np.median([len(x) for x in X_train]))
padded_training = pad_sequences(
    sequences_training, 
    padding=padding_type, 
    truncating=trunc_type,
    maxlen=max_length
)

print(padded_training[0])
print(padded_training.shape)

[ 330    2 3534   11  196   67   18    9    2 5022  330  559  446   13
 1117  429   97   28   53    2   15   99  731  197  257  616  107   90
  939  157 5667    2  328   21 1312  785   29    8  662  598  467   99
  152   73  123   43  221   35  209    1  184  497  383  396  308 2076
   73 4847    1 3082  893   30 4341 3079    9 2090   30 1586  362  549
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0 

In [10]:
sequences_test = tokenizer.texts_to_sequences(X_test)
padded_test = pad_sequences(
    sequences_test, 
    padding=padding_type, 
    truncating=trunc_type,
    maxlen=max_length
)

print(padded_test[0])
print(padded_test.shape)

[   1   65  218  216  432  767    2   16    2  436  107  239  289  387
    7   13   35 2864   67   13 4626 1187 1381 1148    1  129   12  244
 3959  173 5356    8   28   12   41 1477    3   10    4  538   25   33
    1  352   12   35  320   28  391  167 4696 5018  434  103   57    8
 1477   12    8  705 6889  517   13    6  112  496 1182   11 1528  341
    3  642  194 1263    6    1 1148    1  806   21  512 3186   30   57
   20 1157    4   25   39   25 5110  935 2864  248 1208  258  111  225
    2   31   44   42  723   25 5870 1564   65    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0 

In [11]:
# Need this block to get it to work with TensorFlow 2.x
import numpy as np
padded_training = np.array(padded_training)
y_train = np.array(y_train)
padded_test = np.array(padded_test)
y_test = np.array(y_test)

In [12]:
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(vocab_size, embedding_dim, input_length = max_length),
    tf.keras.layers.GlobalAveragePooling1D(),
    tf.keras.layers.Dense(24, activation='relu'),
    tf.keras.layers.Dense(5, activation = 'softmax')
])
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [13]:
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 531, 16)           160000    
_________________________________________________________________
global_average_pooling1d (Gl (None, 16)                0         
_________________________________________________________________
dense (Dense)                (None, 24)                408       
_________________________________________________________________
dense_1 (Dense)              (None, 5)                 125       
Total params: 160,533
Trainable params: 160,533
Non-trainable params: 0
_________________________________________________________________


In [14]:
num_epochs = 30
history = model.fit(
    padded_training, y_train,
    epochs= num_epochs,
    validation_data = (padded_test, y_test),
    verbose=2
)

Epoch 1/30
481/481 - 2s - loss: 5.7033 - accuracy: 0.1217 - val_loss: 5.6985 - val_accuracy: 0.0472
Epoch 2/30
481/481 - 1s - loss: 5.7033 - accuracy: 0.1158 - val_loss: 5.6985 - val_accuracy: 0.2040
Epoch 3/30
481/481 - 1s - loss: 5.7033 - accuracy: 0.1104 - val_loss: 5.6985 - val_accuracy: 0.0951
Epoch 4/30
481/481 - 1s - loss: 5.7033 - accuracy: 0.1117 - val_loss: 5.6985 - val_accuracy: 0.0824
Epoch 5/30
481/481 - 1s - loss: 5.7033 - accuracy: 0.1095 - val_loss: 5.6985 - val_accuracy: 0.1052
Epoch 6/30
481/481 - 1s - loss: 5.7033 - accuracy: 0.1128 - val_loss: 5.6985 - val_accuracy: 0.1893
Epoch 7/30
481/481 - 1s - loss: 5.7033 - accuracy: 0.1129 - val_loss: 5.6985 - val_accuracy: 0.0519
Epoch 8/30
481/481 - 1s - loss: 5.7033 - accuracy: 0.1085 - val_loss: 5.6985 - val_accuracy: 0.1290
Epoch 9/30
481/481 - 1s - loss: 5.7033 - accuracy: 0.1067 - val_loss: 5.6985 - val_accuracy: 0.0902
Epoch 10/30
481/481 - 1s - loss: 5.7033 - accuracy: 0.1100 - val_loss: 5.6985 - val_accuracy: 0.0262