# Trip Advisor model

## Steps

1. Import Trip Advisor data
2. Tokenize the data (create a word index that represents words as numbers)
3. Use an oov token to include words not seen before
4. Pad the sentences to have similar length


In [1]:
import pandas as pd

In [2]:
# Important Variables
vocab_size = 10000
trunc_type ="post"
padding_type = "post"
oov_tok = "<OOV>"
embedding_dim = 16

In [3]:
df = pd.read_csv('to_model.csv')
df.head()

Unnamed: 0.1,Unnamed: 0,Review,Rating,Length,weight
0,8270,"okay steal, unfortunate items cosmetics taken ...",1,443,5
1,14380,excellent value super value hotel right kurfue...,5,337,1
2,1,ok nothing special charge diamond member hilto...,2,1689,4
3,6025,"worst vacation, fiance stayed 7 nights bbp jun...",1,3533,5
4,2802,"beware, not room went paradisus 6 days wedding...",1,411,5


In [4]:
from sklearn.model_selection import train_test_split

In [5]:
sentences = list(df['Review'])
labels = list(df['Rating'])

X_train, X_test, y_train, y_test = train_test_split(sentences, labels, random_state=0)

print(X_train[:5])
print(y_train[-5:])

["experience melia caribe girlfriend recently went melia caribe sept 9th 14th, reading numerous reviews watching weather channel prior leaving say bit nervous upcoming trip, hopefully review answer questions headed resort helpful, try cover informed be.first booked trip hotel website solmelia.com, sol melia resorts actually offers low-price online guarantee booking easy not problem, thing need aware booking hotel website not charge credit card make reservation charge card check hotel, based reviews tripadvisor.com decided royal service master suite.we flew atlanta dominican republic delta airlines, offer flights twice week stay typically days.you arrive dominican republic airport walk plane terminal, wait line enter terminal picture flight wall case want buy, clear photo op open wallet purse pay 10 person tourist entry card, completely meaningless not visa dr poor country charge dollars just in.once bags swarmed baggage handlers eager grab bags awaiting taxi, word advice carry bags, sh

In [6]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [7]:
# Tokenize the words (bag of words) with an oov token
tokenizer = Tokenizer(num_words=vocab_size, oov_token=oov_tok)
tokenizer.fit_on_texts(sentences)
word_index = tokenizer.word_index

In [8]:
import numpy as np

In [9]:
sequences_training = tokenizer.texts_to_sequences(X_train)
max_length = int(np.median([len(x) for x in X_train]))
padded_training = pad_sequences(
    sequences_training, 
    padding=padding_type, 
    truncating=trunc_type,
    maxlen=max_length
)

print(padded_training[0])
print(padded_training.shape)

[  79 1108  788 1139  614   52 1108  788 2632 2061 2823  486 1081   76
 1258  514 2307  896  531   78   74 1970 7846   51 2687  377 1302  797
 1778   25   49  142 2065  883 3181 2327   65   51    2  362    1 1117
 2488 1108  363  246  916  639   66 1132 2927  320  183    4   90  134
  103  997  320    2  362    4  289  624  331   67  301  289  331   55
    2  580   76  773 1117  361  635   17 3277  206  120 2244 5775  346
 1094 5176 3193  393 1484  406  105   10 2592   82  623  704  346 1094
   99   40 1492 1951  198  322 1522 1951 1249  359  422  613   62  520
  712 2587 9356  135 3592 4250  148   85  243  441 2140  331  579    1
    4 3480 1060  308  485  289  636   12  169 3148  447    1 2073    1
 2116 1740  447 4794  286 1022  746 1271  447  252   40  286  342    5
  124 1238  447  267 1116 3031  151 1951    5 1759  272   66  286 1724
 1403   25  219 2592  381  517  716 1108  788 1794    5  630 4298 1883
   95  876 2073    1  301   97  635   17  312  146   37    8  635   17
  312 

In [10]:
sequences_test = tokenizer.texts_to_sequences(X_test)
padded_test = pad_sequences(
    sequences_test, 
    padding=padding_type, 
    truncating=trunc_type,
    maxlen=max_length
)

print(padded_test[0])
print(padded_test.shape)

[8489  737  311  347  503  116  546  116   60  117  255    1   18 1619
 3823  851 3996   65  511  242 4362   76 1970  179    1   60  117  190
 6176    1   60  117 1967 1174 5870  619   51   65 1130 1033   32  732
 8937  377  111 4709  704   25  105  162 2253   95   42 1381    8    9
    4   62 3088 3975  185    4  952 1061  703    3 4255  172  323  551
  217  739  253  172  382 3834 1133  551  491   17 1448 3059   22   50
  255   10  511   48   22   33  686  223    1  211 5845   35  885 2999
 1930 2136 1336    7   77  135 3300 1692 2034  126   77 1451  623   60
 1599    4  108    1  596 1589  800 1856    1 1807  107 1890 2749 1881
 3175  138 2205 2057  920 1910 1380  174    1  181  583   17 2162  699
    9   47  641  198   41    8   38   32  248  105   17 2592  582 1381
  616  687  571  579  177  412  968 1180 5480  291  644  378  164   19
  450  582   41  121    7   17    4 6340 1161 1815 1404  497    1    4
 1183   96   13   24 1906 2203 1120  742    2   25    3 1455  426   39
   59 

In [11]:
# Need this block to get it to work with TensorFlow 2.x
import numpy as np
padded_training = np.array(padded_training)
y_train = np.array(y_train)
padded_test = np.array(padded_test)
y_test = np.array(y_test)

In [12]:

model = tf.keras.models.Sequential([
    tf.keras.layers.Embedding(vocab_size, embedding_dim, input_length = max_length),
    tf.keras.layers.GlobalAveragePooling1D(),
    tf.keras.layers.Dense(32, activation='relu'),
    tf.keras.layers.Dropout(0.2),
    tf.keras.layers.Dense(6, activation='softmax')
])

model.compile(optimizer='adam',
              loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])

In [13]:
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 563, 16)           160000    
_________________________________________________________________
global_average_pooling1d (Gl (None, 16)                0         
_________________________________________________________________
dense (Dense)                (None, 32)                544       
_________________________________________________________________
dropout (Dropout)            (None, 32)                0         
_________________________________________________________________
dense_1 (Dense)              (None, 6)                 198       
Total params: 160,742
Trainable params: 160,742
Non-trainable params: 0
_________________________________________________________________


In [14]:
num_epochs = 50
history = model.fit(
    padded_training, y_train,
    epochs= num_epochs,
    validation_data = (padded_test, y_test),
    verbose=2
)

Epoch 1/50
145/145 - 0s - loss: 1.6762 - accuracy: 0.2946 - val_loss: 1.5730 - val_accuracy: 0.3188
Epoch 2/50
145/145 - 0s - loss: 1.5807 - accuracy: 0.3078 - val_loss: 1.5456 - val_accuracy: 0.3188
Epoch 3/50
145/145 - 0s - loss: 1.5676 - accuracy: 0.3004 - val_loss: 1.5354 - val_accuracy: 0.3188
Epoch 4/50
145/145 - 0s - loss: 1.5542 - accuracy: 0.2950 - val_loss: 1.5278 - val_accuracy: 0.3201
Epoch 5/50
145/145 - 0s - loss: 1.5357 - accuracy: 0.3100 - val_loss: 1.4933 - val_accuracy: 0.3240
Epoch 6/50
145/145 - 0s - loss: 1.4747 - accuracy: 0.3356 - val_loss: 1.4237 - val_accuracy: 0.3494
Epoch 7/50
145/145 - 0s - loss: 1.3844 - accuracy: 0.3672 - val_loss: 1.3344 - val_accuracy: 0.3936
Epoch 8/50
145/145 - 0s - loss: 1.2870 - accuracy: 0.4093 - val_loss: 1.2503 - val_accuracy: 0.4105
Epoch 9/50
145/145 - 0s - loss: 1.2091 - accuracy: 0.4282 - val_loss: 1.1950 - val_accuracy: 0.4392
Epoch 10/50
145/145 - 0s - loss: 1.1564 - accuracy: 0.4562 - val_loss: 1.1662 - val_accuracy: 0.4502