In [1]:
import os
import pandas as pd
import numpy as np
# Load the hotel-review CSV into a DataFrame and report what was read.
reviews_csv_path = "Data File/tripadvisor_hotel_reviews.csv"
dataFile = pd.read_csv(reviews_csv_path)

# Print the (rows, columns) shape, then pandas' truncated text view of the frame.
print(f"Data read and has shape {dataFile.shape}")
print(dataFile)

Data read and has shape (20491, 2)
                                                  Review  Rating
0      nice hotel expensive parking got good deal sta...       4
1      ok nothing special charge diamond member hilto...       2
2      nice rooms not 4* experience hotel monaco seat...       3
3      unique, great stay, wonderful time hotel monac...       5
4      great stay great stay, went seahawk game aweso...       5
...                                                  ...     ...
20486  best kept secret 3rd time staying charm, not 5...       5
20487  great location price view hotel great quick pl...       4
20488  ok just looks nice modern outside, desk staff ...       2
20489  hotel theft ruined vacation hotel opened sept ...       1
20490  people talking, ca n't believe excellent ratin...       2

[20491 rows x 2 columns]


In [2]:
# ---- Text / vocabulary settings ----
# Number of tokens per review; reviews are truncated or padded to this length.
max_review_length = 500
# Size of the word index, i.e. how many of the most common words are kept as
# features. NOTE: the code assumes the reviews contain at least this many words.
max_words = 52212
# Length of each embedding vector (GloVe-based).
embedding_dim = 100

# ---- Training settings ----
# Proportion of the data intended for the validation set.
validation_prop = 0.2
# Number of training cycles (epochs) for the networks.
no_epochs = 10
# Mini-batch size used for training.
batch_size = 64

# ---- Dataset size bookkeeping ----
# Number of reviews to be read from the file.
# NOTE(review): not referenced by the cells visible here - confirm it is still needed.
no_reviews = 20491
# NOTE(review): the two sample counts below are not referenced by the visible
# cells either; presumably left over from an earlier manual split - confirm.
training_samples = 12000
validation_samples = 7000

In [3]:
from sklearn.model_selection import train_test_split
from keras.utils import to_categorical
# Use the tokenizer to code the reviews

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Pull the raw text and the rating labels out of the loaded DataFrame.
reviews = dataFile["Review"].values
ratings = dataFile["Rating"].values

# Build the vocabulary and convert each review into a sequence of word indices.
# With num_words=max_words only indices below max_words are emitted.
tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(reviews)
sequences = tokenizer.texts_to_sequences(reviews)
word_index = tokenizer.word_index

# Pad/truncate every sequence to max_review_length.
# Stored under its own name so that `dataFile` keeps pointing at the DataFrame
# and this cell can be re-run without first re-running the loading cell.
padded_reviews = pad_sequences(sequences, maxlen=max_review_length)

# Ratings are shifted to start at 0 before one-hot encoding. num_classes is
# pinned to 5 to match the 5-unit softmax output layer, rather than being
# inferred from whichever ratings happen to be present.
labels = to_categorical(np.asarray(ratings - 1), num_classes=5)

# Rows from test_start onwards are held out as the test set; the rows before
# it are split into training and validation sets.
test_start = 19000
x_test = padded_reviews[test_start:]
y_test = labels[test_start:]
testExamples = len(labels) - test_start
x_train, x_val, y_train, y_val = train_test_split(
    padded_reviews[:test_start],
    labels[:test_start],
    test_size=validation_prop,  # configured validation proportion (0.2)
    random_state=42,            # fixed seed so the split is reproducible
)




In [4]:
# Directory holding the GloVe vectors, built with os.path.join so the path is
# not tied to Windows-style backslash separators.
glove_dir = os.path.join("Glove", "glove.6B")
embeddings_index = {}

# Each line of the file is "<word> <v1> <v2> ..."; map word -> float32 vector.
# The context manager guarantees the file is closed even if parsing fails.
with open(os.path.join(glove_dir, 'glove.6B.100d.txt'), encoding='utf-8') as glove_file:
    for line in glove_file:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs

print('no of words in glove embeddings =', len(embeddings_index))

no of words in glove embeddings = 400000


In [5]:
# Build the embedding matrix: row idx holds the GloVe vector of the word whose
# tokenizer index is idx. Rows stay all-zero for words missing from GloVe.

embedding_matrix = np.zeros((max_words, embedding_dim))

for token, idx in word_index.items():
    # Skip indices that fall outside the matrix.
    if idx >= max_words:
        continue
    glove_vector = embeddings_index.get(token)
    # Words without a GloVe entry keep their zero row.
    if glove_vector is None:
        continue
    embedding_matrix[idx] = glove_vector


print("shape of embeddings matrix is:",  embedding_matrix.shape)

# Show the first six embedding components for word indices 1..10.
# Iteration stops at the first index above 10 encountered in word_index.

for token, idx in word_index.items():
    if idx > 10:
        break
    leading_components = embedding_matrix[idx, 0:6]
    print(f'{idx}:{token}\t--> {leading_components}')


shape of embeddings matrix is: (52212, 100)
1:hotel	--> [ 0.43044001 -0.71715999  0.13989     0.59311002 -0.16727     0.56128001]
2:room	--> [-0.024843    0.47766     0.32437    -0.054239   -0.47622001  1.10430002]
3:not	--> [-0.19103999  0.17601     0.36919999 -0.50322998 -0.47560999  0.15798   ]
4:great	--> [-0.013786    0.38216001  0.53236002  0.15261    -0.29694    -0.20558   ]
5:n't	--> [ 0.15730999  0.3953      0.63586003 -1.09749997 -0.95767999 -0.013841  ]
6:good	--> [-0.030769    0.11993     0.53908998 -0.43696001 -0.73936999 -0.15345   ]
7:staff	--> [-0.61250001 -0.29506999 -0.28917    -0.36431    -0.39695001  0.097624  ]
8:stay	--> [-0.41615999 -0.26538     0.21720999 -0.26014999 -0.18043999  0.38745001]
9:did	--> [ 0.30449    -0.19628     0.20225    -0.61686999 -0.68484002 -0.11887   ]
10:just	--> [ 0.075026    0.39324999  0.90314001 -0.30451    -0.32767999  0.59630001]


In [6]:
from keras.models import Sequential
from keras.layers import Embedding, SimpleRNN, Dense, LSTM
from keras import layers

# Take the Embedding layer's dimensions from the weight matrix itself, so the
# layer shape always matches the pre-trained weights. The previous
# len(word_index)+1 only matched the matrix when it happened to equal max_words.
vocab_rows, vector_len = embedding_matrix.shape

# Frozen pre-trained embeddings -> LSTM(64) -> 5-way softmax over the ratings.
network_G = Sequential()
network_G.add(Embedding(vocab_rows, vector_len, weights=[embedding_matrix], input_length=max_review_length, trainable=False))
network_G.add(LSTM(64))
network_G.add(Dense(5, activation='softmax'))
network_G.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 500, 100)          5221200   
                                                                 
 lstm (LSTM)                 (None, 64)                42240     
                                                                 
 dense (Dense)               (None, 5)                 325       
                                                                 
Total params: 5,263,765
Trainable params: 42,565
Non-trainable params: 5,221,200
_________________________________________________________________


In [7]:
# Configure training: Adam optimiser, categorical cross-entropy on the one-hot
# labels, accuracy tracked under the metric name 'acc'.
network_G.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['acc'],
)

# Train on the training split and score the validation split after each epoch;
# the returned History object is kept in hist_g.
hist_g = network_G.fit(
    x_train,
    y_train,
    epochs=no_epochs,
    batch_size=batch_size,
    validation_data=(x_val, y_val),
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [8]:
# Evaluate on the held-out test arrays. `steps` counts batches, not samples,
# so passing the number of test examples asked for far more batches than the
# data can supply. For in-memory arrays it is omitted and the batch size is
# given instead.
res = network_G.evaluate(x_test, y_test, batch_size=batch_size, verbose=1)
# res is [loss, acc], following the metrics passed to compile().
print('Accuracy on test set: %.3f' % res[1])

Accuracy on test set: 0.641
