In [1]:
import pandas as pd
df = pd.read_excel('hotel_reviews.xlsx')
df.head()

Unnamed: 0,Review,Rating
0,nice hotel expensive parking got good deal sta...,4
1,ok nothing special charge diamond member hilto...,2
2,nice rooms not 4* experience hotel monaco seat...,3
3,"unique, great stay, wonderful time hotel monac...",5
4,"great stay great stay, went seahawk game aweso...",5


In [2]:
# if rating is more than or equal to 4 then it returns 'pos'. If not, 'neg'
df['Rating'] = df['Rating'].apply(lambda c: 'pos' if c >=3 else 'neg')
df.head()

Unnamed: 0,Review,Rating
0,nice hotel expensive parking got good deal sta...,pos
1,ok nothing special charge diamond member hilto...,neg
2,nice rooms not 4* experience hotel monaco seat...,pos
3,"unique, great stay, wonderful time hotel monac...",pos
4,"great stay great stay, went seahawk game aweso...",pos


In [3]:
df.shape

(20491, 2)

In [4]:
labels = []
texts = []

for i in df['Rating']:
    if i == "neg":
        labels.append(0)
    else:
        labels.append(1)  

for i in df['Review']:
    texts.append(i)

In [5]:
maxlen = 100 # cuts off review after 100 words
training_samples = 2000 # Trains on 2000 samples
validation_samples = 10000 # Validates o 10000 samples
max_words = 10000 # Considers only the top 10000 words in the dataset

- Tokenize data

In [6]:
from keras.preprocessing.text import Tokenizer
tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts) #added the square brackets in texts
# takes a list of texts and returns a list of sequences
# each sequence is a list of integers corresponding to the words in the text
word_index = tokenizer.word_index                   
print("Found %s unique tokens." % len(word_index))



Found 52143 unique tokens.


- Pad the sequences 

In [7]:
from tensorflow.keras.preprocessing.sequence import pad_sequences
import numpy as np

data = pad_sequences(sequences, maxlen=maxlen)

labels = np.asarray(labels)

print("Shape of data tensor:", data.shape)
print("Shape of label tensor:", labels.shape)

Shape of data tensor: (20491, 100)
Shape of label tensor: (20491,)


- Split into trianing and validation set

In [8]:
indices = np.arange(data.shape[0]) # Splits data into training and validation set, 
# but shuffles it since samples are ordered: all negatives first, then all positive
np.random.shuffle(indices)
data = data[indices]
labels = labels[indices]

x_train = data[:training_samples] 
y_train = labels[:training_samples] 
x_val = data[training_samples:training_samples+validation_samples] 
y_val = labels[training_samples:training_samples+validation_samples] 

In [9]:
y_train.shape

(2000,)

In [11]:
import os
glove_dir = "/Users/punamichowdary/Coding/NLP/glove.6B.100d.txt"

embeddings_index = {}

f = open(os.path.join(glove_dir), encoding='utf-8') 
for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype="float32")
    embeddings_index[word] = coefs
f.close()

print("found %s word vectors." % len (embeddings_index))

found 400000 word vectors.


In [12]:
embedding_dim = 100 # GloVe contains 100-dimensional embedding vectors for 400.000 words

embedding_matrix = np.zeros((max_words, embedding_dim))
for word, i in word_index.items():
    if i < max_words:
        embedding_vector = embeddings_index.get(word) 
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector # Words not found in the embedding index will all be zeros

## LSTM Model using pre-trained GloVe embeddings 

In [13]:
from tensorflow.keras.layers import Embedding
from tensorflow.keras.initializers import Constant

# load these pre-trained word embeddings into an Embedding layer
embedding_layer = Embedding(max_words,
                            embedding_dim,
                            embeddings_initializer=Constant(embedding_matrix),
                            input_length=maxlen,
                            trainable=False)

In [14]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense
model = Sequential()
model.add(embedding_layer)
model.add(LSTM(128, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(1, activation='sigmoid'))
model.summary()


2022-12-28 17:54:46.735766: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 100, 100)          1000000   
                                                                 
 lstm (LSTM)                 (None, 128)               117248    
                                                                 
 dense (Dense)               (None, 1)                 129       
                                                                 
Total params: 1,117,377
Trainable params: 117,377
Non-trainable params: 1,000,000
_________________________________________________________________


In [15]:
model.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

In [16]:
model.fit(x_train, y_train,
          batch_size=32,
          epochs=10,
          validation_data=(x_val, y_val))

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7ff62160cb20>

In [17]:
score, acc = model.evaluate(data, labels,
                            batch_size=32)
print('Test accuracy with RNN:', acc)

Test accuracy with RNN: 0.9049338698387146


In [18]:
y_preds_array = model.predict(data)



In [19]:
y_preds_array

array([[0.9903483 ],
       [0.97851807],
       [0.9943525 ],
       ...,
       [0.980575  ],
       [0.9859848 ],
       [0.97771835]], dtype=float32)

In [20]:
y_preds=[]
for i in y_preds_array:
    for j in i:
        y_preds.append(j)
        
y_preds        

[0.9903483,
 0.97851807,
 0.9943525,
 0.9928862,
 0.38162825,
 0.99444854,
 0.45010883,
 0.59254044,
 0.95864666,
 0.04321215,
 0.8544922,
 0.992659,
 0.9913464,
 0.06641059,
 0.9928019,
 0.99262345,
 0.02205378,
 0.5287397,
 0.80949265,
 0.92831826,
 0.99391216,
 0.22777903,
 0.9410094,
 0.9917811,
 0.93483514,
 0.97706276,
 0.988129,
 0.01690246,
 0.1777138,
 0.9941249,
 0.8424747,
 0.9942326,
 0.9928153,
 0.9581279,
 0.9933744,
 0.7559789,
 0.96111506,
 0.98140913,
 0.7365769,
 0.99624497,
 0.77961934,
 0.35469094,
 0.99695206,
 0.98899484,
 0.9695066,
 0.13064967,
 0.8385062,
 0.98407114,
 0.99802977,
 0.95800614,
 0.63551307,
 0.9916015,
 0.9878787,
 0.99523914,
 0.9845525,
 0.99565876,
 0.99640805,
 0.9956082,
 0.97244203,
 0.99658906,
 0.9902395,
 0.082269184,
 0.09942623,
 0.98970383,
 0.98864615,
 0.9866512,
 0.82312983,
 0.93063587,
 0.99660546,
 0.82124025,
 0.99695957,
 0.9913748,
 0.07207659,
 0.99234957,
 0.8838817,
 0.9943221,
 0.9926251,
 0.9343017,
 0.15514001,
 0.9802

In [21]:
predictions=[]
for i in y_preds:
    if i<=0.9:
        predictions.append('pos')
    else:
        predictions.append('neg')
        
predictions       

['neg',
 'neg',
 'neg',
 'neg',
 'pos',
 'neg',
 'pos',
 'pos',
 'neg',
 'pos',
 'pos',
 'neg',
 'neg',
 'pos',
 'neg',
 'neg',
 'pos',
 'pos',
 'pos',
 'neg',
 'neg',
 'pos',
 'neg',
 'neg',
 'neg',
 'neg',
 'neg',
 'pos',
 'pos',
 'neg',
 'pos',
 'neg',
 'neg',
 'neg',
 'neg',
 'pos',
 'neg',
 'neg',
 'pos',
 'neg',
 'pos',
 'pos',
 'neg',
 'neg',
 'neg',
 'pos',
 'pos',
 'neg',
 'neg',
 'neg',
 'pos',
 'neg',
 'neg',
 'neg',
 'neg',
 'neg',
 'neg',
 'neg',
 'neg',
 'neg',
 'neg',
 'pos',
 'pos',
 'neg',
 'neg',
 'neg',
 'pos',
 'neg',
 'neg',
 'pos',
 'neg',
 'neg',
 'pos',
 'neg',
 'pos',
 'neg',
 'neg',
 'neg',
 'pos',
 'neg',
 'neg',
 'neg',
 'neg',
 'neg',
 'neg',
 'neg',
 'neg',
 'neg',
 'neg',
 'neg',
 'neg',
 'pos',
 'neg',
 'neg',
 'neg',
 'pos',
 'neg',
 'neg',
 'neg',
 'pos',
 'neg',
 'neg',
 'neg',
 'neg',
 'neg',
 'neg',
 'pos',
 'pos',
 'neg',
 'neg',
 'neg',
 'neg',
 'pos',
 'neg',
 'neg',
 'neg',
 'neg',
 'neg',
 'neg',
 'neg',
 'neg',
 'neg',
 'pos',
 'neg',
 'pos',


In [22]:
given_labels = df['Rating'].tolist()
given_labels

['pos',
 'neg',
 'pos',
 'pos',
 'pos',
 'pos',
 'pos',
 'pos',
 'pos',
 'pos',
 'neg',
 'pos',
 'pos',
 'pos',
 'pos',
 'neg',
 'neg',
 'pos',
 'pos',
 'pos',
 'pos',
 'pos',
 'pos',
 'pos',
 'neg',
 'pos',
 'pos',
 'pos',
 'pos',
 'pos',
 'pos',
 'pos',
 'neg',
 'neg',
 'pos',
 'pos',
 'pos',
 'pos',
 'pos',
 'pos',
 'neg',
 'pos',
 'neg',
 'pos',
 'neg',
 'neg',
 'pos',
 'pos',
 'pos',
 'pos',
 'pos',
 'pos',
 'pos',
 'pos',
 'pos',
 'pos',
 'pos',
 'pos',
 'pos',
 'pos',
 'pos',
 'pos',
 'pos',
 'pos',
 'pos',
 'neg',
 'pos',
 'pos',
 'pos',
 'neg',
 'pos',
 'neg',
 'pos',
 'pos',
 'pos',
 'pos',
 'neg',
 'neg',
 'pos',
 'neg',
 'neg',
 'pos',
 'pos',
 'neg',
 'pos',
 'pos',
 'pos',
 'pos',
 'neg',
 'pos',
 'pos',
 'pos',
 'neg',
 'pos',
 'pos',
 'pos',
 'pos',
 'neg',
 'pos',
 'neg',
 'pos',
 'neg',
 'pos',
 'pos',
 'pos',
 'pos',
 'pos',
 'pos',
 'pos',
 'pos',
 'pos',
 'pos',
 'pos',
 'pos',
 'pos',
 'pos',
 'pos',
 'pos',
 'pos',
 'pos',
 'pos',
 'pos',
 'pos',
 'pos',
 'pos',


In [23]:
from sklearn.metrics import accuracy_score
accuracy=accuracy_score(given_labels, predictions)
accuracy

0.329705724464399

In [24]:
df['y_preds']=pd.Series(y_preds)
df['Predictions']=pd.Series(predictions)
df.head()

Unnamed: 0,Review,Rating,y_preds,Predictions
0,nice hotel expensive parking got good deal sta...,pos,0.990348,neg
1,ok nothing special charge diamond member hilto...,neg,0.978518,neg
2,nice rooms not 4* experience hotel monaco seat...,pos,0.994353,neg
3,"unique, great stay, wonderful time hotel monac...",pos,0.992886,neg
4,"great stay great stay, went seahawk game aweso...",pos,0.381628,pos


In [25]:
df['y_preds'].astype(float)

0        0.990348
1        0.978518
2        0.994353
3        0.992886
4        0.381628
           ...   
20486    0.996736
20487    0.382758
20488    0.980575
20489    0.985985
20490    0.977718
Name: y_preds, Length: 20491, dtype: float64

In [26]:
df['Match'] = df.Predictions.eq(df.Rating)
df.head()

Unnamed: 0,Review,Rating,y_preds,Predictions,Match
0,nice hotel expensive parking got good deal sta...,pos,0.990348,neg,False
1,ok nothing special charge diamond member hilto...,neg,0.978518,neg,True
2,nice rooms not 4* experience hotel monaco seat...,pos,0.994353,neg,False
3,"unique, great stay, wonderful time hotel monac...",pos,0.992886,neg,False
4,"great stay great stay, went seahawk game aweso...",pos,0.381628,pos,True


In [27]:
df['Match'].value_counts()

False    13735
True      6756
Name: Match, dtype: int64

In [28]:
len(df['Match'])

20491

In [29]:
len((df['Match']).values==False)

20491

In [30]:
(len(df['Match']==True))/(len(df['Match']))

1.0

## Another way of doing the same thing

In [31]:
from keras.models import Sequential
from keras.layers import Embedding, Flatten, Dense

model = Sequential()
model.add(Embedding(max_words, embedding_dim, input_length = maxlen)) 
model.add(Flatten()) 
model.add(Dense(32, activation = "relu"))
model.add(Dense(1, activation="sigmoid")) 
model.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, 100, 100)          1000000   
                                                                 
 flatten (Flatten)           (None, 10000)             0         
                                                                 
 dense_1 (Dense)             (None, 32)                320032    
                                                                 
 dense_2 (Dense)             (None, 1)                 33        
                                                                 
Total params: 1,320,065
Trainable params: 1,320,065
Non-trainable params: 0
_________________________________________________________________


In [32]:
model.layers[0].set_weights([embedding_matrix])
model.layers[0].trainable = False 

'''Setting this to False makes sure the Embedding layer is non-trainable when calling it.'''

'Setting this to False makes sure the Embedding layer is non-trainable when calling it. \nIf you were to set trainable = True, then it will allow the optimization algorithm to modify \nthe values of the word embeddings. Pretrained parts shouldn\'t be updated be during training, \nto avoid them forgetting what they already "know"'

In [33]:
model.compile(optimizer = "rmsprop", 
              loss = "binary_crossentropy", 
              metrics = ["acc"]) 
history = model.fit(x_train, y_train,
                   epochs = 10,
                   batch_size = 32,
                   validation_data = (x_val, y_val))
model.save_weights("pre_trained_glove_model.h5")        

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [34]:
labels = []
texts = []

for i in df['Rating']:
    if i == "neg":
        labels.append(0)
    else:
        labels.append(1)  

for i in df['Review']:
    texts.append(i)



In [35]:
sequences = tokenizer.texts_to_sequences(texts)
x_test = pad_sequences(sequences, maxlen=maxlen)
y_test = np.asarray(labels)

In [36]:
model.load_weights('pre_trained_glove_model.h5')
model.evaluate(x_test, y_test)



[0.67576003074646, 0.8776535987854004]

In [37]:
model.metrics_names

['loss', 'acc']

In [38]:
# Accuracy = 87.47

## Training word embeddings on the fly 

In [42]:
labels = []
texts = []

for i in df['Rating']:
    if i == "neg":
        labels.append(0)
    else:
        labels.append(1)  

for i in df['Review']:
    texts.append(i)

In [43]:
data = pad_sequences(sequences, maxlen=maxlen)

labels = np.asarray(labels)


In [44]:
rnnmodel = Sequential()
rnnmodel.add(Embedding(max_words, 128))
rnnmodel.add(LSTM(128, dropout=0.2, recurrent_dropout=0.2))
rnnmodel.add(Dense(1, activation='sigmoid'))
rnnmodel.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])
print('Training the RNN')


Training the RNN


In [45]:
rnnmodel.fit(x_train, y_train,
          batch_size=32,
          epochs=3,
          validation_data=(x_val, y_val))

Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x7ff60c91dc40>

In [46]:
score, acc = rnnmodel.evaluate(data, labels,
                            batch_size=32)
print('Test accuracy with RNN:', acc)

Test accuracy with RNN: 0.8519349694252014
