In [1]:
import pandas as pd
import numpy as np
import gensim
import warnings
import pickle
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [2]:
# Load the hotel reviews and tokenise every entry of the "Review" column
# with gensim's simple_preprocess (one token list per review).
dataset = pd.read_csv("Hotel_dataset.csv")
reviews = list(dataset["Review"])
tokenized_words = list(map(gensim.utils.simple_preprocess, reviews))

In [3]:
# Fit a Keras Tokenizer on the pre-tokenised reviews. This populates
# tokenizer.word_index, which the later cells use to pick embedding rows.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(tokenized_words)

In [4]:
# Inspect the word -> integer index mapping built by the tokenizer
# (indices start at 1, as the output below shows).
tokenizer.word_index

{'hotel': 1,
 'room': 2,
 'not': 3,
 'great': 4,
 'good': 5,
 'staff': 6,
 'stay': 7,
 'did': 8,
 'just': 9,
 'nice': 10,
 'rooms': 11,
 'no': 12,
 'location': 13,
 'stayed': 14,
 'service': 15,
 'night': 16,
 'time': 17,
 'beach': 18,
 'day': 19,
 'breakfast': 20,
 'clean': 21,
 'food': 22,
 'like': 23,
 'resort': 24,
 'place': 25,
 'really': 26,
 'the': 27,
 'pool': 28,
 'friendly': 29,
 'people': 30,
 'small': 31,
 'little': 32,
 'walk': 33,
 'got': 34,
 'excellent': 35,
 'area': 36,
 'best': 37,
 'helpful': 38,
 'bar': 39,
 'restaurant': 40,
 'restaurants': 41,
 'bathroom': 42,
 'water': 43,
 'trip': 44,
 'bed': 45,
 'recommend': 46,
 'beautiful': 47,
 'view': 48,
 'floor': 49,
 'went': 50,
 'comfortable': 51,
 'desk': 52,
 'nights': 53,
 'check': 54,
 'right': 55,
 'want': 56,
 'way': 57,
 'free': 58,
 'hotels': 59,
 'better': 60,
 'city': 61,
 'away': 62,
 'wonderful': 63,
 'make': 64,
 'booked': 65,
 'price': 66,
 'bit': 67,
 'reviews': 68,
 'large': 69,
 'street': 70,
 'minutes

In [5]:
def vocab_embedding(filepath, word_index, embedding_dim):
    """Build an embedding matrix for a vocabulary from a text embedding file.

    Every non-blank line of the file is expected to contain a word followed
    by its whitespace-separated vector components (the layout used by the
    GloVe ``.txt`` files). Blank lines are skipped.

    Args:
        filepath: Path to the UTF-8 encoded embedding file.
        word_index: Mapping from word to its integer row index, e.g. the
            ``word_index`` of a fitted Keras ``Tokenizer``.
        embedding_dim: Number of leading vector components kept per word.

    Returns:
        A ``numpy.ndarray`` of shape ``(len(word_index) + 1, embedding_dim)``.
        Row ``i`` holds the vector of the word whose index is ``i``; rows
        whose word never appears in the file stay all zeros.

    Raises:
        ValueError: If the components of a matched word cannot be parsed as
            floats or cannot be assigned to a row of width ``embedding_dim``.
    """
    # One extra row so that the largest index in word_index is addressable
    # when the indices start at 1.
    vocab_size = len(word_index) + 1
    embedding_matrix_vocab = np.zeros((vocab_size, embedding_dim))

    with open(filepath, encoding="utf8") as f:
        for line in f:
            parts = line.split()
            if not parts:
                # Blank line (e.g. a trailing empty line): there is no word
                # to unpack, so skip it instead of raising.
                continue
            word, vector = parts[0], parts[1:]
            if word in word_index:
                idx = word_index[word]
                # Slice before converting so only the components that are
                # actually stored get parsed.
                embedding_matrix_vocab[idx] = np.array(
                    vector[:embedding_dim], dtype=np.float32
                )

    return embedding_matrix_vocab

In [6]:
# Number of vector components kept per word when reading the embedding file.
embedding_dim = 50

# One row per tokenizer index, filled from the embedding file.
embedding_matrix_vocab = vocab_embedding(
    filepath='glove.6B.50d.txt',
    word_index=tokenizer.word_index,
    embedding_dim=embedding_dim,
)

In [7]:
# For every review, replace each token with its row of the embedding matrix.
# The result is a list (one entry per review) of lists of row vectors; tokens
# whose row was never filled from the embedding file map to all-zero rows.
index_of = tokenizer.word_index
vectors = [
    [embedding_matrix_vocab[index_of[token]] for token in review_tokens]
    for review_tokens in tokenized_words
]

In [8]:
# saving the vectors to disk
# The context manager closes the file even if pickle.dump raises.
with open("glove_vectors.pickle", "wb") as pickle_out:
    pickle.dump(vectors, pickle_out)