In [2]:
import pandas as pd
import numpy as np
import gensim
import warnings
import pickle
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [3]:
# Load the preprocessed IMDB reviews and tokenize each review with gensim.
dataset = pd.read_csv("IMDB_dataset_preprocessed.csv")
reviews = dataset["review"].tolist()
tokenized_words = list(map(gensim.utils.simple_preprocess, reviews))

In [4]:
# Fit the Keras tokenizer on the already-tokenized reviews (one token list per
# review) so that every token receives an entry in `tokenizer.word_index`.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(tokenized_words)

In [5]:
# Inspect the fitted token -> integer index mapping.
tokenizer.word_index

{'movie': 1,
 'film': 2,
 'one': 3,
 'like': 4,
 'time': 5,
 'good': 6,
 'character': 7,
 'story': 8,
 'even': 9,
 'get': 10,
 'would': 11,
 'make': 12,
 'see': 13,
 'really': 14,
 'well': 15,
 'scene': 16,
 'much': 17,
 'bad': 18,
 'people': 19,
 'great': 20,
 'also': 21,
 'first': 22,
 'show': 23,
 'way': 24,
 'thing': 25,
 'made': 26,
 'life': 27,
 'could': 28,
 'think': 29,
 'go': 30,
 'know': 31,
 'watch': 32,
 'love': 33,
 'plot': 34,
 'actor': 35,
 'two': 36,
 'many': 37,
 'seen': 38,
 'year': 39,
 'say': 40,
 'end': 41,
 'never': 42,
 'acting': 43,
 'look': 44,
 'best': 45,
 'little': 46,
 'ever': 47,
 'man': 48,
 'better': 49,
 'take': 50,
 'come': 51,
 'work': 52,
 'still': 53,
 'part': 54,
 'something': 55,
 'director': 56,
 'find': 57,
 'want': 58,
 'back': 59,
 'give': 60,
 'lot': 61,
 'real': 62,
 'guy': 63,
 'watching': 64,
 'performance': 65,
 'woman': 66,
 'play': 67,
 'old': 68,
 'funny': 69,
 'though': 70,
 'another': 71,
 'actually': 72,
 'role': 73,
 'nothing': 74,

In [6]:
def vocab_embedding(filepath, word_index, embedding_dim):
    """Build an embedding matrix for a vocabulary from a text embedding file.

    Each non-blank line of the file is expected to hold a token followed by
    whitespace-separated numeric components (the layout of GloVe ``.txt``
    files). Only lines whose token appears in ``word_index`` are parsed.

    Args:
        filepath: Path to the UTF-8 encoded embedding file.
        word_index: Mapping from token to integer row index. Indices are
            assumed to lie in ``1..len(word_index)``.
        embedding_dim: Number of leading vector components kept per token.

    Returns:
        A ``numpy.ndarray`` of shape ``(len(word_index) + 1, embedding_dim)``.
        Rows of tokens that never occur in the file stay all zeros.

    Raises:
        ValueError: If a matching token's line carries fewer than
            ``embedding_dim`` components, or a component is not numeric.
    """
    vocab_size = len(word_index) + 1
    embedding_matrix_vocab = np.zeros((vocab_size, embedding_dim))

    with open(filepath, encoding="utf8") as f:
        for line_number, line in enumerate(f, start=1):
            parts = line.split()
            if not parts:
                # Blank / whitespace-only line (e.g. a trailing newline):
                # nothing to unpack, so skip instead of crashing.
                continue

            word, vector = parts[0], parts[1:]
            if word not in word_index:
                continue

            if len(vector) < embedding_dim:
                # Without this check a single-component line would be silently
                # broadcast across the whole row by NumPy.
                raise ValueError(
                    "%s, line %d: token %r has %d components, expected at least %d"
                    % (filepath, line_number, word, len(vector), embedding_dim)
                )

            idx = word_index[word]
            embedding_matrix_vocab[idx] = np.array(vector, dtype=np.float32)[:embedding_dim]

    return embedding_matrix_vocab

In [7]:
# Keep 50 components per token; the file name suggests 50-dimensional vectors.
embedding_dim = 50
embedding_matrix_vocab = vocab_embedding(
    filepath='glove.6B.50d.txt',
    word_index=tokenizer.word_index,
    embedding_dim=embedding_dim,
)

In [8]:
# Replace every token of every review with its row of the embedding matrix,
# producing one list of vectors per review.
token_to_row = tokenizer.word_index
vectors = [
    [embedding_matrix_vocab[token_to_row[token]] for token in review_tokens]
    for review_tokens in tokenized_words
]

In [9]:
# Save the per-review embedding vectors to disk.
# The context manager closes the file even if pickle.dump raises part-way.
with open("glove_vectors_preprocessed.pickle", "wb") as pickle_out:
    pickle.dump(vectors, pickle_out)