In [1]:
import pandas as pd
import numpy as np
import gensim
import warnings
import pickle
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [2]:
# Load the raw reviews and split each one into a list of tokens with gensim.
dataset = pd.read_csv("IMDB_dataset_untouched.csv")
reviews = list(dataset["review"])
tokenized_words = list(map(gensim.utils.simple_preprocess, reviews))

In [3]:
# Fit a Keras Tokenizer on the tokenised reviews; afterwards
# tokenizer.word_index maps each word to an integer index.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(tokenized_words)

In [4]:
# Inspect the fitted word -> index mapping.
# NOTE(review): this displays the whole dictionary; consider showing only a
# small slice to keep the notebook output short.
tokenizer.word_index

{'the': 1,
 'and': 2,
 'of': 3,
 'to': 4,
 'is': 5,
 'br': 6,
 'it': 7,
 'in': 8,
 'this': 9,
 'that': 10,
 'was': 11,
 'as': 12,
 'movie': 13,
 'for': 14,
 'with': 15,
 'but': 16,
 'film': 17,
 'you': 18,
 'on': 19,
 'not': 20,
 'he': 21,
 'are': 22,
 'his': 23,
 'have': 24,
 'one': 25,
 'be': 26,
 'all': 27,
 'at': 28,
 'they': 29,
 'by': 30,
 'an': 31,
 'who': 32,
 'so': 33,
 'from': 34,
 'like': 35,
 'there': 36,
 'or': 37,
 'just': 38,
 'her': 39,
 'out': 40,
 'about': 41,
 'if': 42,
 'has': 43,
 'what': 44,
 'some': 45,
 'good': 46,
 'can': 47,
 'when': 48,
 'more': 49,
 'very': 50,
 'she': 51,
 'up': 52,
 'no': 53,
 'time': 54,
 'my': 55,
 'even': 56,
 'would': 57,
 'which': 58,
 'only': 59,
 'story': 60,
 'really': 61,
 'see': 62,
 'their': 63,
 'had': 64,
 'me': 65,
 'well': 66,
 'we': 67,
 'were': 68,
 'than': 69,
 'much': 70,
 'bad': 71,
 'get': 72,
 'been': 73,
 'other': 74,
 'do': 75,
 'people': 76,
 'great': 77,
 'will': 78,
 'also': 79,
 'into': 80,
 'because': 81,
 'how

In [5]:
def vocab_embedding(filepath, word_index, embedding_dim):
    """Build an embedding matrix for a vocabulary from a GloVe-style text file.

    Every non-blank line of the file is expected to hold a word followed by
    the whitespace-separated components of its vector.

    Parameters
    ----------
    filepath : str or path-like
        UTF-8 text file with one ``word v1 v2 ... vN`` entry per line.
    word_index : dict
        Maps each vocabulary word to the row it occupies in the result.
        Every index must be smaller than ``len(word_index) + 1``.
    embedding_dim : int
        Number of leading vector components to keep for each word.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(len(word_index) + 1, embedding_dim)``. Rows whose
        word never appears in the file are left as zeros.

    Raises
    ------
    ValueError
        If a vocabulary word's entry in the file has fewer than
        ``embedding_dim`` components, or a component is not numeric.
    """
    vocab_size = len(word_index) + 1
    embedding_matrix_vocab = np.zeros((vocab_size, embedding_dim))

    with open(filepath, encoding="utf8") as f:
        for line_number, line in enumerate(f, start=1):
            parts = line.split()
            if not parts:
                # Blank line (e.g. a stray newline at the end of the file):
                # nothing to unpack, so skip it instead of crashing.
                continue

            word, vector = parts[0], parts[1:]
            if word not in word_index:
                continue

            if len(vector) < embedding_dim:
                # Fail loudly: assigning a shorter vector would either raise
                # an opaque broadcasting error or, for a single value,
                # silently fill the whole row with that value.
                raise ValueError(
                    f"{filepath}, line {line_number}: vector for {word!r} has "
                    f"{len(vector)} components, expected at least {embedding_dim}"
                )

            idx = word_index[word]
            # Longer vectors are truncated to the requested dimensionality.
            embedding_matrix_vocab[idx] = np.array(vector, dtype=np.float32)[:embedding_dim]

    return embedding_matrix_vocab

In [6]:
# Number of leading components kept from each pre-trained vector
# (vocab_embedding truncates longer vectors to this length).
# NOTE(review): the file name suggests 50-dimensional GloVe vectors — confirm.
embedding_dim = 50
# One row per entry of tokenizer.word_index, plus an extra row 0.
embedding_matrix_vocab = vocab_embedding('glove.6B.50d.txt', tokenizer.word_index, embedding_dim)

In [7]:
# For every review, collect the embedding-matrix row of each of its tokens,
# giving one list of vectors per review.
lookup = tokenizer.word_index
vectors = []
for review_tokens in tokenized_words:
    vectors.append(
        [embedding_matrix_vocab[lookup[token]] for token in review_tokens]
    )

In [8]:
# Save the per-review vectors to disk.
# The context manager closes the file even if pickle.dump raises, so the
# handle is never leaked and no partially written file is left open.
with open("glove_vectors_untouched.pickle", "wb") as pickle_out:
    pickle.dump(vectors, pickle_out)