In [19]:
import numpy as np

In [5]:
import nltk
from nltk.tokenize import word_tokenize

# Make sure the Punkt tokenizer resources are present locally; nltk.download
# reports "already up-to-date" when there is nothing to fetch.
for resource in ('punkt', 'punkt_tab'):
    nltk.download(resource)

# Example sentence, lower-cased before it is tokenized
sentence = 'Did someone say, "Oriental for $60"?  It is a great product for the'.lower()

# Split the sentence into word and punctuation tokens
tokens = word_tokenize(sentence)

print(tokens)


['did', 'someone', 'say', ',', '``', 'oriental', 'for', '$', '60', "''", '?', 'it', 'is', 'a', 'great', 'product', 'for', 'the']


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\panwa\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\panwa\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


In [20]:
def read_glove_vecs(glove_file):
    """Load a GloVe-style text embedding file.

    Each non-blank line is split on whitespace: the first token is the word,
    the remaining tokens are parsed as its float64 vector components.

    Parameters
    ----------
    glove_file : str or path-like
        Path to the UTF-8 encoded embedding text file.

    Returns
    -------
    words_to_index : dict
        Maps each word to an integer index. Indices are assigned in sorted
        word order and start at 1, so index 0 is never assigned.
    index_to_words : dict
        Inverse mapping, from index back to word.
    word_to_vec_map : dict
        Maps each word to a 1-D ``np.float64`` array. If a word appears on
        more than one line, the last occurrence wins.

    Raises
    ------
    OSError
        If the file cannot be opened.
    ValueError
        If a vector component cannot be parsed as a float.
    """
    word_to_vec_map = {}
    with open(glove_file, 'r', encoding='utf-8') as f:
        for raw_line in f:
            fields = raw_line.strip().split()
            # Blank or whitespace-only lines (e.g. a trailing empty line)
            # have no word token; skip them instead of failing on fields[0].
            if not fields:
                continue
            word_to_vec_map[fields[0]] = np.array(fields[1:], dtype=np.float64)

    # The file is closed by now; index building only needs the collected words.
    words_to_index = {}
    index_to_words = {}
    for i, w in enumerate(sorted(word_to_vec_map), start=1):
        words_to_index[w] = i
        index_to_words[i] = w
    return words_to_index, index_to_words, word_to_vec_map

In [21]:
# Parse the GloVe text file into the word<->index lookups and the word->vector map
word_to_index, index_to_word, word_to_vec_map = read_glove_vecs(
    'Embeddings/glove.6B.50d.txt'
)

In [39]:
from joblib import dump

# Persist the word -> index lookup for reuse outside this notebook
dump(value=word_to_index, filename="Embeddings/word_to_index.joblib")

['word_to_index.joblib']

In [41]:
# Persist the index -> word lookup and the word -> vector map
dump(value=index_to_word, filename="Embeddings/index_to_word.joblib")
dump(value=word_to_vec_map, filename="Embeddings/word_to_vec.joblib")

['word_to_vec.joblib']

In [42]:
# One row more than the vocabulary size, to fit the Keras embedding requirement
vocab_size = 1 + len(word_to_index)

# Read the embedding dimensionality (= 50) off an arbitrary word vector
any_word = next(iter(word_to_vec_map))
emb_dim = len(word_to_vec_map[any_word])

# Start from an all-zero (vocab_size, emb_dim) matrix; any row that is not
# assigned in the loop below stays zero.
emb_matrix = np.zeros(shape=(vocab_size, emb_dim))

# Copy each word's vector into the row given by that word's index
for word, idx in word_to_index.items():
    emb_matrix[idx] = word_to_vec_map[word]

In [43]:
# Sanity check: (vocab_size, emb_dim)
np.shape(emb_matrix)

(400001, 50)

In [46]:
# Inspect row 0 of the embedding matrix
emb_matrix[0, :]

array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.])

In [47]:
# Persist the assembled embedding matrix
dump(value=emb_matrix, filename="Embeddings/Embedding Matrix.joblib")

['Embedding Matrix.joblib']