In [1]:
import os
import urllib
import urllib.request
import zipfile

import nltk
import numpy as np
import tensorflow as tf

# 1. Simplified method:
 * build the vocabulary from the pre-trained embedding instead of from the corpus
 * don't distinguish between unknown words; they all map to a single UNK token
     * embedding_table[0] = PAD;  embedding_table[-1] = UNK;
 * Uncased embedding
 * limit the vocabulary size
 * tf.embedding_lookup
 * Data structure
     * word2idx :  dictionary for mapping words to their index token - used for converting a sequence of words to sequence of integers for embedding lookup
     * idx2word : a list of words in order - used for decoding an integer sequence to words
     * weights : a matrix of size VOCAB_LENGTH x EMBEDDING_DIMENSION containing the vectors for each word
https://www.damienpontifex.com/2017/10/27/using-pre-trained-glove-embeddings-in-tensorflow/

In [47]:
# Configuration and download step for the pre-trained GloVe 6B vectors.
# Defines EMBEDDING_DIMENSION, data_directory and glove_weights_file_path,
# and makes sure the weights file exists locally (downloading and unzipping
# the archive when it does not).

# Available dimensions for 6B data is 50, 100, 200, 300
EMBEDDING_DIMENSION = 50
data_directory = '../data/glove'

# makedirs (rather than mkdir) also creates a missing parent such as '../data',
# and exist_ok=True makes re-running this cell a no-op.
os.makedirs(data_directory, exist_ok=True)

glove_weights_file_path = os.path.join(
    data_directory, 'glove.6B.{}d.txt'.format(EMBEDDING_DIMENSION))

# If the GloVe weights file is not present, download and extract it.
if not os.path.isfile(glove_weights_file_path):
    # Glove embedding weights can be downloaded from https://nlp.stanford.edu/projects/glove/
    glove_fallback_url = 'http://nlp.stanford.edu/data/glove.6B.zip'
    local_zip_file_path = os.path.join(data_directory, os.path.basename(glove_fallback_url))
    # Skip the download when the archive is already on disk.
    if not os.path.isfile(local_zip_file_path):
        print('Retreiving glove weights from {}'.format(glove_fallback_url))
        urllib.request.urlretrieve(glove_fallback_url, local_zip_file_path)
    with zipfile.ZipFile(local_zip_file_path, 'r') as z:
        print('Extracting glove weights from {}'.format(local_zip_file_path))
        z.extractall(path=data_directory)

In [79]:
# Build the vocabulary structures from the GloVe weights file:
#   word2idx : dict mapping word -> integer index (used to tokenise text)
#   idx2word : list mapping integer index -> word (used to decode indices)
#   weights  : float32 array with one embedding row per index
# Layout: index 0 is 'PAD', indices 1..N are the words read from the file in
# file order, and the last index is 'UNK'.

# 'PAD' and 'UNK' are the only uppercase entries in the vocabulary.
PAD_TOKEN = 0

# Limit the vocabulary to the first 40k terms of the file.
MAX_VOCAB_TERMS = 40000

# dict so we can look up indices when converting text to a sequence of integers
word2idx = {'PAD': PAD_TOKEN}
weights = []
idx2word = []

# Explicit UTF-8: the vocabulary contains non-ASCII tokens, so relying on the
# platform default encoding can fail or mis-decode.
with open(glove_weights_file_path, 'r', encoding='utf-8') as file:
    for line in file:
        # Word and weights are separated by whitespace
        values = line.split()
        if not values:
            # Tolerate blank lines (e.g. a trailing newline) instead of raising IndexError
            continue
        # Word is the first symbol on each line
        word = values[0]
        # Remainder of the line is the weight vector for the word
        word_weights = np.asarray(values[1:], dtype=np.float32)

        weights.append(word_weights)
        idx2word.append(word)
        # Index 0 is reserved for PAD, so the k-th word read gets index k (1-based)
        word2idx[word] = len(idx2word)

        if len(idx2word) == MAX_VOCAB_TERMS:
            break

# Insert the PAD weights (random) at index 0 so rows line up with word2idx
weights.insert(0, np.random.randn(EMBEDDING_DIMENSION))
idx2word.insert(0, 'PAD')

# Append the unknown-word token at the end of the vocab, initialised randomly
UNKNOWN_TOKEN = len(weights)
word2idx['UNK'] = UNKNOWN_TOKEN
weights.append(np.random.randn(EMBEDDING_DIMENSION))
idx2word.append('UNK')

# Construct our final embedding matrix
weights = np.asarray(weights, dtype=np.float32)

VOCAB_SIZE = weights.shape[0]

In [80]:
# Embeddings in TensorFlow
# Convert a sample sentence into the integer indices used for embedding lookup.
tokens = nltk.word_tokenize('hello world')  # ['hello', 'world']

# The embedding vocabulary is uncased, so lowercase each token before the
# lookup; otherwise capitalised input such as 'Hello' would fall through to UNK.
# Words missing from the vocabulary map to UNKNOWN_TOKEN.
features = {
    'word_indices': [word2idx.get(token.lower(), UNKNOWN_TOKEN) for token in tokens],
}
features

{'word_indices': [13076, 86]}

In [82]:
# Look up the embedding vectors for features['word_indices'] in a TF1 graph.
# Start from a clean default graph so re-running the cell does not collide
# with the 'embedding_weights' variable created by a previous run.
tf.reset_default_graph()

# Initialise the (frozen) embedding table from the `weights` matrix.
glove_weights_initializer = tf.constant_initializer(weights)
embedding_weights = tf.get_variable(
    name='embedding_weights',
    shape=(VOCAB_SIZE, EMBEDDING_DIMENSION),
    initializer=glove_weights_initializer,
    trainable=False)
embedding = tf.nn.embedding_lookup(embedding_weights, features['word_indices'])

# tf.initialize_all_variables() is deprecated; global_variables_initializer()
# is its direct replacement.
init_op = tf.global_variables_initializer()
with tf.Session() as sess:
    sess.run(init_op)
    print(sess.run(embedding))

[[-0.38497   0.80092   0.064106 -0.28355  -0.026759 -0.34532  -0.64253
  -0.11729  -0.33257   0.55243  -0.087813  0.9035    0.47102   0.56657
   0.6985   -0.35229  -0.86542   0.90573   0.03576  -0.071705 -0.12327
   0.54923   0.47005   0.35572   1.2611   -0.67581  -0.94983   0.68666
   0.3871   -1.3492    0.63512   0.46416  -0.48814   0.83827  -0.9246
  -0.33722   0.53741  -1.0616   -0.081403 -0.67111   0.30923  -0.3923
  -0.55002  -0.68827   0.58049  -0.11626   0.013139 -0.57654   0.048833
   0.67204 ]
 [-0.41486   0.71848  -0.3045    0.87445   0.22441  -0.56488  -0.37566
  -0.44801   0.61347  -0.11359   0.74556  -0.10598  -1.1882    0.50974
   1.3511    0.069851  0.73314   0.26773  -1.1787   -0.148     0.039853
   0.033107 -0.27406   0.25125   0.41507  -1.6188   -0.81778  -0.73892
  -0.28997   0.57277   3.4719    0.73817  -0.044495 -0.15119  -0.93503
  -0.13152  -0.28562   0.76327  -0.83332  -0.6793   -0.39099  -0.64466
   1.0044   -0.2051    0.46799   0.99314  -0.16221  -0.46022  -0

In [87]:
# Sanity check: decode the two indices shown in the tokenisation output
# back to words via idx2word.
print(*(idx2word[token_id] for token_id in (13076, 86)))

hello world


# 2nd Method

In [None]:
https://dashayushman.github.io/tutorials/2017/08/19/neural-language-model.html
https://medium.com/@TalPerry/getting-text-into-tensorflow-with-the-dataset-api-ffb832c8bec6
https://github.com/guillaume-chevalier/GloVe-as-a-TensorFlow-Embedding-Layer
https://machinelearnings.co/tensorflow-text-classification-615198df9231