In [10]:
import pandas as pd
import numpy as np
import os
import json

In [3]:
def get_GloVe(directory):
    """
    Open Stanford's GloVe file with 100 dimensional embeddings
    
    :param directory: directory of the GloVe
    :type  directory: str
    :return: dictionary where the keys are the words, 
             and values are the 100d representation
    :rtype:  dict
    """

    glove_dir = directory

    # dictionary that maps words into 100d array
    embeddings_index = {}
    file = open(os.path.join(glove_dir, 'glove.6B.100d.txt'))

    for line in file:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs
    file.close()
    print('Found %s word vectors.' % len(embeddings_index))

    return embeddings_index

In [4]:
directory = '/Users/jinli/Projects/glove.6B'
word_vectors = get_GloVe(directory)

Found 400000 word vectors.


In [32]:
word_vectors['hi'].shape

(100,)

In [None]:
"""
My Plan:

1. Combine all the text together into one long file (one long string).
2. Lowercase all the words (one long string)
3. Tokenize the words. (list of words split by spaces.)
4. Split into two lists, one that holds the sentence, another that holds the predicted next word

"""
print('')

In [5]:
sample_text = """Use securing confined his shutters. Delightful as he it acceptance an solicitude discretion reasonably. Carriage we husbands advanced an perceive greatest. Totally dearest expense on demesne ye he. Curiosity excellent commanded in me. Unpleasing impression themselves to at assistance acceptance my or. On consider laughter civility offended oh. 

Kindness to he horrible reserved ye. Effect twenty indeed beyond for not had county. The use him without greatly can private. Increasing it unpleasant no of contrasted no continuing. Nothing colonel my no removed in weather. It dissimilar in up devonshire inhabiting. 

He do subjects prepared bachelor juvenile ye oh. He feelings removing informed he as ignorant we prepared. Evening do forming observe spirits is in. Country hearted be of justice sending. On so they as with room cold ye. Be call four my went mean. Celebrated if remarkably especially an. Going eat set she books found met aware. """

In [7]:
sample_text = sample_text.lower()

In [13]:
sample_text
# with open('thoughts.txt') as f:
#     data = json.load(f)

'use securing confined his shutters. delightful as he it acceptance an solicitude discretion reasonably. carriage we husbands advanced an perceive greatest. totally dearest expense on demesne ye he. curiosity excellent commanded in me. unpleasing impression themselves to at assistance acceptance my or. on consider laughter civility offended oh. \n\nkindness to he horrible reserved ye. effect twenty indeed beyond for not had county. the use him without greatly can private. increasing it unpleasant no of contrasted no continuing. nothing colonel my no removed in weather. it dissimilar in up devonshire inhabiting. \n\nhe do subjects prepared bachelor juvenile ye oh. he feelings removing informed he as ignorant we prepared. evening do forming observe spirits is in. country hearted be of justice sending. on so they as with room cold ye. be call four my went mean. celebrated if remarkably especially an. going eat set she books found met aware. '

In [15]:
from keras.preprocessing.text import Tokenizer

tokenizer = Tokenizer(num_words=50)
tokenizer.fit_on_texts(sample_text)

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [18]:
from nltk.tokenize import word_tokenize
tokens = word_tokenize(sample_text)
print(tokens)

['use', 'securing', 'confined', 'his', 'shutters', '.', 'delightful', 'as', 'he', 'it', 'acceptance', 'an', 'solicitude', 'discretion', 'reasonably', '.', 'carriage', 'we', 'husbands', 'advanced', 'an', 'perceive', 'greatest', '.', 'totally', 'dearest', 'expense', 'on', 'demesne', 'ye', 'he', '.', 'curiosity', 'excellent', 'commanded', 'in', 'me', '.', 'unpleasing', 'impression', 'themselves', 'to', 'at', 'assistance', 'acceptance', 'my', 'or', '.', 'on', 'consider', 'laughter', 'civility', 'offended', 'oh', '.', 'kindness', 'to', 'he', 'horrible', 'reserved', 'ye', '.', 'effect', 'twenty', 'indeed', 'beyond', 'for', 'not', 'had', 'county', '.', 'the', 'use', 'him', 'without', 'greatly', 'can', 'private', '.', 'increasing', 'it', 'unpleasant', 'no', 'of', 'contrasted', 'no', 'continuing', '.', 'nothing', 'colonel', 'my', 'no', 'removed', 'in', 'weather', '.', 'it', 'dissimilar', 'in', 'up', 'devonshire', 'inhabiting', '.', 'he', 'do', 'subjects', 'prepared', 'bachelor', 'juvenile', 'ye

In [51]:
# X: instances, length of sentence, number of dimensions
# y: instances, number of dimensions
# word indices: mapping from word to its number representation

def vectorizing_seq (text, maxlen, step):    
    """
    :param text: list of words
    :type  text: list
    :param maxlen: the length of a sequence to extract as train
    :type  maxlen: int
    :param step: sample a new sequence every n steps
    :type  step: int
    :returns: (Numpy boolean array of shape 
                    (Number of sequences, maxlen, number of distinct character),
               Numpy boolean array of shape 
                    (Number of sequences, number of distinct character),
               dictionary mapping a character to its integer placeholder)
    :rtype:   (numpy.ndarray, 
               numpy.ndarray, 
               dict)     
    """
    
    sentences = [] # hold extracted sequences
    next_word = [] # hold next word for each corresponding sentence

    for i in range(0, len(text) - maxlen, step):
        sentences.append(text[i: i + maxlen])
        next_word.append(text[i + maxlen])

    print('Number of sequences:', len(sentences))

    all_words = sorted(list(set(text)))
#     print('Unique characters:', len(chars))
    word_indices = dict((word, all_words.index(word)) for word in all_words)
#     print('Vectorization...')

#     # one hot encoding the characters into binary arrays
    # 100 for dimensions of Stanford GloVe
#     x = np.zeros((len(sentences), maxlen, 100, dtype=np.bool) 
#     y = np.zeros((len(sentences), 100), dtype=np.bool)
#     for i, sentence in enumerate(sentences):
#         for t, word in enumerate(sentence):
#             x[i, t, char_indices[char]] = 1
#         y[i, char_indices[next_chars[i]]] = 1
    x = np.empty((len(sentences), maxlen, 100), float)
    y = np.empty((len(sentences), 100), float)
    
#     x = np.append(x, np.array())

    for i, sentence in enumerate(sentences):
#         instance = np.empty((maxlen, 100), float)
        instance = []
#         instance = np.empty((maxlen, 100), float)
        for t, word in enumerate(sentence):
            word_dimensions = list (word_vectors[word])
            instance.append(word_dimensions)
        instance = np.array(instance)
        instance = np.reshape(instance, (1,) + instance.shape ) 
#         print(instance.shape)
        x = np.append(x, instance, axis=0)
    
        word_dimensions = list (word_vectors[next_word[i]])
        word_dimensions = np.array(word_dimensions)
        word_dimensions = np.reshape(word_dimensions, (1,) + word_dimensions.shape ) 
        y = np.append(y, word_dimensions, axis=0)
#     return x, y, char_indices
#     return sentences, next_word, all_words, word_indices
    return y
# x, y, all_words, word_indices = vectorizing_seq(tokens, 10, 3)
instance = vectorizing_seq(tokens, 10, 3)
# word_indices
instance#.shape

Number of sequences: 53


array([[ 0.00000000e+000,  1.23075756e-312,  0.00000000e+000, ...,
         2.14321575e-312,  8.70018275e-313,  6.79038653e-313],
       [ 6.79038653e-313,  2.12199579e-313,  6.79038653e-313, ...,
         2.56761491e-312,  2.33419537e-312,  2.44029516e-312],
       [ 2.14321575e-312,  2.44029516e-312,  6.79038653e-313, ...,
         6.79038654e-313,  6.79038653e-313,  6.79038653e-313],
       ...,
       [-7.94350028e-001,  7.32209980e-001,  3.07579994e-001, ...,
        -3.23350012e-001,  6.98979974e-001,  2.64519989e-001],
       [-3.15239988e-002,  4.40230012e-001, -3.40259999e-001, ...,
        -3.48179996e-001,  5.18589973e-001,  4.97249991e-001],
       [-1.81030005e-001, -4.93140012e-001,  2.07959995e-001, ...,
        -6.86049998e-001, -6.32949993e-002,  5.32739982e-002]])

In [9]:
def map_words_to_int(cleaned_posts, max_words, maxlen):
    """
    Create a mapping from words to integer representation

    :param cleaned_posts: a 1-dim array of posts
    :type  cleaned_posts: numpy.ndarray
    :param max_words: maximum amount of unique words in the embedding vector space
    :type  max_words: int
    :param maxlen: maximum number of words considered for each instance. 
                   The rest of the post is cut off.
    :type  maxlen: int
    :returns: (Numpy array of (samples, maxlen) ,  
               dictionary where keys are words and
               values are the integer representation)
    :rtype:   (numpy.ndarray, dict)
    """

    from keras.preprocessing.text import Tokenizer

    tokenizer = Tokenizer(num_words=max_words)
    tokenizer.fit_on_texts(cleaned_posts)

    """
    sequences is a list of lists,
    where each item of the outer list is an list of words
    in integer representation
    """
    sequences = tokenizer.texts_to_sequences(cleaned_posts)

    word_index = tokenizer.word_index
    print('Found %s unique tokens.' % len(word_index))

    from keras.preprocessing.sequence import pad_sequences
    # turns the lists of integers into a 2D integer tensor of shape (samples, maxlen)
    sequences = pad_sequences(sequences, maxlen=maxlen)

    return (sequences, word_index)