In [None]:
"""
My Plan:

1. Combine all the text together into one long file (one long string).
2. Lowercase all the words (one long string)
3. Tokenize the text (list of word tokens produced by NLTK's word_tokenize).
4. Split into two lists, one that holds the sentence (input), 
   another that holds the predicted next word (label).
5. Convert the training sentences into vector representations.
6. One hot encode the labels.

"""

# Imports

In [7]:
import pandas as pd
import numpy as np
import os
import json
import random
import sys

from keras import layers
from keras.models import Sequential
from keras import optimizers
from nltk.tokenize import word_tokenize

# Stanford's GloVe (100 dimensions)

In [2]:
def get_GloVe(directory):
    """
    Load Stanford's 100-dimensional GloVe embeddings from disk.

    Reads the file ``glove.6B.100d.txt`` inside ``directory``. Every
    non-empty line is split on whitespace: the first field is the word,
    the remaining fields are its coefficients.

    :param directory: directory that contains ``glove.6B.100d.txt``
    :type  directory: str
    :return: dictionary where the keys are the words,
             and values are their float32 vector representation
    :rtype:  dict
    :raises OSError: if the embeddings file cannot be opened
    """

    glove_path = os.path.join(directory, 'glove.6B.100d.txt')

    # dictionary that maps words into 100d array
    embeddings_index = {}

    # The GloVe release is UTF-8 encoded; being explicit avoids decode errors
    # on platforms whose default encoding differs. The context manager closes
    # the file even when a line fails to parse.
    with open(glove_path, encoding='utf-8') as file:
        for line in file:
            values = line.split()
            if not values:
                # tolerate blank lines (e.g. a trailing newline at end of file)
                continue
            word = values[0]
            coefs = np.asarray(values[1:], dtype='float32')
            embeddings_index[word] = coefs

    print('Found %s word vectors.' % len(embeddings_index))

    return embeddings_index

# Vectorize Text

In [11]:
def vectorizing_seq(text, maxlen, step, embeddings=None, embedding_dim=100):
    """
    Convert a token list into embedded input windows and one-hot labels.

    A window of ``maxlen`` consecutive tokens is taken every ``step``
    tokens; the token that follows each window is its label. Tokens that
    have no entry in ``embeddings`` are represented by an all-zero vector.

    :param text: list of words
    :type  text: list
    :param maxlen: the length of a sequence to extract as train
    :type  maxlen: int
    :param step: sample a new sequence every n steps
    :type  step: int
    :param embeddings: mapping from word to its vector; when omitted the
                       notebook-level ``word_vectors`` global is used
    :type  embeddings: dict or None
    :param embedding_dim: length of every embedding vector
    :type  embedding_dim: int
    :returns: (float array of shape
                    (number of sequences, maxlen, embedding_dim),
               one-hot float array of shape
                    (number of sequences, number of distinct next words),
               dictionary mapping a label column index to its word)
    :rtype:   (numpy.ndarray,
               numpy.ndarray,
               dict)
    :raises ValueError: if ``text`` is too short to yield any sequence
    """

    if embeddings is None:
        # Backward-compatible fallback: earlier callers relied on the
        # notebook-level ``word_vectors`` produced by get_GloVe().
        embeddings = word_vectors

    sentences = [] # hold extracted sequences
    next_word = [] # hold next word for each corresponding sentence

    for i in range(0, len(text) - maxlen, step):
        sentences.append(text[i: i + maxlen])
        next_word.append(text[i + maxlen])

    print('Number of sequences:', len(sentences))

    if not sentences:
        raise ValueError(
            'text must contain more than maxlen (%d) words, got %d'
            % (maxlen, len(text))
        )

    # Look every distinct word up once; unknown words become zero vectors.
    word_indices = {}
    for word in set(text):
        try:
            word_indices[word] = embeddings[word]
        except KeyError:
            word_indices[word] = np.zeros(embedding_dim)

    # Preallocate instead of growing with np.append, which copies the whole
    # array on every iteration. Every window holds exactly maxlen words, so
    # each slot is written.
    x = np.empty((len(sentences), maxlen, embedding_dim), dtype=float)
    for i, sentence in enumerate(sentences):
        for t, word in enumerate(sentence):
            x[i, t] = word_indices[word]

    # One-hot encode the labels. np.unique returns the distinct words in
    # sorted order, which is the column order a one-hot encoder fitted on
    # the same labels would use.
    needed_words, label_ids = np.unique(np.array(next_word), return_inverse=True)
    y = np.zeros((len(next_word), len(needed_words)), dtype=float)
    y[np.arange(len(next_word)), label_ids] = 1.0

    word_indices2 = dict(enumerate(needed_words))
    return x, y, word_indices2

# Create the Model

In [None]:
def create_model(x, y, maxlen, epochs, dimensions):
    """
    Build a two-layer GRU classifier with a softmax output and train it.

    :param x: input sequences of word vectors, shape
                    (number of sequences, maxlen, embedding size)
    :type  x: numpy.ndarray
    :param y: one-hot encoded labels, shape
                    (number of sequences, dimensions)
    :type  y: numpy.ndarray
    :param maxlen: the length of a sequence to extract as train
    :type  maxlen: int
    :param epochs: number of training iterations
    :type  epochs: int
    :param dimensions: number of units in the softmax output layer; it has
                       to equal the number of label columns in ``y``
    :type  dimensions: int
    :returns: trained keras model
    :rtype:   keras Sequential model
    """

    # Read the per-word feature size from the data rather than assuming 100.
    embedding_dim = np.shape(x)[-1]

    model = Sequential()
    model.add(layers.GRU(
        32,
        return_sequences=True,
        input_shape=(maxlen, embedding_dim))
    )
    # Only the first layer needs an input shape; this layer consumes the
    # sequence emitted by the GRU above.
    model.add(layers.GRU(64))
    model.add(layers.Dense(
        dimensions,
        activation='softmax')
    )

    # summary() prints the table itself and returns None, so it is not
    # wrapped in print().
    model.summary()

    try:
        optimizer = optimizers.RMSprop(learning_rate=0.01)
    except TypeError:
        # Older Keras releases only accept the argument under the name ``lr``.
        optimizer = optimizers.RMSprop(lr=0.01)

    model.compile(loss='categorical_crossentropy', optimizer=optimizer)
    model.fit(x, y, batch_size=128, epochs=epochs)

    return model

In [3]:
# Location of the unpacked GloVe files. Set the GLOVE_DIR environment variable
# to point somewhere else; the original local path remains the default.
directory = os.environ.get('GLOVE_DIR', '/Users/jinli/Projects/glove.6B')
word_vectors = get_GloVe(directory)

Found 400000 word vectors.


In [13]:
# Read the whole corpus; the context manager closes the file handle.
with open('all.txt', 'r') as corpus_file:
    text = corpus_file.read()
text = text.lower()

tokens = word_tokenize(text)
# small amount for now
# (the slice must read from `tokens`; `token` is not defined in this notebook)
tokens = tokens[:4000]

In [14]:
# Turn the tokens into training data: windows of `maxlen` words,
# with a new window starting every 3 tokens.
maxlen = 10
x, y, word_indices = vectorizing_seq(tokens, maxlen=maxlen, step=3)

Number of sequences: 1330
