In [63]:
import json
import os
import random
import sys

import numpy as np
import pandas as pd

from keras import layers
from keras import optimizers
from keras.models import Sequential

In [3]:
def get_GloVe(directory):
    """
    Open Stanford's GloVe file with 100 dimensional embeddings

    :param directory: directory that contains the file ``glove.6B.100d.txt``
    :type  directory: str
    :return: dictionary where the keys are the words,
             and values are the 100d representation (float32 numpy arrays)
    :rtype:  dict
    :raises OSError: if the GloVe file cannot be opened
    """

    glove_path = os.path.join(directory, 'glove.6B.100d.txt')

    # dictionary that maps words into 100d array
    embeddings_index = {}

    # `with` guarantees the handle is closed even if a line fails to parse.
    # The encoding is stated explicitly so the result does not depend on the
    # platform's default text encoding.
    with open(glove_path, encoding='utf-8') as glove_file:
        for line in glove_file:
            values = line.split()
            if not values:
                # tolerate blank lines (e.g. a trailing newline at end of file)
                continue
            # first field is the word, the remaining fields are its coefficients
            embeddings_index[values[0]] = np.asarray(values[1:], dtype='float32')

    print('Found %s word vectors.' % len(embeddings_index))

    return embeddings_index

In [4]:
# Folder holding the unpacked GloVe download. Set the GLOVE_DIR environment
# variable to point somewhere else; the original author's local path is kept
# as the fallback so existing runs behave exactly as before.
directory = os.environ.get('GLOVE_DIR', '/Users/jinli/Projects/glove.6B')
word_vectors = get_GloVe(directory)

Found 400000 word vectors.


In [32]:
# Sanity check: look up a single word and display the shape of its embedding vector.
word_vectors['hi'].shape

(100,)

In [None]:
# Planning notes kept as a bare string literal; the string is evaluated and discarded.
"""
My Plan:

1. Combine all the text together into one long file (one long string).
2. Lowercase all the words (one long string)
3. Tokenize the words. (list of words split by spaces.)
4. Split into two lists, one that holds the sentence, another that holds the predicted next word

"""
# NOTE(review): presumably here so the cell prints an empty line rather than
# echoing the string above as its output - confirm, or move the notes to markdown.
print('')

In [5]:
# Small block of filler English used as the text for the cells below.
# NOTE(review): presumably a stand-in for the real corpus while prototyping - confirm.
sample_text = """Use securing confined his shutters. Delightful as he it acceptance an solicitude discretion reasonably. Carriage we husbands advanced an perceive greatest. Totally dearest expense on demesne ye he. Curiosity excellent commanded in me. Unpleasing impression themselves to at assistance acceptance my or. On consider laughter civility offended oh. 

Kindness to he horrible reserved ye. Effect twenty indeed beyond for not had county. The use him without greatly can private. Increasing it unpleasant no of contrasted no continuing. Nothing colonel my no removed in weather. It dissimilar in up devonshire inhabiting. 

He do subjects prepared bachelor juvenile ye oh. He feelings removing informed he as ignorant we prepared. Evening do forming observe spirits is in. Country hearted be of justice sending. On so they as with room cold ye. Be call four my went mean. Celebrated if remarkably especially an. Going eat set she books found met aware. """

In [7]:
# Lowercase the whole text in one pass. This rebinds sample_text, so the
# original casing is only recoverable by re-running the cell that defines it.
sample_text = sample_text.lower()

In [13]:
# Display the current (lowercased) value of sample_text as the cell output.
sample_text
# Disabled: loader for a JSON file named thoughts.txt; nothing below reads `data`.
# with open('thoughts.txt') as f:
#     data = json.load(f)

'use securing confined his shutters. delightful as he it acceptance an solicitude discretion reasonably. carriage we husbands advanced an perceive greatest. totally dearest expense on demesne ye he. curiosity excellent commanded in me. unpleasing impression themselves to at assistance acceptance my or. on consider laughter civility offended oh. \n\nkindness to he horrible reserved ye. effect twenty indeed beyond for not had county. the use him without greatly can private. increasing it unpleasant no of contrasted no continuing. nothing colonel my no removed in weather. it dissimilar in up devonshire inhabiting. \n\nhe do subjects prepared bachelor juvenile ye oh. he feelings removing informed he as ignorant we prepared. evening do forming observe spirits is in. country hearted be of justice sending. on so they as with room cold ye. be call four my went mean. celebrated if remarkably especially an. going eat set she books found met aware. '

In [15]:
from keras.preprocessing.text import Tokenizer

# num_words caps how many of the most frequent words the tokenizer keeps
# when it later converts texts to sequences.
tokenizer = Tokenizer(num_words=50)
# fit_on_texts expects a list of documents. A bare string is iterated one
# character at a time, which silently builds a vocabulary of single letters,
# so the text is wrapped in a one-element list.
tokenizer.fit_on_texts([sample_text])

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [18]:
from nltk.tokenize import word_tokenize
# Split the lowercased text into a flat list of tokens; as the printed output
# shows, punctuation such as '.' comes back as its own token.
tokens = word_tokenize(sample_text)
print(tokens)

['use', 'securing', 'confined', 'his', 'shutters', '.', 'delightful', 'as', 'he', 'it', 'acceptance', 'an', 'solicitude', 'discretion', 'reasonably', '.', 'carriage', 'we', 'husbands', 'advanced', 'an', 'perceive', 'greatest', '.', 'totally', 'dearest', 'expense', 'on', 'demesne', 'ye', 'he', '.', 'curiosity', 'excellent', 'commanded', 'in', 'me', '.', 'unpleasing', 'impression', 'themselves', 'to', 'at', 'assistance', 'acceptance', 'my', 'or', '.', 'on', 'consider', 'laughter', 'civility', 'offended', 'oh', '.', 'kindness', 'to', 'he', 'horrible', 'reserved', 'ye', '.', 'effect', 'twenty', 'indeed', 'beyond', 'for', 'not', 'had', 'county', '.', 'the', 'use', 'him', 'without', 'greatly', 'can', 'private', '.', 'increasing', 'it', 'unpleasant', 'no', 'of', 'contrasted', 'no', 'continuing', '.', 'nothing', 'colonel', 'my', 'no', 'removed', 'in', 'weather', '.', 'it', 'dissimilar', 'in', 'up', 'devonshire', 'inhabiting', '.', 'he', 'do', 'subjects', 'prepared', 'bachelor', 'juvenile', 'ye

In [61]:
# X: instances, length of sentence, number of dimensions
# y: instances, number of dimensions
# word indices: mapping from word to its number representation

def vectorizing_seq(text, maxlen, step, embeddings=None):
    """
    Turn a list of words into (input window, next word) training pairs,
    with every word replaced by its embedding vector.

    :param text: list of words
    :type  text: list
    :param maxlen: the length of a sequence to extract as train
    :type  maxlen: int
    :param step: sample a new sequence every n steps
    :type  step: int
    :param embeddings: mapping from word to embedding vector; defaults to the
                       module-level ``word_vectors`` loaded from GloVe
    :type  embeddings: dict or None
    :returns: (float array of shape
                    (number of sequences, maxlen, embedding dimensions),
               float array of shape
                    (number of sequences, embedding dimensions),
               dictionary mapping each distinct word in ``text`` to its
               embedding as a plain list)
    :rtype:   (numpy.ndarray,
               numpy.ndarray,
               dict)
    :raises KeyError: if a word of ``text`` has no embedding
    """

    if embeddings is None:
        embeddings = word_vectors

    # window i covers text[i : i + maxlen]; its target is the word right after it
    starts = range(0, len(text) - maxlen, step)
    sentences = [text[i: i + maxlen] for i in starts]
    next_word = [text[i + maxlen] for i in starts]

    print('Number of sequences:', len(sentences))

    all_words = sorted(set(text))
    missing = [word for word in all_words if word not in embeddings]
    if missing:
        raise KeyError('No embedding found for: %s'
                       % ', '.join(str(word) for word in missing[:10]))
    word_indices = {word: list(embeddings[word]) for word in all_words}

    # Take the embedding size from the data rather than hard-coding it; fall
    # back to 100 (the GloVe file used in this notebook) when text is empty.
    n_dims = len(word_indices[all_words[0]]) if all_words else 100

    # Allocate the final shape once and fill rows in place. Starting from
    # np.empty(...) and calling np.append(...) for every sequence returned
    # arrays with twice the intended number of rows, the first half holding
    # uninitialised memory, and copied the whole array on each iteration.
    x = np.zeros((len(sentences), maxlen, n_dims), dtype=float)
    y = np.zeros((len(sentences), n_dims), dtype=float)

    for i, sentence in enumerate(sentences):
        x[i] = [word_indices[word] for word in sentence]
        y[i] = word_indices[next_word[i]]

    return x, y, word_indices

# Window length in tokens; reused below when the model's input shape is declared.
maxlen=10
# Build the training arrays from the token list, starting a new window every 3 tokens.
x, y, word_indices = vectorizing_seq(tokens, maxlen, 3)
# instance = vectorizing_seq(tokens, 10, 3)
# word_indices
# instance#.shape

Number of sequences: 53


In [58]:
# word_indices.keys()

In [67]:
def create_model(x, y, maxlen, epochs, dimensions):
    """
    Creates and trains a model.

    The network is two stacked GRU layers (32 units returning the full
    sequence, then 64 units) followed by a Dense layer with ``dimensions``
    outputs.

    :param x: training inputs; the first layer is declared with
              input_shape (maxlen, dimensions)
    :type  x: numpy.ndarray
    :param y: training targets; the output layer has ``dimensions`` units
    :type  y: numpy.ndarray
    :param maxlen: the length of a sequence to extract as train
    :type  maxlen: int
    :param epochs: number of training iterations
    :type  epochs: int
    :param dimensions: size of each input vector and of the output layer
    :type  dimensions: int
    :returns: trained keras model
    :rtype:   keras.engine.sequential.Sequential
    """

    model = Sequential()
    model.add(layers.GRU(
        32,
        return_sequences=True,
        input_shape=(maxlen, dimensions))
    )
    # Only the first layer of a Sequential model needs input_shape; later
    # layers take their input shape from the layer before them.
    model.add(layers.GRU(64))
    # NOTE(review): softmax + categorical_crossentropy assume each target row
    # is a probability distribution. If y holds raw embedding vectors (as the
    # caller in this notebook appears to pass), a linear output with a
    # regression loss may be what is intended - confirm before relying on it.
    model.add(layers.Dense(
        dimensions,
        activation='softmax')
    )

    # summary() prints the table itself and returns None, so wrapping it in
    # print() only added a stray "None" line to the output.
    model.summary()

    optimizer = optimizers.RMSprop(lr=0.01)
    model.compile(loss='categorical_crossentropy', optimizer=optimizer)
    model.fit(x, y, batch_size=128, epochs=epochs)

    return model

In [68]:
# Train for 3 epochs on the arrays built above; 100 is the embedding size of the GloVe file loaded earlier.
model = create_model(x, y, maxlen, 3, 100)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
gru_3 (GRU)                  (None, 10, 32)            12768     
_________________________________________________________________
gru_4 (GRU)                  (None, 64)                18624     
_________________________________________________________________
dense_1 (Dense)              (None, 100)               6500      
Total params: 37,892
Trainable params: 37,892
Non-trainable params: 0
_________________________________________________________________
None
Epoch 1/3
Epoch 2/3
Epoch 3/3


In [73]:
def train_model_from_lyrics(lyrics, maxlen=10, step=20, epochs=10):
    """
    Given lyrics, train the model.

    :param lyrics: the lyrics, either already tokenised (list of words) or as
                   one string, which is lowercased and word-tokenised first
    :type  lyrics: list or str
    :param maxlen: the length of a sequence to extract as train
    :type  maxlen: int
    :param step: sample a new sequence every n steps
    :type  step: int
    :param epochs: number of training iterations
    :type  epochs: int
    :returns: (trained keras model,
               dictionary mapping each distinct word to its embedding)
    :rtype:   (keras.engine.sequential.Sequential,
               dict)
    """

    if isinstance(lyrics, str):
        # Same preparation the notebook applies to sample_text: lowercase,
        # then split into word tokens.
        from nltk.tokenize import word_tokenize
        words = word_tokenize(lyrics.lower())
    else:
        words = lyrics

    # Use the arguments that were passed in. The previous body ignored them
    # and always trained on the global `tokens` with step=3 and epochs=3.
    x, y, word_indices = vectorizing_seq(words, maxlen, step)
    # Size the network from the data instead of hard-coding 100.
    model = create_model(x, y, maxlen, epochs, x.shape[-1])

    return model, word_indices

# step and epochs are passed explicitly to keep the settings the function
# used to hard-code (3 and 3) for this run.
model, word_indices = train_model_from_lyrics(tokens, step=3, epochs=3)

Number of sequences: 53
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
gru_9 (GRU)                  (None, 10, 32)            12768     
_________________________________________________________________
gru_10 (GRU)                 (None, 64)                18624     
_________________________________________________________________
dense_3 (Dense)              (None, 100)               6500      
Total params: 37,892
Trainable params: 37,892
Non-trainable params: 0
_________________________________________________________________
None
Epoch 1/3
Epoch 2/3
Epoch 3/3


In [74]:
def sample(preds, temperature=1.0):
    """
    Compute new probability distribution based on the temperature
    Higher temperature creates more randomness.

    The distribution is re-weighted as ``preds ** (1 / temperature)``,
    renormalised, and one index is drawn from it at random.

    :param preds: numpy array of shape (number of choices,), and elements sum to 1
    :type  preds: numpy.ndarray
    :param temperature: characterizes the entropy of probability distribution;
                        0 means "always pick the most likely entry"
    :type  temperature: float
    :returns: a number 0 to the length of preds - 1
    :rtype:   int
    """

    preds = np.asarray(preds).astype('float64')

    if temperature == 0:
        # Limit of the re-weighting as temperature -> 0. Dividing by zero
        # below would otherwise produce NaNs and make multinomial() raise.
        return np.argmax(preds)

    # log(0) is a legitimate -inf here (it maps back to probability 0), so the
    # divide-by-zero warning is silenced for this step only.
    with np.errstate(divide='ignore'):
        preds = np.log(preds) / temperature
    # Shifting by the maximum leaves the normalised result unchanged but
    # keeps exp() from underflowing every entry to 0 at low temperatures.
    preds = preds - np.max(preds)
    exp_preds = np.exp(preds)
    preds = exp_preds / np.sum(exp_preds)
    probas = np.random.multinomial(1, preds, 1)
    return np.argmax(probas)

In [75]:
def text_generate(model, text, char_indices, maxlen=60, temperature=1.0, textlen=400):
    """
    Generate text based on a model, one word at a time.

    A random window of ``maxlen`` words from ``text`` seeds the generation.
    At each step the window is converted to embedding vectors and fed to the
    model. The model's output vector is compared (cosine similarity) with the
    embedding of every known word, and the similarities are turned into a
    probability distribution from which ``sample`` draws the next word. The
    seed and every generated word are written to stdout; nothing is returned.

    :param model: trained keras model taking input of shape
                  (1, maxlen, embedding dimensions)
    :type  model: keras.engine.sequential.Sequential
    :param text: lyrics as a list of word tokens
    :type  text: list
    :param char_indices: dictionary mapping a word to its embedding vector
                         (the third value returned by ``vectorizing_seq``)
    :type  char_indices: dict
    :param maxlen: number of words in the model's input window; must match
                   the value the model was built with
    :type  maxlen: int
    :param temperature: passed to ``sample``; higher means more randomness
    :type  temperature: float
    :param textlen: Number of words to generate
    :type  textlen: int
    :raises ValueError: if ``text`` is not longer than ``maxlen``
    :raises KeyError: if a seed word is missing from ``char_indices``
    """

    if len(text) <= maxlen:
        raise ValueError('text must contain more than maxlen (%d) words' % maxlen)

    # Candidate words and their embeddings as one (vocabulary, dimensions) matrix.
    vocabulary = list(char_indices.keys())
    vocab_matrix = np.asarray([char_indices[word] for word in vocabulary],
                              dtype='float64')
    vocab_norms = np.linalg.norm(vocab_matrix, axis=1)

    start_index = random.randint(0, len(text) - maxlen - 1)
    window = list(text[start_index: start_index + maxlen])
    full_sentence = " ".join(window)
    print('--- Generating with seed: "' + full_sentence + '"')

    print('------ temperature:', temperature)
    # stdout.write needs a string; the seed window itself is a list of words.
    sys.stdout.write(full_sentence)
    for _ in range(textlen):
        # The model consumes embedding vectors, so look each word up instead
        # of building a one-hot array.
        sampled = np.asarray([[char_indices[word] for word in window]],
                             dtype='float64')
        preds = np.asarray(model.predict(sampled, verbose=0)[0], dtype='float64')

        # Cosine similarity between the prediction and every known word; the
        # small constant avoids dividing by zero for all-zero vectors.
        similarity = np.dot(vocab_matrix, preds) / (
            vocab_norms * np.linalg.norm(preds) + 1e-12)
        # Softmax so that `sample` receives values that sum to 1.
        scores = np.exp(similarity - np.max(similarity))
        next_index = sample(scores / np.sum(scores), temperature)
        next_word = vocabulary[next_index]

        # Slide the window: drop the oldest word, append the new one.
        window = window[1:] + [next_word]
        sys.stdout.write(' ' + next_word)
