In [None]:
"""
My Plan:

1. Combine all the text together into one long file (one long string).
2. Lowercase all the words (one long string)
3. Tokenize the words. (list of words split by spaces.)
4. Split into two lists, one that holds the sentence (input), 
   another that holds the predicted next word (label).
5. Convert the training sentences into vector representations.
6. One hot encode the labels.

"""

# Imports

In [73]:
import pandas as pd
import numpy as np
import os
import json
import random
import sys

from keras import layers
from keras.models import Sequential
from keras import optimizers
from nltk.tokenize import word_tokenize
from keras.callbacks import ModelCheckpoint, LearningRateScheduler, \
    EarlyStopping, ReduceLROnPlateau, TensorBoard
# import nltk

# Training

## Stanford's Word2Vec (100 dimensions)

In [2]:
def get_GloVe(directory):
    """
    Open Stanford's GloVe file with 100 dimensional embeddings
    
    :param directory: directory of the GloVe
    :type  directory: str
    :return: dictionary where the keys are the words, 
             and values are the 100d representation
    :rtype:  dict
    """

    glove_dir = directory

    # dictionary that maps words into 100d array
    embeddings_index = {}
    file = open(os.path.join(glove_dir, 'glove.6B.100d.txt'))

    for line in file:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs
    file.close()
    print('Found %s word vectors.' % len(embeddings_index))

    return embeddings_index

## Vectorize Text

In [32]:
def vectorizing_seq (text, maxlen, step):    
    """
    Convert the text into inputs and labels.
    
    :param text: list of words
    :type  text: list
    :param maxlen: the length of a sequence to extract as train
    :type  maxlen: int
    :param step: sample a new sequence every n steps
    :type  step: int
    :returns: (Numpy boolean array of shape 
                    (Number of sequences, maxlen, number of distinct character),
               Numpy boolean array of shape 
                    (Number of sequences, number of distinct character),
               dictionary mapping a integer to its character placeholder)
    :rtype:   (numpy.ndarray, 
               numpy.ndarray, 
               dict)     
    """
    
    sentences = [] # hold extracted sequences
    next_word = [] # hold next word for each corresponding sentence

    for i in range(0, len(text) - maxlen, step):
        sentences.append(text[i: i + maxlen])
        next_word.append(text[i + maxlen])

    print('Number of sequences:', len(sentences))

    all_words = sorted(list(set(text)))
    
    word_indices = {}
    for word in all_words:
        try:
            word_indices[word] = word_vectors[word]
        except KeyError:
            word_indices[word] = np.zeros(100)
            
    x = np.empty((0, maxlen, 100), float)
    y = np.array (next_word)

    for i, sentence in enumerate(sentences):
        instance = []
        for t, word in enumerate(sentence):
            word_dimensions = list (word_indices[word])
            instance.append(word_dimensions)
        instance = np.array(instance)
        instance = np.reshape(instance, (1,) + instance.shape ) 
        x = np.append(x, instance, axis=0)

    from sklearn.preprocessing import OneHotEncoder
    enc = OneHotEncoder(sparse=False)
    y = enc.fit_transform(y.reshape(-1,1))
    
    needed_words = enc.categories_[0]
    word_indices2 = dict(( i, word) for i, word in enumerate (needed_words))
    return x, y, word_indices, word_indices2

In [33]:
# word_indices

## Create the Model

In [64]:
weight_path = "{}_weights.best.hdf5".format('RNN')

checkpoint = ModelCheckpoint(
    weight_path,
    monitor='loss',
    verbose=1,
    save_best_only=True,
    mode='min',
    save_weights_only=True
)

early = EarlyStopping(
    monitor="loss",
    #monitor='acc'
    mode="min",
    verbose=2,
    # training is interrupted when the monitor argument stops improving after n steps
    patience=5
)

callbacks_list = [checkpoint, early]

In [65]:
def create_model(x, y, maxlen, epochs, dimensions):
    """
    Creates and trains a model.
    :param x: Numpy boolean array of shape 
                    (Number of sequences, maxlen, number of distinct character)
    :type  x: numpy.ndarray
    :param y: Numpy boolean array of shape 
                    (Number of sequences, number of distinct character)
    :type  y: numpy.ndarray
    :param maxlen: the length of a sequence to extract as train
    :type  maxlen: int
    :param epochs: number of training iterations
    :type  epochs: int
    :param chars: list of unique characters
    :type  chars: list
    :returns: trained keras model
    :rtype:   keras.engine.sequential.Sequential
    """

    model = Sequential()
    model.add(layers.GRU(
        32,
        return_sequences=True,
        input_shape=(maxlen, 100))
    )
    model.add(layers.GRU(
        64,
        input_shape=(maxlen, dimensions))
    )
    model.add(layers.Dense(
        dimensions, 
        activation='softmax')
    )

    print(model.summary())

    optimizer = optimizers.RMSprop(lr=0.01)
#     optimizer = optimizers.Adadelta(lr=1.0, rho=0.95, epsilon=None, decay=0.0)
    model.compile(loss='categorical_crossentropy', optimizer=optimizer)
#     model.compile(loss='mse', optimizer=optimizer)
    model.fit(x, y, batch_size=128, epochs=epochs, callbacks=callbacks_list)

    return (model)

## Pipeline

In [66]:
def train_model_from_text(text, maxlen=10, step=20, epochs=10):
    """
    Given text, train the model.
    
    :param text: A string with all the text together.
    :type  text: str
    :param maxlen: the length of a sequence to extract as train
    :type  maxlen: int
    :param step: sample a new sequence every n steps
    :type  step: int
    :param epochs: number of training iterations
    :type  epochs: int
    :returns: (trained keras model,
               dictionary mapping characters to digit representations)
    :rtype:   (keras.engine.sequential.Sequential,
               dict)
    """
    
    x, y, wordvectors_mini, word_indices = vectorizing_seq(tokens, maxlen, 3)
    model = create_model(x, y, maxlen, 3, y.shape[1])
    
    return model, word_indices, wordvectors_mini

In [154]:
sys.getsizeof (wordvectors_mini)

36968

# Generate Text

## Redistribute Probability Distributions

In [45]:
def sample(preds, temperature=1.0):
    """
    Compute new probability distribution based on the temperature
    Higher temperature creates more randomness.
    
    :param preds: numpy array of shape (unique chars,), and elements sum to 1
    :type  preds: numpy.ndarray
    :param temperature: characterizes the entropy of probability distribution
    :type  temperature: float
    :returns: a number 0 to the length of preds - 1
    :rtype:   int
    """
    
    preds = np.asarray(preds).astype('float64')
    preds = np.log(preds) / temperature
    exp_preds = np.exp(preds)
    preds = exp_preds / np.sum(exp_preds)
    probas = np.random.multinomial(1, preds, 1)
    return np.argmax(probas)

## Generate Text

In [46]:
def text_generate(model, text, word_indices, maxlen=10, temperature=1.0, textlen=40):
    """
    Generate text based on a model.
    
    :param model: trained keras model
    :type  model: keras.engine.sequential.Sequential
    :param text: lyrics
    :type  text: str
    :param char_indices: dictionary mapping a character to its integer placeholder
    :type  char_indices: dict
    :param maxlen: maximum length of the sequences
    :type  maxlen: int
    :param textlen: Number of characters of generated sequence
    :type  textlen: int
    """
    
    start_index = random.randint(0, len(text) - maxlen - 1) 
    generated_text = text[start_index: start_index + maxlen] 
    full_sentence = " ".join (generated_text)
    print(len(generated_text))
    print('--- Generating with seed: "' + full_sentence + '"')
    
    print('------ temperature:', temperature)
    sys.stdout.write(full_sentence)
    
    
    for i in range(textlen):
        
        sampled = []
        for t, word in enumerate(generated_text):
            word_dimensions = list (wordvectors_mini[word])
            sampled.append(word_dimensions)
        sampled = np.array(sampled)
        sampled = np.reshape(sampled, (1,) + sampled.shape ) 

        preds = model.predict(sampled, verbose=0)[0]
        next_index = sample(preds, temperature)
        next_word = word_indices[next_index]

        generated_text.append ( next_word)
        generated_text = generated_text[1:]
        sys.stdout.write(" " + next_word)


In [47]:
directory = '/Users/jinli/Projects/glove.6B'
word_vectors = get_GloVe(directory)

Found 400000 word vectors.


In [48]:
text = open('all.txt', 'r').read()
text = text.lower()

tokens = word_tokenize(text)
# small amount for now
tokens = token[:4000]

In [70]:
# text[:10000]

In [152]:
# token
# maxlen = 10
# x, y, word_indices = vectorizing_seq(tokens, maxlen, 3)
word_indices

{0: '!',
 1: '#',
 2: '%',
 3: '(',
 4: ')',
 5: '*rotational',
 6: '*still*',
 7: '*unsprung',
 8: '+',
 9: ',',
 10: '-',
 11: '.',
 12: '...',
 13: '//i.imgur.com/qnniqul.jpg',
 14: '1',
 15: '1,000',
 16: '10',
 17: '1018',
 18: '2.',
 19: '20',
 20: '40',
 21: '40.',
 22: '5',
 23: '6',
 24: '8',
 25: '95',
 26: ':',
 27: '=',
 28: '?',
 29: 'a',
 30: 'able',
 31: 'about',
 32: 'absorbed',
 33: 'accept',
 34: 'accidents',
 35: 'ache',
 36: 'actual',
 37: 'actually',
 38: 'ad',
 39: 'adderall',
 40: 'afraid',
 41: 'africa',
 42: 'after',
 43: 'again',
 44: 'agree',
 45: 'all',
 46: 'allow',
 47: 'almost',
 48: 'along',
 49: 'already',
 50: 'always',
 51: 'am',
 52: 'an',
 53: 'and',
 54: 'annoyed',
 55: 'any',
 56: 'applause',
 57: 'are',
 58: 'arse',
 59: 'as',
 60: 'ash',
 61: 'at',
 62: 'ate',
 63: 'attention',
 64: 'avoidance',
 65: 'away',
 66: 'back',
 67: 'bad',
 68: 'based',
 69: 'be',
 70: 'because',
 71: 'bed',
 72: 'been',
 73: 'before',
 74: 'began',
 75: 'being',
 76: 

In [67]:
# model = create_model(x, y, maxlen, epochs=3, dimensions=y.shape[1])
model, word_indices, wordvectors_mini = train_model_from_text(tokens)

Number of sequences: 1330
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
gru_13 (GRU)                 (None, 10, 32)            12768     
_________________________________________________________________
gru_14 (GRU)                 (None, 64)                18624     
_________________________________________________________________
dense_7 (Dense)              (None, 598)               38870     
Total params: 70,262
Trainable params: 70,262
Non-trainable params: 0
_________________________________________________________________
None
Epoch 1/3

Epoch 00001: loss improved from inf to 6.21399, saving model to CNN2_weights.best.hdf5
Epoch 2/3

Epoch 00002: loss improved from 6.21399 to 5.82090, saving model to CNN2_weights.best.hdf5
Epoch 3/3

Epoch 00003: loss improved from 5.82090 to 5.68270, saving model to CNN2_weights.best.hdf5


In [55]:
text_generate(model, tokens, word_indices)

10
--- Generating with seed: "ve had a coffee , the key is not to"
------ temperature: 1.0
ve had a coffee , the key is not to be walking right i think think when . via by the ever is high ill snickers story in has limo master inter-connecting gameplay hand ive male next previous feelings profit seen down be by internet told along inter-connecting suffered of

In [None]:
# to load the model
#model = load_model('my_model.h5')

## Finding Themes

In [94]:
text[:5000]

'did you have your showerhead plugged in? or is it a wireless model?\nyep. and when you finally switch to another career, it really throws you off when your coworkers measure years differently. \nthats a nice dark thought that will now pop into my head during my next birthday...\ni want to be cremated and then put into one of those cardboard pods that grows into a tree so one day i will be cut down and turned into someone elses coffin.\nwould they think it’s cool if it was vomit?\nu the only one thinking that bro\ni wanted to get to know somebody better, so i asked them how their vaction went in mexico - simple enough right?   she told me to follow her instagram, where i could find out for myself.  at least it saved me the effort of getting to know them in the first place?  edit: i know, she probably wasnt interested in getting to know me, but a simple it was good would have gotten the message across just as well... \nits just the outside catching up with the inside.\ni am sorry detect

In [75]:
# nltk_text = nltk.Text(text)

In [78]:
# nltk_text#[:500]

<Text: d i d   y o u  ...>

In [82]:
# nltk_text.concordance('s')

In [103]:
# import re
# testzz = [m.start() for m in re.finditer('pizza', tokens)]
# testzz = np.where(np.array(tokens) == 'it')[0]
# random.choice(testzz)

2660

In [144]:
# tokens[:200]

def find_random_sentence(tokens, word, maxlen):
    list_of_appearance = np.where(np.array(tokens) == word)[0]
    stop_characters = set({'...', '.', '?', '!'})
    random_index = random.choice(list_of_appearance)
    index = random_index
    
    sentence = []
    while (tokens[index] not in stop_characters):
        sentence.append(tokens[index])
        index += 1
    sentence.append(tokens[index])
    
    index = random_index
    
    while ( (tokens[index] not in stop_characters) or len(sentence) < 11):
        sentence.insert(0, tokens[index])
        index -= 1
    
    return sentence[:maxlen]

In [145]:
def text_generate_with_word(
    model, 
    text, 
    word_indices,
    word,
    maxlen=10, 
    temperature=1.0,
    textlen=40):
    """
    Generate text based on a model.
    The starting seed is based on a word input 
    
    :param model: trained keras model
    :type  model: keras.engine.sequential.Sequential
    :param text: lyrics
    :type  text: str
    :param char_indices: dictionary mapping a character to its integer placeholder
    :type  char_indices: dict
    :param word: the input starting word
    :type  word: str
    :param maxlen: maximum length of the sequences
    :type  maxlen: int
    :param textlen: Number of characters of generated sequence
    :type  textlen: int
    """
    
    stop_characters = set({'...', '.', '?', '!'})
    
    generated_text = find_random_sentence(tokens, word, maxlen)
    full_sentence = " ".join (generated_text)
    print(len(generated_text))
    print('--- Generating with seed: "' + full_sentence + '"')
    
    print('------ temperature:', temperature)
    sys.stdout.write(full_sentence)
    
    out_text = generated_text
    
#     for i in range(textlen):
    stop_generate = False
    i = 0
    while ( (i < textlen) or (not stop_generate) ):
        
        sampled = []
        for t, word in enumerate(generated_text):
            word_dimensions = list (wordvectors_mini[word])
            sampled.append(word_dimensions)
        sampled = np.array(sampled)
        sampled = np.reshape(sampled, (1,) + sampled.shape ) 

        preds = model.predict(sampled, verbose=0)[0]
        next_index = sample(preds, temperature)
        next_word = word_indices[next_index]

        generated_text.append ( next_word)
        generated_text = generated_text[1:]
        sys.stdout.write(" " + next_word)
        out_text.append(next_word)
        
        if (next_word in stop_characters):
            stop_generate = True
        i += 1
    return out_text


In [151]:
# set('aaabb')
# find_random_sentence(tokens, 'it')
text_generate_with_word(model, tokens, word_indices, 'pizza')

10
--- Generating with seed: "a pizza pizza is just a modern version of a"
------ temperature: 1.0
a pizza pizza is just a modern version of a 5 this am dialogue that freezing us guardians sleep that 5 began ’ no pointed i im being family and this dress one specifically when only i themselves movements imagine of me before outside someone power someone ultimate im 20 ad are results shamings eat talk candy that usually for effort in up wonders knowledge the % today synonyms agree talk to your higher kinda emphysema eat up s longer one thought circle the talking //i.imgur.com/qnniqul.jpg thought its circle mario without pointed suffered thought wait me and there wait , knew could was head and 20 grounded .

['a',
 'pizza',
 'pizza',
 'is',
 'just',
 'a',
 'modern',
 'version',
 'of',
 'a',
 '5',
 '5',
 'this',
 'am',
 'dialogue',
 'that',
 'freezing',
 'us',
 'guardians',
 'sleep',
 'that',
 '5',
 'began',
 '’',
 'no',
 'pointed',
 'i',
 'im',
 'being',
 'family',
 'and',
 'this',
 'dress',
 'one',
 'specifically',
 'when',
 'only',
 'i',
 'themselves',
 'movements',
 'imagine',
 'of',
 'me',
 'before',
 'outside',
 'someone',
 'power',
 'someone',
 'ultimate',
 'im',
 '20',
 'ad',
 'are',
 'results',
 'shamings',
 'eat',
 'talk',
 'candy',
 'that',
 'usually',
 'for',
 'effort',
 'in',
 'up',
 'wonders',
 'knowledge',
 'the',
 '%',
 'today',
 'synonyms',
 'agree',
 'talk',
 'to',
 'your',
 'higher',
 'kinda',
 'emphysema',
 'eat',
 'up',
 's',
 'longer',
 'one',
 'thought',
 'circle',
 'the',
 'talking',
 '//i.imgur.com/qnniqul.jpg',
 'thought',
 'its',
 'circle',
 'mario',
 'without',
 'pointed',
 'suffered',
 'thought',
 'wait',
 'me',
 'and',
 'there',
 'wait',
 ',',
 'knew',
 'could'

In [110]:
set({'...'})

{'...'}

In [122]:
# tokens[:200]