In [2]:
"""
My Plan:

1. Combine all the text together into one long file (one long string).
2. Lowercase all the words (one long string)
3. Tokenize the words. (list of words split by spaces.)
4. Split into two lists, one that holds the sentence (input), 
   another that holds the predicted next word (label).
5. Convert the training sentences into vector representations.
6. One hot encode the labels.

"""

'\nMy Plan:\n\n1. Combine all the text together into one long file (one long string).\n2. Lowercase all the words (one long string)\n3. Tokenize the words. (list of words split by spaces.)\n4. Split into two lists, one that holds the sentence (input), \n   another that holds the predicted next word (label).\n5. Convert the training sentences into vector representations.\n6. One hot encode the labels.\n\n'

# Imports

In [3]:
import pandas as pd
import numpy as np
import os
import json
import random
import sys

from keras import layers
from keras.models import Sequential
from keras import optimizers
from nltk.tokenize import word_tokenize
from keras.callbacks import ModelCheckpoint, LearningRateScheduler, \
    EarlyStopping, ReduceLROnPlateau, TensorBoard
# import nltk

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


# Training

## Stanford's Word2Vec (100 dimensions)

In [4]:
def get_GloVe(directory):
    """
    Open Stanford's GloVe file with 100 dimensional embeddings
    
    :param directory: directory of the GloVe
    :type  directory: str
    :return: dictionary where the keys are the words, 
             and values are the 100d representation
    :rtype:  dict
    """

    glove_dir = directory

    # dictionary that maps words into 100d array
    embeddings_index = {}
    file = open(os.path.join(glove_dir, 'glove.6B.100d.txt'))

    for line in file:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs
    file.close()
    print('Found %s word vectors.' % len(embeddings_index))

    return embeddings_index

## Vectorize Text

In [43]:
def vectorizing_seq (text, maxlen, step):    
    """
    Convert the text into inputs and labels.
    
    :param text: list of words
    :type  text: list
    :param maxlen: the length of a sequence to extract as train
    :type  maxlen: int
    :param step: sample a new sequence every n steps
    :type  step: int
    :returns: (Numpy boolean array of shape 
                    (Number of sequences, maxlen, number of distinct character),
               Numpy boolean array of shape 
                    (Number of sequences, number of distinct character),
               dictionary mapping a integer to its character placeholder)
    :rtype:   (numpy.ndarray, 
               numpy.ndarray, 
               dict)     
    """
    
    sentences = [] # hold extracted sequences
    next_word = [] # hold next word for each corresponding sentence

    for i in range(0, len(text) - maxlen, step):
        sentences.append(text[i: i + maxlen])
        next_word.append(text[i + maxlen])

    print('Number of sequences:', len(sentences))

    all_words = sorted(list(set(text)))
    
    word_indices = {}
    for word in all_words:
        try:
            word_indices[word] = word_vectors[word]
        except KeyError:
            word_indices[word] = np.zeros(100)
            
    x = np.empty((0, maxlen, 100), float)
    y = np.array (next_word)
    
    import time

    start = time.time()

    for i, sentence in enumerate(sentences):
        if (i % 1000 == 0):
            print("Step: ", i, "\n")
            end = time.time()
            print(end - start)
            start = time.time()
            
        instance = []
        for t, word in enumerate(sentence):
            word_dimensions = list (word_indices[word])
            instance.append(word_dimensions)
        instance = np.array(instance)
        instance = np.reshape(instance, (1,) + instance.shape ) 
        x = np.append(x, instance, axis=0)

    from sklearn.preprocessing import OneHotEncoder
    enc = OneHotEncoder(sparse=False)
    y = enc.fit_transform(y.reshape(-1,1))
    
    needed_words = enc.categories_[0]
    word_indices2 = dict(( i, word) for i, word in enumerate (needed_words))
    return x, y, word_indices, word_indices2

In [37]:
# word_indices

## Create the Model

In [6]:
weight_path = "{}_weights.best.hdf5".format('RNN')

checkpoint = ModelCheckpoint(
    weight_path,
    monitor='loss',
    verbose=1,
    save_best_only=True,
    mode='min',
    save_weights_only=True
)

early = EarlyStopping(
    monitor="loss",
    #monitor='acc'
    mode="min",
    verbose=2,
    # training is interrupted when the monitor argument stops improving after n steps
    patience=5
)

callbacks_list = [checkpoint, early]

In [7]:
def create_model(x, y, maxlen, epochs, dimensions):
    """
    Creates and trains a model.
    :param x: Numpy boolean array of shape 
                    (Number of sequences, maxlen, number of distinct character)
    :type  x: numpy.ndarray
    :param y: Numpy boolean array of shape 
                    (Number of sequences, number of distinct character)
    :type  y: numpy.ndarray
    :param maxlen: the length of a sequence to extract as train
    :type  maxlen: int
    :param epochs: number of training iterations
    :type  epochs: int
    :param chars: list of unique characters
    :type  chars: list
    :returns: trained keras model
    :rtype:   keras.engine.sequential.Sequential
    """

    model = Sequential()
    model.add(layers.GRU(
        32,
        return_sequences=True,
        input_shape=(maxlen, 100))
    )
    model.add(layers.GRU(
        64,
        input_shape=(maxlen, dimensions))
    )
    model.add(layers.Dense(
        dimensions, 
        activation='softmax')
    )

    print(model.summary())

    optimizer = optimizers.RMSprop(lr=0.01)
#     optimizer = optimizers.Adadelta(lr=1.0, rho=0.95, epsilon=None, decay=0.0)
    model.compile(loss='categorical_crossentropy', optimizer=optimizer)
#     model.compile(loss='mse', optimizer=optimizer)
    model.fit(x, y, batch_size=128, epochs=epochs, callbacks=callbacks_list)

    return (model)

## Pipeline

In [8]:
def train_model_from_text(text, maxlen=10, step=20, epochs=10):
    """
    Given text, train the model.
    
    :param text: A string with all the text together.
    :type  text: str
    :param maxlen: the length of a sequence to extract as train
    :type  maxlen: int
    :param step: sample a new sequence every n steps
    :type  step: int
    :param epochs: number of training iterations
    :type  epochs: int
    :returns: (trained keras model,
               dictionary mapping characters to digit representations)
    :rtype:   (keras.engine.sequential.Sequential,
               dict)
    """
    
    x, y, wordvectors_mini, word_indices = vectorizing_seq(tokens, maxlen, 3)
    model = create_model(x, y, maxlen, 3, y.shape[1])
    
    return model, word_indices, wordvectors_mini

In [None]:
x, y, wordvectors_mini, word_indices = vectorizing_seq(tokens, maxlen, 3)

In [None]:
# sys.getsizeof (wordvectors_mini)

# Generate Text

## Redistribute Probability Distributions

In [None]:
def sample(preds, temperature=1.0):
    """
    Compute new probability distribution based on the temperature
    Higher temperature creates more randomness.
    
    :param preds: numpy array of shape (unique chars,), and elements sum to 1
    :type  preds: numpy.ndarray
    :param temperature: characterizes the entropy of probability distribution
    :type  temperature: float
    :returns: a number 0 to the length of preds - 1
    :rtype:   int
    """
    
    preds = np.asarray(preds).astype('float64')
    preds = np.log(preds) / temperature
    exp_preds = np.exp(preds)
    preds = exp_preds / np.sum(exp_preds)
    probas = np.random.multinomial(1, preds, 1)
    return np.argmax(probas)

## Generate Text

In [None]:
def text_generate(model, text, word_indices, maxlen=10, temperature=1.0, textlen=40):
    """
    Generate text based on a model.
    
    :param model: trained keras model
    :type  model: keras.engine.sequential.Sequential
    :param text: lyrics
    :type  text: str
    :param char_indices: dictionary mapping a character to its integer placeholder
    :type  char_indices: dict
    :param maxlen: maximum length of the sequences
    :type  maxlen: int
    :param textlen: Number of characters of generated sequence
    :type  textlen: int
    """
    
    start_index = random.randint(0, len(text) - maxlen - 1) 
    generated_text = text[start_index: start_index + maxlen] 
    full_sentence = " ".join (generated_text)
    print(len(generated_text))
    print('--- Generating with seed: "' + full_sentence + '"')
    
    print('------ temperature:', temperature)
    sys.stdout.write(full_sentence)
    
    
    for i in range(textlen):
        
        sampled = []
        for t, word in enumerate(generated_text):
            word_dimensions = list (wordvectors_mini[word])
            sampled.append(word_dimensions)
        sampled = np.array(sampled)
        sampled = np.reshape(sampled, (1,) + sampled.shape ) 

        preds = model.predict(sampled, verbose=0)[0]
        next_index = sample(preds, temperature)
        next_word = word_indices[next_index]

        generated_text.append ( next_word)
        generated_text = generated_text[1:]
        sys.stdout.write(" " + next_word)


In [38]:
directory = '/Users/jinli/Projects/glove.6B'
word_vectors = get_GloVe(directory)

Found 400000 word vectors.


In [31]:
import re
text = open('all.txt', 'r').read()
text = text.lower()
text = re.sub(r'[><*^$%|&()@#-+_=//]', ' ', text)
text = re.sub("\d+", "number", text)
text = re.sub(r'http\S+', ' ', text)
text = re.sub(r'-', ' ', text)
# text = re.sub(r'..', '.', text)

tokens = word_tokenize(text)
# small amount for now
# tokens = token[:4000]

In [44]:
%%time
x, y, wordvectors_mini, word_indices = vectorizing_seq (tokens, 10, 2)

Number of sequences: 111109
Step:  0 

0.0008373260498046875
Step:  1000 

3.322455883026123
Step:  2000 

9.055814981460571
Step:  3000 

15.990724086761475
Step:  4000 

22.523401975631714
Step:  5000 

30.41037607192993
Step:  6000 

40.030396938323975
Step:  7000 

47.04581022262573
Step:  8000 

54.498684883117676
Step:  9000 

65.91455578804016
Step:  10000 

83.3607542514801
Step:  11000 

89.6272349357605
Step:  12000 

100.43067693710327
Step:  13000 

113.73803901672363
Step:  14000 

129.59064412117004
Step:  15000 

112.75604605674744
Step:  16000 

120.13941192626953
Step:  17000 

118.81788396835327
Step:  18000 

92.31554889678955
Step:  19000 

98.27107906341553
Step:  20000 

102.60519790649414
Step:  21000 

109.24085879325867
Step:  22000 

111.20394325256348
Step:  23000 

117.56324195861816
Step:  24000 

121.9767050743103


KeyboardInterrupt: 

In [33]:
# sorted (set (tokens))#.keys()

In [None]:
# text[:10000]

In [None]:
# token
# maxlen = 10
# x, y, word_indices = vectorizing_seq(tokens, maxlen, 3)
word_indices

In [None]:
# model = create_model(x, y, maxlen, epochs=3, dimensions=y.shape[1])
# model, word_indices, wordvectors_mini = train_model_from_text(tokens)

In [None]:
text_generate(model, tokens, word_indices)

In [None]:
# to load the model
#model = load_model('my_model.h5')

## Finding Themes

In [None]:
text[:5000]

In [None]:
# nltk_text = nltk.Text(text)

In [None]:
# nltk_text#[:500]

In [None]:
# nltk_text.concordance('s')

In [None]:
# import re
# testzz = [m.start() for m in re.finditer('pizza', tokens)]
# testzz = np.where(np.array(tokens) == 'it')[0]
# random.choice(testzz)

In [None]:
# tokens[:200]

def find_random_sentence(tokens, word, maxlen):
    list_of_appearance = np.where(np.array(tokens) == word)[0]
    stop_characters = set({'...', '.', '?', '!'})
    random_index = random.choice(list_of_appearance)
    index = random_index
    
    sentence = []
    while (tokens[index] not in stop_characters):
        sentence.append(tokens[index])
        index += 1
    sentence.append(tokens[index])
    
    index = random_index
    
    while ( (tokens[index] not in stop_characters) or len(sentence) < 11):
        sentence.insert(0, tokens[index])
        index -= 1
    
    return sentence[:maxlen]

In [None]:
def text_generate_with_word(
    model, 
    text, 
    word_indices,
    word,
    maxlen=10, 
    temperature=1.0,
    textlen=40):
    """
    Generate text based on a model.
    The starting seed is based on a word input 
    
    :param model: trained keras model
    :type  model: keras.engine.sequential.Sequential
    :param text: lyrics
    :type  text: str
    :param char_indices: dictionary mapping a character to its integer placeholder
    :type  char_indices: dict
    :param word: the input starting word
    :type  word: str
    :param maxlen: maximum length of the sequences
    :type  maxlen: int
    :param textlen: Number of characters of generated sequence
    :type  textlen: int
    """
    
    stop_characters = set({'...', '.', '?', '!'})
    
    generated_text = find_random_sentence(tokens, word, maxlen)
    full_sentence = " ".join (generated_text)
    print(len(generated_text))
    print('--- Generating with seed: "' + full_sentence + '"')
    
    print('------ temperature:', temperature)
    sys.stdout.write(full_sentence)
    
    out_text = generated_text
    
#     for i in range(textlen):
    stop_generate = False
    i = 0
    while ( (i < textlen) or (not stop_generate) ):
        
        sampled = []
        for t, word in enumerate(generated_text):
            word_dimensions = list (wordvectors_mini[word])
            sampled.append(word_dimensions)
        sampled = np.array(sampled)
        sampled = np.reshape(sampled, (1,) + sampled.shape ) 

        preds = model.predict(sampled, verbose=0)[0]
        next_index = sample(preds, temperature)
        next_word = word_indices[next_index]

        generated_text.append ( next_word)
        generated_text = generated_text[1:]
        sys.stdout.write(" " + next_word)
        out_text.append(next_word)
        
        if (next_word in stop_characters):
            stop_generate = True
        i += 1
    return out_text


In [None]:
# set('aaabb')
# find_random_sentence(tokens, 'it')
text_generate_with_word(model, tokens, word_indices, 'pizza')

In [None]:
set({'...'})

In [None]:
# tokens[:200]