In [1]:
import numpy as np
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import LSTM, Input, Embedding, Dense
from keras.models import Model
from keras.optimizers import Adam

Using TensorFlow backend.
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [2]:
# configurations
EPOCHS = 2000
BATCH_SIZE = 128
VALIDATION_SPLIT = 0.2
MAX_VOCAB_SIZE = 3000
MAX_SEQUENCE_LENGTH = 100
EMBEDDING_DIM = 100
LATENT_DIM = 25


In [3]:
input_texts = []
target_texts = []

with open('all.csv') as f:
    for line in f:
        line = line.rstrip()
        if not line:
            continue
        
        input_line = '<sos> ' + line
        target_line = line + ' <eos>'
        
        input_texts.append(input_line)
        target_texts.append(target_line)
    
    

In [4]:
# convert sentences into intger values
tokenizer = Tokenizer(num_words = MAX_VOCAB_SIZE, filters = '') # Only the most common num_words-1 words will be kept
tokenizer.fit_on_texts(input_texts + target_texts) #Updates internal vocabulary 
input_sequences = tokenizer.texts_to_sequences(input_texts) #Transforms each text in texts to a sequence of integers
target_sequences = tokenizer.texts_to_sequences(target_texts)

In [5]:
# find max sequence length
max_sequence_length_from_data = max(len(seq) for seq in input_sequences)
max_sequence_length_from_data

71

In [6]:
# get word -> integer mapping
word2idx = tokenizer.word_index
print('Found %s unique tokens' %len(word2idx))
assert('<sos>' in word2idx) # raise assertionError if <sos> is not in word2idx
assert('<eos>' in word2idx) #  raise assertionError if <eos> is not in word2idx

Found 18626 unique tokens


In [7]:
# pad sequences so that N x T matrix
max_sequence_length = min(max_sequence_length_from_data, MAX_SEQUENCE_LENGTH)
print(max_sequence_length)


71


In [8]:
input_sequences = pad_sequences(input_sequences, maxlen = max_sequence_length, padding = 'post') # transform the list of input_sequences into 2D numpy array
target_sequences = pad_sequences(target_sequences, maxlen = max_sequence_length, padding = 'post')
print('shape of data tensor: ',input_sequences.shape)

shape of data tensor:  (13659, 71)


In [32]:
# load pre-trained word vectors
word2vec = {}
with open('glove.6B/glove.6B.%sd.txt' %EMBEDDING_DIM, encoding = 'utf-8') as f:
    for line in f:
        values = line.split()
        word = values[0]
        vec =  np.asarray(values[1:], dtype = 'float32')
        word2vec[word] = vec

print('Found %s word2vec' %len(word2vec))




Found 400000 word2vec


In [10]:
# prepare embedding matrix
num_words = min(MAX_VOCAB_SIZE, len(word2idx) + 1)
embedding_matrix = np.zeros((num_words, EMBEDDING_DIM)) 
for word, i in word2idx.items():
    if i < MAX_VOCAB_SIZE:
        embedding_vector = word2vec.get(word)
        if embedding_vector is not None:
            # words not found in embedding index will be zero
            embedding_matrix[i] = embedding_vector
        


In [31]:
# one-hot the targets
one_hot_targets = np.zeros((len(input_sequences), max_sequence_length, num_words))
#one_hot_targets = {}
for i, target_sequence in enumerate(target_sequences):
    for t, word in enumerate(target_sequence):
        if word > 0:
            one_hot_targets[i, t, word] = 1


MemoryError: Unable to allocate 21.7 GiB for an array with shape (13659, 71, 3000) and data type float64

In [26]:
# load pre-trained word embeddings into an Embedding layer
embedding_layer = Embedding(
    num_words,
    EMBEDDING_DIM,
    weights = [embedding_matrix]
    
)

In [27]:
# building the model
# create Lstm network with single LSTM
input_ = Input(shape = (max_sequence_length, ))
initial_h = Input(shape = (LATENT_DIM, ))
initial_c = Input(shape = (LATENT_DIM, ))
x = embedding_layer(input_)
lstm = LSTM(LATENT_DIM, return_sequences = True, return_state = True)
x, _, _ = lstm(x, initial_state = [initial_h, initial_c])
dense = Dense(num_words, activation = 'softmax')
output = dense(x)
model = Model([input_, initial_h, initial_c], output)
model.compile(
    loss = 'categorical_crossentropy',
    optimizer = Adam(lr = 0.01),
    metrics = ['accuracy']
)


In [28]:
# training model
z = np.zeros((len(input_sequences), LATENT_DIM))
r = model.fit(
    [input_sequences, z, z],
    one_hot_targets,
    batch_size = BATCH_SIZE,
    epochs = EPOCHS,
    validation_split = VALIDATION_SPLIT,
)


ValueError: No data provided for "dense_3". Need data for each key in: ['dense_3']

In [None]:
model.save('model.h5')

In [None]:
model.load_weights('model.h5')

In [None]:
# plot some data



In [None]:
# make a sampling model
input2 = Input(shape = (1, )) # input one word at a time
x = embedding_layer(input2)
x, h, c = lstm(x, initial_state = [initial_h, initial_c])
output2 = dense(x)
sampling_model = Model([input2, initial_h, initial_c], [output2, h, c])

In [None]:
# reverse word2idx dictionary to get back words
# during prediction
idx2word = {v:k for k, v in word2idx.items()}

In [None]:
def sample_line():
    # initial inputs
    np_input = np.array([[word2idx['<sos>'] ]])
    h = np.zeros(shape = (1, LATENT_DIM))
    c = np.zeros(shape = (1, LATENT_DIM))
    
    eos = word2idx['<eos>'] # where to stop
    
    output_sentence = []
    
    for _ in range(max_sequence_length):
        o, h, c = sampling_model.predict([np_input, h, c])
        probs = o[0, 0]
        if np.argmax(probs) == 0:
            print('wtf')
        probs[0] = 0
        probs /= probs.sum()
        idx = np.random.choice(len(probs), p = probs)
        if idx == eos:
            break
            
        # acculate output
        output_sentence.append(idx2word.get(idx, '<WTF %s>' %idx ))


        # make the next input into model
        np_input[0, 0] = idx

    return ' '.join(output_sentence)

In [None]:
# generate poem of four lines
while True:
    for _ in range(4):
        print(sample_line())
        
    ans = input('---generate another? [Y/n]---')
    if ans and ans[0].lower().startswith('n'):
        break