# Text generation

In [1]:
import github_command as gt

In [2]:
# Hand this notebook to the local `github_command` helper.
# NOTE(review): presumably this commits and pushes the file to the given
# repository every time the cell runs — confirm before using "Run All".
gt.push(
    file_to_transfer="TD7_Text_Generation_With_LSTM.ipynb",
    message="text generation",
    repos="TDs_ESILV.git",
)

## Load Packages

In [4]:
import numpy
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Dropout
from keras.layers import LSTM
from keras.callbacks import ModelCheckpoint
from keras.utils import np_utils

In [5]:
!pip install nltk



In [6]:
import nltk
from nltk.text import Text

In [7]:
# Fetch the 'gutenberg' NLTK data package, which provides the
# 'carroll-alice.txt' text read in the next cell.
nltk.download('gutenberg')

[nltk_data] Downloading package gutenberg to
[nltk_data]     /Users/lucbertin/nltk_data...
[nltk_data]   Package gutenberg is already up-to-date!


True

In [8]:
# Word tokens of "Alice in Wonderland" from the NLTK Gutenberg corpus.
# The tokens are later re-joined with single spaces, which is why punctuation
# in the training text (and in the generated text) is surrounded by spaces.
alice = nltk.corpus.gutenberg.words('carroll-alice.txt')

In [182]:
# Build the character-level training set from the tokenised Alice corpus.

# corpus tokens -> one lowercase string, tokens separated by single spaces
raw_text = " ".join(alice).lower()

# vocabulary: every distinct character, in sorted order, mapped to its rank
chars = sorted(set(raw_text))
char_to_int = {char: index for index, char in enumerate(chars)}

# summarize the loaded data
n_chars, n_vocab = len(raw_text), len(chars)
print("Total Characters: {}".format(n_chars))
print("Total Vocab: {}".format(n_vocab))

# input/output pairs: each window of `seq_length` encoded characters is one
# input, and the character right after the window is its target
seq_length = 100
encoded_text = [char_to_int[char] for char in raw_text]
dataX = [
    encoded_text[begin:begin + seq_length]
    for begin in range(n_chars - seq_length)
]
dataY = encoded_text[seq_length:]

n_patterns = len(dataX)
print("Total Patterns: {}".format(n_patterns))

# reshape X to [samples, time steps, features] and scale the codes by the
# vocabulary size
X = numpy.reshape(dataX, (n_patterns, seq_length, 1)) / float(n_vocab)

# one hot encode the output variable
y = np_utils.to_categorical(dataY)

Total Characters: 150118
Total Vocab: 46
Total Patterns: 150018


### Define the LSTM model

In [183]:
X.shape, y.shape

((150018, 100, 1), (150018, 46))

In [20]:
# Network: one 256-unit LSTM over the (time steps, features) window, dropout
# of 0.2, then a softmax layer with one output per column of the one-hot
# target matrix `y`.
model = Sequential([
    LSTM(256, input_shape=(X.shape[1], X.shape[2])),
    Dropout(0.2),
    Dense(y.shape[1], activation='softmax'),
])
model.compile(loss='categorical_crossentropy', optimizer='adam')

In [21]:
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm_2 (LSTM)                (None, 256)               264192    
_________________________________________________________________
dropout_2 (Dropout)          (None, 256)               0         
_________________________________________________________________
dense_2 (Dense)              (None, 46)                11822     
Total params: 276,014
Trainable params: 276,014
Non-trainable params: 0
_________________________________________________________________


### Define the checkpoint

In [13]:
X.shape

(150018, 100, 1)

In [33]:
# Checkpoint callback keyed on the training loss (lower is better). The file
# name embeds the epoch number and the loss value of the saved checkpoint.
filepath = "weights-improvement-{epoch:02d}-{loss:.4f}.hdf5"
checkpoint = ModelCheckpoint(
    filepath,
    monitor='loss',
    mode='min',
    save_best_only=True,
    verbose=1,
)

In [35]:
# Train for a single epoch in batches of 500 windows; the checkpoint callback
# runs alongside training. The returned history object is kept in `hist`.
hist = model.fit(
    X,
    y,
    epochs=1,
    batch_size=500,
    callbacks=[checkpoint],
)

Epoch 1/1
  5000/150018 [..............................] - ETA: 5:06 - loss: 2.8943

KeyboardInterrupt: 

In [31]:
# Reverse lookup of `char_to_int`: integer code -> character.
int_to_char = dict(enumerate(chars))

In [32]:
import sys

In [189]:
# Greedy generation: repeatedly feed the current window to the model and
# append the single most probable next character.

# pick a random seed (randint's upper bound is exclusive, so len(dataX) lets
# every pattern be chosen) and COPY it: the loop below appends to `pattern`,
# and appending to dataX[start] itself would leave a 101-item row in dataX and
# break any later numpy.reshape(dataX, ...)
start = numpy.random.randint(0, len(dataX))
pattern = list(dataX[start])
print("Seed:")
print( "\"", ''.join([int_to_char[value] for value in pattern]), "\"")
# generate characters
for i in range(1000):
    # shape the window as (batch=1, time steps, features=1) and scale it the
    # same way the training inputs were scaled
    x = numpy.reshape(pattern, (1, len(pattern), 1))
    x = x / float(n_vocab)
    prediction = model.predict(x, verbose=0)
    # most probable next character
    index = int(numpy.argmax(prediction))
    result = int_to_char[index]
    sys.stdout.write(result)
    # slide the window: add the new character, drop the oldest one
    pattern.append(index)
    pattern = pattern[1:]
print("\nDone.")

Seed:
" d concluded the banquet --] ' what is the use of repeating all that stuff ,' the mock turtle interru "
 , ' io the toe toe toe toe toe toe toe toe toe toe toe toe toe toe toe toe toe toe toe toe toe toe toe toe toe toe toe toe toe toe toe toe toe toe toe toe toe toe toe toe toe toe toe toe toe toe toe toe toe toe toe toe toe toe toe toe toe toe toe toe toe toe toe toe toe toe toe toe toe toe toe toe toe toe toe toe toe toe toe toe toe toe toe toe toe toe toe toe toe toe toe toe toe toe toe toe toe toe toe toe toe toe toe toe toe toe toe toe toe toe toe toe toe toe toe toe toe toe toe toe toe toe toe toe toe toe toe toe toe toe toe toe toe toe toe toe toe toe toe toe toe toe toe toe toe toe toe toe toe toe toe toe toe toe toe toe toe toe toe toe toe toe toe toe toe toe toe toe toe toe toe toe toe toe toe toe toe toe toe toe toe toe toe toe toe toe toe toe toe toe toe toe toe toe toe toe toe toe toe toe toe toe toe toe toe toe toe toe toe toe toe toe toe toe toe toe toe toe toe t

Seed:
" looking at everything about her , to pass away the time . alice had never been in a court of justice "
 , ' io ' s '  said the cat . ' io ' s ' m ' t ' v aen ' t ' ve toe taad ,' shi hatter sard to herself , ' in ' s ' m ' t ' v aen ' t ' ve toe taad ,' said the cat . ' io ' s ' m ' t ' v aen ' t ' ve toe taad ,' shi hatter sard to herself , ' in ' s ' m ' t ' v aen ' t ' ve toe taad ,' said the cat . ' io ' s ' m ' t ' v aen ' t ' ve toe taad ,' shi hatter sard to herself , ' in ' s ' m ' t ' v aen ' t ' ve toe taad ,' said the cat . ' io ' s ' m ' t ' v aen ' t ' ve toe taad ,' shi hatter sard to herself , ' in ' s ' m ' t ' v aen ' t ' ve toe taad ,' said the cat . ' io ' s ' m ' t ' v aen ' t ' ve toe taad ,' shi hatter sard to herself , ' in ' s ' m ' t ' v aen ' t ' ve toe taad ,' said the cat . ' io ' s ' m ' t ' v aen ' t ' ve toe taad ,' shi hatter sard to herself , ' in ' s ' m ' t ' v aen ' t ' ve toe taad ,' said the cat . ' io ' s ' m ' t ' v aen ' t ' ve toe taad 

## Load Model

In [27]:
# Checkpoint to reload. The name follows the ModelCheckpoint pattern
# "weights-improvement-{epoch:02d}-{loss:.4f}.hdf5" (epoch 01, loss 2.4514),
# so it only exists after a training run that produced exactly that loss.
weights_file = "weights-improvement-01-2.4514.hdf5"

In [28]:
from keras.models import load_model

In [29]:
# Replace the in-memory `model` with the one stored in the checkpoint file.
# NOTE(review): assumes the checkpoint holds a full model (architecture +
# weights), not weights only — confirm against the ModelCheckpoint settings.
model = load_model(weights_file)

In [30]:
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm_2 (LSTM)                (None, 256)               264192    
_________________________________________________________________
dropout_2 (Dropout)          (None, 256)               0         
_________________________________________________________________
dense_2 (Dense)              (None, 46)                11822     
Total params: 276,014
Trainable params: 276,014
Non-trainable params: 0
_________________________________________________________________


## Sampling from the Softmax

In [None]:
def sample_from_softmax(preds):
    """Draw one class index at random according to a probability vector.

    Parameters
    ----------
    preds : array-like
        Non-negative class weights, typically the softmax output of a model.
        Any shape is accepted; the values are flattened first. They do not
        need to sum exactly to 1: they are renormalised before sampling.

    Returns
    -------
    integer
        Index (into the flattened ``preds``) of the sampled class.

    Raises
    ------
    ValueError
        If ``preds`` is empty, contains a negative or non-finite value, or
        sums to zero.
    """
    import numpy as np

    # Work in float64: a float32 softmax output can sum to slightly more than
    # 1 once cast to double, and np.random.multinomial then raises
    # "sum(pvals[:-1]) > 1.0".
    probas = np.asarray(preds, dtype=np.float64).reshape(-1)
    if probas.size == 0:
        raise ValueError("preds must contain at least one probability")
    if not np.all(np.isfinite(probas)) or np.any(probas < 0):
        raise ValueError("preds must be finite and non-negative")
    total = probas.sum()
    if total <= 0:
        raise ValueError("preds must have a positive sum")
    probas = probas / total

    # One multinomial trial yields a one-hot row; its argmax is the drawn class.
    draw = np.random.multinomial(1, probas, 1)
    return np.argmax(draw)

In [196]:
sample_from_softmax([0,0.2,0.8])

2

In [205]:
# Stochastic generation: same loop as greedy decoding, but the next character
# is sampled from the softmax output instead of taking the argmax.

# pick a random seed (randint's upper bound is exclusive, so len(dataX) lets
# every pattern be chosen) and COPY it: the loop below appends to `pattern`,
# and appending to dataX[start] itself would leave a 101-item row in dataX and
# break any later numpy.reshape(dataX, ...)
start = numpy.random.randint(0, len(dataX))
pattern = list(dataX[start])
print("Seed:")
print( "\"", ''.join([int_to_char[value] for value in pattern]), "\"")
# generate characters
for i in range(1000):
    # take the sequence ( <=> pattern), reshape to (batch=1, time steps, features=1)
    x = numpy.reshape(pattern, (1, len(pattern), 1))
    # normalize exactly like the training inputs
    x = x / float(n_vocab)
    # predict the distribution of the next character
    prediction = model.predict(x, verbose=0)
    # sample from the softmax output for a little bit of variance
    index = int(sample_from_softmax(prediction))
    # transform index to char from dict and show it to the user
    result = int_to_char[index]
    sys.stdout.write(result)
    # the input sequence must keep the same length: add the new character
    # and drop the first one
    pattern.append(index)
    pattern = pattern[1:]
print("\nDone.")

Seed:
" their slates , ' she doesn ' t believe there ' s an atom of meaning in it ,' but none of them attemp "
 . ' to faereis oneec . shue , aed ni lameey qo .' shid  ltec : ' tfu d tael ? why tant ,l eami , ro ihdeh ''aatelts oe ,  navdri blice : ' teacr jo oeeciest ,  'reld the cflpesu tone woe maah a''-wo mnsteadtg teon !'aou vreoonet a autdenan "' ihee thi ln lhuil ih opgx ti dnady . sou mtdoglt g '    wh d solh b oerr ihggy a- 'tdg mpeee , '   ' * * * ( * * * * * * * * *sd tout ih a'' niseee tfrlrsg c ''n souu ,'d noeh sflpeemy !  saad mhe goeghnt . ' ie   ie*tgia g'romeeec ch soon , t ! ahd ,  'abnd iiot afdeg io sou tey loonynl lone po hypt ; iheeee as cuos diacp soeh b'mifdcgu ohcsendl lo sopcdoo fhhtel ieitha oaop lo a ' ih nlee veu --y thee rfe yoa wis ,tt io maa luueoa toiiee ! 'si , cutkehl , and the aioy soo mien , 'oeiwe mamd haemm bi mhet areihef dnrloude-c . ' neud wh  tand ,   ' '   b * * vhn , ilt go vhir vro ('r wai toan seoy ho tolyey wn sidt ,  she taoh t eryc.ahl

## Beam Search 

In [202]:
prediction

array([[9.19839218e-02, 2.41464382e-04, 3.33781245e-05, 7.52547130e-05,
        6.78745637e-05, 1.55610578e-05, 1.09534645e-04, 4.16072464e-04,
        1.63882403e-04, 4.79996816e-05, 2.81081475e-06, 4.95848099e-06,
        4.67444715e-06, 3.57902263e-06, 2.63577749e-05, 2.62667854e-05,
        3.35156328e-05, 1.49173457e-05, 9.46370164e-06, 1.11620529e-05,
        4.68286984e-02, 3.21025820e-03, 3.85166146e-02, 8.32669660e-02,
        2.32710198e-01, 8.15854128e-03, 1.31016383e-02, 4.32418324e-02,
        3.38758342e-02, 1.89665428e-04, 7.29463622e-03, 1.36435134e-02,
        9.28971730e-03, 5.13990596e-02, 1.91199742e-02, 1.37465633e-02,
        2.27772645e-04, 2.84235794e-02, 3.73835564e-02, 1.27047941e-01,
        2.95352563e-02, 4.50058654e-03, 1.32524297e-02, 6.22327439e-04,
        4.79218252e-02, 1.98288282e-04]], dtype=float32)

In [332]:
best_k

array([23, 39,  0])

In [331]:
# Demo of how beam proposals are built: repeat a prefix once per candidate,
# then append one candidate per row with numpy.c_.
# Uses the notebook-level `numpy` import (`np` is never imported at top level).
# NOTE: `best_k` is only assigned by the beam-search cell further down; run
# that cell first.
A = numpy.tile([2, 32], (len(best_k), 1))
numpy.c_[A, best_k]

array([[ 2, 32, 23],
       [ 2, 32, 39],
       [ 2, 32,  0]])

In [343]:
# Exploratory beam-search step (work in progress): expand a random seed with
# its k most likely next characters, then score each expansion one step
# further. Only ONE step is executed (see the `break` below).

# pick a random seed and copy it so nothing here can alter dataX
# (randint's upper bound is exclusive, so len(dataX) covers every pattern)
start = numpy.random.randint(0, len(dataX))
pattern = list(dataX[start])
print("Seed:")
print( "\"", ''.join([int_to_char[value] for value in pattern]), "\"")
main_sentence = list(pattern)
k = 3
# generate characters
for i in range(1000):
    # take the sequence ( <=> pattern), reshape (batch=1, len seq, features)
    x = numpy.reshape(pattern, (1, len(pattern), 1))
    # normalize
    x = x / float(n_vocab)

    # log-probabilities of the next character
    predictions = numpy.log(model.predict(x, verbose=0)).reshape(-1)
    print(predictions.shape)

    # k best next characters (ascending by score) and their log-scores
    best_k = numpy.argsort(predictions)[-k:]
    best_k_scores = predictions[best_k]

    # k new sentence proposals: one row per candidate (k rows, not a fixed 3),
    # candidate appended, first character dropped to keep the window length
    proposals = numpy.tile(pattern, (k, 1))
    proposals = numpy.c_[proposals, best_k][:, 1:]
    print(proposals)

    # predict all k sequences in one batch; the proposals hold raw integer
    # codes, so they need the same 1/n_vocab scaling as every other input
    proposals_x = proposals.reshape(k, len(pattern), 1) / float(n_vocab)
    predictions = numpy.log(model.predict(proposals_x, batch_size=k))
    # takes k best FOR EACH k SEQUENCE
    print(predictions)
    best_k_after = numpy.argsort(predictions)[:, -k:]
    best_k_scores_after = numpy.sort(predictions)[:, -k:]
    print(best_k_scores)

    # combine with the previous scores: probabilities multiply, so their logs
    # add. joint_scores[r, c] scores candidate r followed by best_k_after[r, c]
    joint_scores = best_k_scores[:, None] + best_k_scores_after

    print(best_k)
    # TODO: keep the k best (first, second) pairs by joint score, extend
    # main_sentence with the winner and keep iterating. Until that is
    # implemented, stop after the first step.
    break
print("\nDone.")

Seed:
" began bowing to the king , the queen , the royal children , and everybody else . ' leave off that !' "
(46,)
[[24 26 20 33  0 21 34 42 28 33 26  0 39 34  0 39 27 24  0 30 28 33 26  0
   7  0 39 27 24  0 36 40 24 24 33  0  7  0 39 27 24  0 37 34 44 20 31  0
  22 27 28 31 23 37 24 33  0  7  0 20 33 23  0 24 41 24 37 44 21 34 23 44
   0 24 31 38 24  0  9  0  3  0 31 24 20 41 24  0 34 25 25  0 39 27 20 39
   0  1  3 38]
 [24 26 20 33  0 21 34 42 28 33 26  0 39 34  0 39 27 24  0 30 28 33 26  0
   7  0 39 27 24  0 36 40 24 24 33  0  7  0 39 27 24  0 37 34 44 20 31  0
  22 27 28 31 23 37 24 33  0  7  0 20 33 23  0 24 41 24 37 44 21 34 23 44
   0 24 31 38 24  0  9  0  3  0 31 24 20 41 24  0 34 25 25  0 39 27 20 39
   0  1  3 39]
 [24 26 20 33  0 21 34 42 28 33 26  0 39 34  0 39 27 24  0 30 28 33 26  0
   7  0 39 27 24  0 36 40 24 24 33  0  7  0 39 27 24  0 37 34 44 20 31  0
  22 27 28 31 23 37 24 33  0  7  0 20 33 23  0 24 41 24 37 44 21 34 23 44
   0 24 31 38 24  0  9  0  3  0 31 24 2

In [None]:
def beam_search_decoder(sequence, k, model, n_steps=1, n_vocab=None):
    """Extend a seed sequence using beam search over a next-token model.

    At every step each surviving beam is expanded with its ``k`` most probable
    next tokens; all expansions are ranked by cumulative log-probability and
    only the ``k`` best are kept for the following step.

    Parameters
    ----------
    sequence : sequence of int
        Encoded seed. Its length fixes the input window fed to the model; the
        argument itself is never modified.
    k : int
        Beam width (number of hypotheses kept after each step), >= 1.
    model : object
        Anything exposing ``predict(batch, verbose=0)`` that maps an array of
        shape ``(n, window, 1)`` to next-token probabilities of shape
        ``(n, vocabulary)``.
    n_steps : int, optional
        Number of tokens to generate per hypothesis (default 1).
    n_vocab : int, optional
        Divisor used to scale the integer codes before prediction. When
        omitted, ``model.output_shape[-1]`` is used, which assumes the model
        has one output unit per vocabulary entry.

    Returns
    -------
    list of [list of int, float]
        Up to ``k`` ``[generated_tokens, log_probability]`` pairs, best first.
        ``generated_tokens`` holds only the new tokens, not the seed. With
        ``n_steps == 0`` the result is ``[[[], 0.0]]``.

    Raises
    ------
    ValueError
        If ``sequence`` is empty, ``k < 1`` or ``n_steps < 0``.
    """
    import numpy as np

    seed = [int(token) for token in sequence]
    if not seed:
        raise ValueError("sequence must contain at least one encoded token")
    if k < 1:
        raise ValueError("k must be a positive integer")
    if n_steps < 0:
        raise ValueError("n_steps must be non-negative")
    if n_vocab is None:
        n_vocab = model.output_shape[-1]

    window = len(seed)
    scale = float(n_vocab)

    # each hypothesis is [generated tokens, cumulative log-probability]
    sequences = [[list(), 0.0]]
    for _ in range(n_steps):
        # model input for a hypothesis = last `window` tokens of seed + generated
        contexts = [(seed + generated)[-window:] for generated, _ in sequences]
        batch = np.asarray(contexts, dtype=np.float64)
        batch = batch.reshape(len(sequences), window, 1) / scale

        # one predict call for all live hypotheses
        probs = np.asarray(model.predict(batch, verbose=0), dtype=np.float64)
        probs = probs.reshape(len(sequences), -1)
        with np.errstate(divide="ignore"):  # log(0) -> -inf is a valid score
            log_probs = np.log(probs)

        all_candidates = list()
        for (generated, score), row in zip(sequences, log_probs):
            # The overall k best can only come from each beam's own k best,
            # so there is no need to expand the whole vocabulary.
            for token in np.argsort(row)[::-1][:k]:
                candidate = [generated + [int(token)], score + float(row[token])]
                all_candidates.append(candidate)

        # order all candidates by score and keep the k best
        ordered = sorted(all_candidates, key=lambda tup: tup[1], reverse=True)
        sequences = ordered[:k]
    return sequences