In [1]:
import tensorflow as tf
physical_devices = tf.config.list_physical_devices('GPU')
tf.config.experimental.set_memory_growth(physical_devices[0], enable=True)

# Chapter 13 Word Embeddings

## Load text, load glove, embed and create weight matrix

In [74]:
from bs4 import BeautifulSoup
from urllib.request import urlopen
import re
from collections import Counter
from keras.preprocessing.text import Tokenizer 
from keras.preprocessing.sequence import pad_sequences
from numpy import asarray, zeros

In [75]:
url='https://en.wikipedia.org/wiki/Iron'
html=urlopen(url)
bs=BeautifulSoup(html, 'html.parser')
text=[]
for para in bs.find_all('p'):
    text.append(para.get_text())
    
text=[item.strip() for item in text]

iron=text[2:5]
print(iron)

["Iron (/ˈaɪərn/) is a chemical element with symbol Fe (from Latin: ferrum) and atomic number 26. It is a metal that belongs to the first transition series and group 8 of the periodic table. It is by mass the most common element on Earth, right in front of oxygen (32.1% and 30.1%, respectively), forming much of Earth's outer and inner core. It is the fourth most common element in the Earth's crust.", "In its metallic state, iron is rare in the Earth's crust, limited mainly to deposition by meteorites. Iron ores, by contrast, are among the most abundant in the Earth's crust, although extracting usable metal from them requires kilns or furnaces capable of reaching 1,500\xa0°C (2,730\xa0°F) or higher, about 500\xa0°C (900\xa0°F) higher than what is enough to smelt copper. Humans started to master that process in Eurasia only about 2000 BCE[not verified in body], and the use of iron tools and weapons began to displace copper alloys, in some regions, only around 1200 BCE. That event is cons

In [37]:
# just to view an example:
# from keras.preprocessing.text import one_hot
# vocab_size=50
# encoded_docs=[one_hot(d, vocab_size) for d in iron]
# print(encoded_docs)

[[42, 36, 3, 14, 9, 40, 29, 21, 11, 18, 37, 25, 44, 32, 4, 45, 38, 3, 14, 40, 6, 21, 34, 36, 8, 35, 42, 44, 32, 35, 7, 36, 40, 10, 38, 3, 44, 29, 36, 48, 19, 40, 27, 5, 19, 48, 11, 7, 46, 3, 11, 44, 43, 11, 31, 46, 32, 7, 12, 11, 44, 16, 49, 38, 3, 36, 27, 48, 19, 40, 48, 36, 12, 17], [48, 45, 39, 12, 42, 3, 15, 48, 36, 12, 17, 46, 6, 34, 3, 44, 15, 42, 12, 44, 31, 19, 16, 36, 48, 18, 48, 36, 12, 17, 48, 20, 5, 40, 18, 14, 4, 1, 26, 37, 11, 7, 16, 11, 16, 7, 6, 26, 45, 31, 16, 19, 45, 8, 15, 3, 46, 34, 7, 15, 22, 30, 34, 35, 6, 22, 48, 49, 5, 31, 5, 41, 6, 46, 48, 14, 44, 36, 21, 7, 42, 21, 44, 26, 40, 34, 44, 15, 14, 48, 18, 4, 5, 19, 8, 41, 6, 37, 3, 41, 36, 35, 18, 36, 3, 14, 34, 36, 42, 14, 48, 36, 17, 8, 42, 14, 48, 8, 27, 3, 48, 42, 44, 9, 24, 19, 44, 6, 36, 48, 19, 27, 9, 49, 7, 11, 18, 23, 44, 35, 2], [30, 44, 40, 45, 42, 24, 19, 32, 34, 35, 32, 23, 42, 47, 43, 29, 46, 44, 25, 34, 12, 22, 34, 2, 7, 42, 11, 20, 18, 8, 14, 32, 36, 11, 7, 18, 13, 9, 6, 10, 4, 4, 14, 14, 14, 42, 8,

In [117]:
#Tokenize sentences
t=Tokenizer()
t.fit_on_texts(iron)
vocab_size=len(t.word_index)+1

encoded_docs=t.texts_to_sequences(iron)
print(encoded_docs)

#Run a check on vocab size, sentence lengths:
sent_len=[]
tokens=[]
for x in iron:
    sent_len.append(len(x.split()))
    for word in x.split():
        tokens.append(word)

print('sentence lengths: ', sent_len)
print('vocab size: ', len(set(tokens)))

#bag of words just FYI
bow=Counter(tokens)

[[3, 40, 6, 20, 41, 12, 21, 42, 43, 13, 44, 45, 2, 46, 47, 48, 14, 6, 20, 15, 8, 49, 5, 1, 50, 22, 51, 2, 52, 53, 7, 1, 54, 55, 14, 6, 9, 56, 1, 10, 16, 12, 57, 58, 59, 4, 60, 7, 23, 61, 17, 2, 62, 17, 63, 64, 65, 7, 11, 66, 2, 67, 68, 14, 6, 1, 69, 10, 16, 12, 4, 1, 11, 18], [4, 70, 71, 72, 3, 6, 73, 4, 1, 11, 18, 74, 75, 5, 76, 9, 77, 3, 78, 9, 79, 19, 80, 1, 10, 81, 4, 1, 11, 18, 82, 83, 84, 15, 13, 85, 86, 87, 24, 88, 89, 7, 90, 17, 25, 91, 92, 24, 26, 27, 25, 93, 26, 28, 94, 6, 95, 5, 96, 29, 97, 98, 5, 99, 8, 100, 4, 101, 30, 27, 102, 31, 103, 104, 4, 105, 2, 1, 106, 7, 3, 107, 2, 108, 109, 5, 110, 29, 32, 4, 33, 111, 30, 112, 113, 31, 8, 114, 6, 115, 1, 22, 13, 1, 116, 34, 5, 1, 3, 34, 4, 1, 117, 118, 3, 32, 119, 35, 120, 121, 122, 3, 2, 123, 124, 19, 9, 125, 1, 10, 16, 126, 36, 127, 7, 128, 129, 130, 2, 131, 132], [133, 2, 134, 135, 3, 37, 19, 136, 137, 138, 139, 140, 3, 141, 142, 21, 23, 2, 143, 5, 144, 145, 5, 146, 147, 3, 38, 148, 149, 35, 39, 150, 1, 38, 7, 33, 151, 36, 8, 

In [77]:
max_length=80
padded_docs=pad_sequences(encoded_docs, maxlen=max_length, padding='post')
print(padded_docs)

[[  3  40   6  20  41  12  21  42  43  13  44  45   2  46  47  48  14   6
   20  15   8  49   5   1  50  22  51   2  52  53   7   1  54  55  14   6
    9  56   1  10  16  12  57  58  59   4  60   7  23  61  17   2  62  17
   63  64  65   7  11  66   2  67  68  14   6   1  69  10  16  12   4   1
   11  18   0   0   0   0   0   0]
 [ 98   5  99   8 100   4 101  30  27 102  31 103 104   4 105   2   1 106
    7   3 107   2 108 109   5 110  29  32   4  33 111  30 112 113  31   8
  114   6 115   1  22  13   1 116  34   5   1   3  34   4   1 117 118   3
   32 119  35 120 121 122   3   2 123 124  19   9 125   1  10  16 126  36
  127   7 128 129 130   2 131 132]
 [133   2 134 135   3  37  19 136 137 138 139 140   3 141 142  21  23   2
  143   5 144 145   5 146 147   3  38 148 149  35  39 150   1  38   7  33
  151  36   8 152 153 154  39 155 156 157  28   1  15   2 158 159 160 161
  162  37 163 164   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0   0   0]]


In [78]:
#Load 50 dimensions glove vectors
embeddings_index=dict()
f=open('/home/meeka/Documents/Glove/vectors/glove6B/glove.6B.50d.txt')
for line in f:
    values=line.split()
    word=values[0]
    coefs=asarray(values[1:], dtype='float32')
    embeddings_index[word]=coefs
f.close()
print('Loaded %s word vectors. ' % len(embeddings_index))

Loaded 400000 word vectors. 


In [80]:
#Since we don't need that many vectors (above), filter for only words in training set:
embedding_matrix=zeros((vocab_size, 50))
for word, i in t.word_index.items():
    embedding_vector=embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i]=embedding_vector

print(embedding_matrix)
print(embedding_matrix.shape)

[[ 0.          0.          0.         ...  0.          0.
   0.        ]
 [ 0.41800001  0.24968    -0.41242    ... -0.18411    -0.11514
  -0.78580999]
 [ 0.26818001  0.14346001 -0.27877    ... -0.63209999 -0.25027999
  -0.38097   ]
 ...
 [ 0.63427001  0.080184   -0.56338    ...  0.89538997  0.30136999
  -0.29934001]
 [ 0.15272     0.36181    -0.22168    ...  0.43382001 -0.084617
   0.1214    ]
 [ 0.00975    -0.34588     0.14711    ... -0.11084    -0.12771
  -0.032414  ]]
(165, 50)


# Chapter 18: Neural Language Modeling

In [10]:
from numpy import array
from keras.utils import to_categorical
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from keras.preprocessing.sequence import pad_sequences

In [3]:
filename='/home/meeka/Desktop/Research/reading/full_deep_learning_bundle_7books/deep_learning_for_nlp/rhyme.txt'

def load_doc(filename):
    file=open(filename, 'r')
    text=file.read()
    file.close()
    return text

raw_text=load_doc(filename)

print(raw_text)

#Encode into sequences
tokens=raw_text.split()
raw_text=' '.join(tokens)

length=10
sequences=list()
for i in range(length, len(raw_text)):
    seq=raw_text[i-length:i+1]
    sequences.append(seq)
print('Total Sequences: %d' % len(sequences))


#Save to disk
def save_doc(lines, filename):
    data='\n'.join(lines)
    file=open(filename, 'w')
    file.write(data)
    file.close()
    
out_filename='/home/meeka/Desktop/Research/reading/full_deep_learning_bundle_7books/deep_learning_for_nlp/char_sequences.txt'
save_doc(sequences, out_filename)

Sing a song of sixpence,
A pocket full of rye.
Four and twenty blackbirds,
Baked in a pie.
When the pie was opened
The birds began to sing;
Wasn ' t that a dainty dish,
To set before the king.
The king was in his counting house,
Counting out his money;
The queen was in the parlour,
Eating bread and honey.
The maid was in the garden,
Hanging out the clothes,
When down came a blackbird
And pecked off her nose.

Total Sequences: 401


In [4]:
in_filename=out_filename
raw_text=load_doc(in_filename)
lines=raw_text.split('\n')

In [5]:
#encode sequences as integers
chars=sorted(list(set(raw_text)))
mapping=dict((c,i) for i,c in enumerate(chars))
print(mapping)

#process sequences
sequences=list()
for line in lines:
    encoded_seq=[mapping[char] for char in line]
    sequences.append(encoded_seq)

vocab_size=len(mapping)
print('Vocabulary Size: %d' % vocab_size)

#create input/output distinction & one hot encode
sequences=array(sequences)
X,y=sequences[:,:-1], sequences[:, -1]

sequences=[to_categorical(x, num_classes=vocab_size) for x in X]
X=array(sequences)
y=to_categorical(y, num_classes=vocab_size)

{'\n': 0, ' ': 1, "'": 2, ',': 3, '.': 4, ';': 5, 'A': 6, 'B': 7, 'C': 8, 'E': 9, 'F': 10, 'H': 11, 'S': 12, 'T': 13, 'W': 14, 'a': 15, 'b': 16, 'c': 17, 'd': 18, 'e': 19, 'f': 20, 'g': 21, 'h': 22, 'i': 23, 'k': 24, 'l': 25, 'm': 26, 'n': 27, 'o': 28, 'p': 29, 'q': 30, 'r': 31, 's': 32, 't': 33, 'u': 34, 'w': 35, 'x': 36, 'y': 37}
Vocabulary Size: 38


In [6]:
#input layer takes sequences with 10 time steps and 38 features

def define_model(X):
    model=Sequential()
    model.add(LSTM(75, input_shape=(X.shape[1], X.shape[2])))
    model.add(Dense(vocab_size, activation='softmax'))
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    model.summary()
    return model

In [7]:
model=define_model(X)
model.fit(X, y, epochs=100, verbose=2)

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm (LSTM)                  (None, 75)                34200     
_________________________________________________________________
dense (Dense)                (None, 38)                2888      
Total params: 37,088
Trainable params: 37,088
Non-trainable params: 0
_________________________________________________________________
Epoch 1/100
13/13 - 0s - loss: 3.6037 - accuracy: 0.1272
Epoch 2/100
13/13 - 0s - loss: 3.4659 - accuracy: 0.1945
Epoch 3/100
13/13 - 0s - loss: 3.1605 - accuracy: 0.1945
Epoch 4/100
13/13 - 0s - loss: 3.0425 - accuracy: 0.1945
Epoch 5/100
13/13 - 0s - loss: 3.0104 - accuracy: 0.1945
Epoch 6/100
13/13 - 0s - loss: 2.9890 - accuracy: 0.1945
Epoch 7/100
13/13 - 0s - loss: 2.9750 - accuracy: 0.1945
Epoch 8/100
13/13 - 0s - loss: 2.9637 - accuracy: 0.1945
Epoch 9/100
13/13 - 0s - loss: 2.9510 - accuracy: 0.1945
E

<tensorflow.python.keras.callbacks.History at 0x7f4fd00ff490>

In [None]:
#Option to save model, mappings:
from pickle import dump
model.save('model.h5')
dump(mapping, open('mapping.pkl', 'wb'))

## Generate text 

In [11]:
def generate_seq(model, mapping, seq_length, seed_text, n_chars):
    in_text=seed_text
    for _ in range(n_chars):
        encoded=[mapping[char] for char in in_text]
        encoded=pad_sequences([encoded], maxlen=seq_length, truncating='pre')
        encoded=to_categorical(encoded, num_classes=len(mapping))
        yhat=model.predict_classes(encoded, verbose=0)
        out_char=''
        for char, index in mapping.items():
            if index == yhat:
                out_char=char
                break
        in_text+=out_char
    return in_text

print(generate_seq(model, mapping, 10, 'Sing a son', 20))

Instructions for updating:
Please use instead:* `np.argmax(model.predict(x), axis=-1)`,   if your model does multi-class classification   (e.g. if it uses a `softmax` last-layer activation).* `(model.predict(x) > 0.5).astype("int32")`,   if your model does binary classification   (e.g. if it uses a `sigmoid` last-layer activation).
Sing a song of sixpence, A poc


In [13]:
print(generate_seq(model, mapping, 10, 'if  was king ', 20))

if  was king i p ing cfperi off  


# Chapter 19: Word Based Neural Language Model

In [17]:
#Option 1 read two words (One in one out)

from numpy import array
from keras.preprocessing.text import Tokenizer
from keras.utils import to_categorical
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from keras.layers import Embedding

def generate_seq(model, tokenizer, seed_text, n_words):
    in_text, result=seed_text, seed_text
    for _ in range(n_words):
        encoded=tokenizer.texts_to_sequences([in_text])[0]
        encoded=array(encoded)
        yhat=model.predict_classes(encoded, verbose=0)
        out_word=''
        for word, index in tokenizer.word_index.items():
            if index==yhat:
                out_word=word
                break
        in_text, result=out_word, result + ' ' + out_word
    return result

def define_model(vocab_size):
    model=Sequential()
    model.add(Embedding(vocab_size, 10, input_length=1))
    model.add(LSTM(50))
    model.add(Dense(vocab_size, activation='softmax'))
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    model.summary()
    return model

data="""Jack and Jill went up the hill\n
To fetch a pail of water\n
Jack fell down and broke his crown\n
And Jill came tumbling after\n"""

tokenizer=Tokenizer()
tokenizer.fit_on_texts([data])
encoded=tokenizer.texts_to_sequences([data])[0]
vocab_size=len(tokenizer.word_index)+1
print('Vocabulary size: %d' % vocab_size)

sequences=list()
for i in range(1, len(encoded)):
    sequence=encoded[i-1:i+1]
    sequences.append(sequence)
print('Total Sequences: %d' % len(sequences))
sequences=array(sequences)

X,y=sequences[:,0], sequences[:,1]
y=to_categorical(y, num_classes=vocab_size)

model=define_model(vocab_size)
model.fit(X,y, epochs=500, verbose=2)
print(generate_seq(model, tokenizer, 'Jack', 6))

Vocabulary size: 22
Total Sequences: 24
Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 1, 10)             220       
_________________________________________________________________
lstm_2 (LSTM)                (None, 50)                12200     
_________________________________________________________________
dense_2 (Dense)              (None, 22)                1122      
Total params: 13,542
Trainable params: 13,542
Non-trainable params: 0
_________________________________________________________________
Epoch 1/500
1/1 - 0s - loss: 3.0911 - accuracy: 0.0417
Epoch 2/500
1/1 - 0s - loss: 3.0904 - accuracy: 0.0833
Epoch 3/500
1/1 - 0s - loss: 3.0896 - accuracy: 0.0833
Epoch 4/500
1/1 - 0s - loss: 3.0888 - accuracy: 0.1250
Epoch 5/500
1/1 - 0s - loss: 3.0881 - accuracy: 0.1250
Epoch 6/500
1/1 - 0s - loss: 3.0873 - accuracy: 0.1250
Epoch 7/50

Epoch 133/500
1/1 - 0s - loss: 2.4362 - accuracy: 0.2083
Epoch 134/500
1/1 - 0s - loss: 2.4223 - accuracy: 0.2083
Epoch 135/500
1/1 - 0s - loss: 2.4083 - accuracy: 0.2083
Epoch 136/500
1/1 - 0s - loss: 2.3942 - accuracy: 0.2083
Epoch 137/500
1/1 - 0s - loss: 2.3799 - accuracy: 0.2083
Epoch 138/500
1/1 - 0s - loss: 2.3655 - accuracy: 0.2083
Epoch 139/500
1/1 - 0s - loss: 2.3509 - accuracy: 0.2083
Epoch 140/500
1/1 - 0s - loss: 2.3363 - accuracy: 0.2083
Epoch 141/500
1/1 - 0s - loss: 2.3216 - accuracy: 0.2083
Epoch 142/500
1/1 - 0s - loss: 2.3067 - accuracy: 0.2083
Epoch 143/500
1/1 - 0s - loss: 2.2918 - accuracy: 0.2083
Epoch 144/500
1/1 - 0s - loss: 2.2768 - accuracy: 0.2083
Epoch 145/500
1/1 - 0s - loss: 2.2617 - accuracy: 0.2917
Epoch 146/500
1/1 - 0s - loss: 2.2465 - accuracy: 0.2917
Epoch 147/500
1/1 - 0s - loss: 2.2312 - accuracy: 0.2917
Epoch 148/500
1/1 - 0s - loss: 2.2159 - accuracy: 0.2917
Epoch 149/500
1/1 - 0s - loss: 2.2005 - accuracy: 0.3333
Epoch 150/500
1/1 - 0s - loss: 

Epoch 277/500
1/1 - 0s - loss: 0.6094 - accuracy: 0.8750
Epoch 278/500
1/1 - 0s - loss: 0.6031 - accuracy: 0.8750
Epoch 279/500
1/1 - 0s - loss: 0.5968 - accuracy: 0.8750
Epoch 280/500
1/1 - 0s - loss: 0.5907 - accuracy: 0.8750
Epoch 281/500
1/1 - 0s - loss: 0.5847 - accuracy: 0.8750
Epoch 282/500
1/1 - 0s - loss: 0.5787 - accuracy: 0.8750
Epoch 283/500
1/1 - 0s - loss: 0.5729 - accuracy: 0.8750
Epoch 284/500
1/1 - 0s - loss: 0.5671 - accuracy: 0.8750
Epoch 285/500
1/1 - 0s - loss: 0.5615 - accuracy: 0.8750
Epoch 286/500
1/1 - 0s - loss: 0.5559 - accuracy: 0.8750
Epoch 287/500
1/1 - 0s - loss: 0.5504 - accuracy: 0.8750
Epoch 288/500
1/1 - 0s - loss: 0.5450 - accuracy: 0.8750
Epoch 289/500
1/1 - 0s - loss: 0.5397 - accuracy: 0.8750
Epoch 290/500
1/1 - 0s - loss: 0.5345 - accuracy: 0.8750
Epoch 291/500
1/1 - 0s - loss: 0.5294 - accuracy: 0.8750
Epoch 292/500
1/1 - 0s - loss: 0.5244 - accuracy: 0.8750
Epoch 293/500
1/1 - 0s - loss: 0.5194 - accuracy: 0.8750
Epoch 294/500
1/1 - 0s - loss: 

Epoch 421/500
1/1 - 0s - loss: 0.2662 - accuracy: 0.8750
Epoch 422/500
1/1 - 0s - loss: 0.2656 - accuracy: 0.8750
Epoch 423/500
1/1 - 0s - loss: 0.2650 - accuracy: 0.8750
Epoch 424/500
1/1 - 0s - loss: 0.2644 - accuracy: 0.8750
Epoch 425/500
1/1 - 0s - loss: 0.2638 - accuracy: 0.8750
Epoch 426/500
1/1 - 0s - loss: 0.2632 - accuracy: 0.8750
Epoch 427/500
1/1 - 0s - loss: 0.2627 - accuracy: 0.8750
Epoch 428/500
1/1 - 0s - loss: 0.2621 - accuracy: 0.8750
Epoch 429/500
1/1 - 0s - loss: 0.2615 - accuracy: 0.8750
Epoch 430/500
1/1 - 0s - loss: 0.2610 - accuracy: 0.8750
Epoch 431/500
1/1 - 0s - loss: 0.2604 - accuracy: 0.8750
Epoch 432/500
1/1 - 0s - loss: 0.2599 - accuracy: 0.8750
Epoch 433/500
1/1 - 0s - loss: 0.2594 - accuracy: 0.8750
Epoch 434/500
1/1 - 0s - loss: 0.2589 - accuracy: 0.8750
Epoch 435/500
1/1 - 0s - loss: 0.2583 - accuracy: 0.8750
Epoch 436/500
1/1 - 0s - loss: 0.2578 - accuracy: 0.8750
Epoch 437/500
1/1 - 0s - loss: 0.2573 - accuracy: 0.8750
Epoch 438/500
1/1 - 0s - loss: 

In [20]:
#Option 2, build sequences line by line (grow sentence one word at a time)

from numpy import array
from keras.preprocessing.text import Tokenizer
from keras.utils import to_categorical
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from keras.layers import Embedding


def generate_seq(model, tokenizer, max_length, seed_text, n_words):
    in_text=seed_text
    for _ in range(n_words):
        encoded=tokenizer.texts_to_sequences([in_text])[0]
        encoded=pad_sequences([encoded], maxlen=max_length, padding='pre')
        yhat=model.predict_classes(encoded, verbose=0)
        out_word=''
        for word, index in tokenizer.word_index.items():
            if index==yhat:
                out_word=word
                break
        in_text+= ' ' + out_word
    return in_text

def define_model(vocab_size, max_length):
    model=Sequential()
    model.add(Embedding(vocab_size, 10, input_length=max_length-1))
    model.add(LSTM(50))
    model.add(Dense(vocab_size, activation='softmax'))
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    model.summary()
    return model

data="""Jack and Jill went up the hill\n
To fetch a pail of water\n
Jack fell down and broke his crown\n
And Jill came tumbling after\n"""

tokenizer=Tokenizer()
tokenizer.fit_on_texts([data])
vocab_size=len(tokenizer.word_index)+1
print('Vocabulary size: %d' % vocab_size)

#create line based sequences:
sequences=list()
for line in data.split('\n'):
    encoded=tokenizer.texts_to_sequences([line])[0]
    for i in range(1, len(encoded)):
        sequence=encoded[:i+1]
        sequences.append(sequence)
print('Total Sequences: %d' % len(sequences))

max_length=max([len(seq) for seq in sequences])
sequences=pad_sequences(sequences, maxlen=max_length, padding='pre')
print('Max Sequence Length: %d' % max_length)
sequences=array(sequences)

X,y=sequences[:,:-1], sequences[:,-1]
y=to_categorical(y, num_classes=vocab_size)

model=define_model(vocab_size, max_length)
model.fit(X,y, epochs=500, verbose=2)

print(generate_seq(model, tokenizer, max_length-1, 'Jack', 4))
print(generate_seq(model, tokenizer, max_length-1, 'Jill', 4))

Vocabulary size: 22
Total Sequences: 21
Max Sequence Length: 7
Model: "sequential_5"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_4 (Embedding)      (None, 6, 10)             220       
_________________________________________________________________
lstm_5 (LSTM)                (None, 50)                12200     
_________________________________________________________________
dense_5 (Dense)              (None, 22)                1122      
Total params: 13,542
Trainable params: 13,542
Non-trainable params: 0
_________________________________________________________________
Epoch 1/500
1/1 - 0s - loss: 3.0889 - accuracy: 0.0952
Epoch 2/500
1/1 - 0s - loss: 3.0872 - accuracy: 0.1429
Epoch 3/500
1/1 - 0s - loss: 3.0856 - accuracy: 0.1429
Epoch 4/500
1/1 - 0s - loss: 3.0838 - accuracy: 0.1429
Epoch 5/500
1/1 - 0s - loss: 3.0821 - accuracy: 0.1429
Epoch 6/500
1/1 - 0s - loss: 3.0802 - accu

Epoch 133/500
1/1 - 0s - loss: 1.1333 - accuracy: 0.7619
Epoch 134/500
1/1 - 0s - loss: 1.1225 - accuracy: 0.7619
Epoch 135/500
1/1 - 0s - loss: 1.1104 - accuracy: 0.7619
Epoch 136/500
1/1 - 0s - loss: 1.1014 - accuracy: 0.7619
Epoch 137/500
1/1 - 0s - loss: 1.0908 - accuracy: 0.8095
Epoch 138/500
1/1 - 0s - loss: 1.0789 - accuracy: 0.7619
Epoch 139/500
1/1 - 0s - loss: 1.0697 - accuracy: 0.7619
Epoch 140/500
1/1 - 0s - loss: 1.0594 - accuracy: 0.8095
Epoch 141/500
1/1 - 0s - loss: 1.0476 - accuracy: 0.7619
Epoch 142/500
1/1 - 0s - loss: 1.0382 - accuracy: 0.7619
Epoch 143/500
1/1 - 0s - loss: 1.0283 - accuracy: 0.8095
Epoch 144/500
1/1 - 0s - loss: 1.0167 - accuracy: 0.7619
Epoch 145/500
1/1 - 0s - loss: 1.0072 - accuracy: 0.7619
Epoch 146/500
1/1 - 0s - loss: 0.9976 - accuracy: 0.8095
Epoch 147/500
1/1 - 0s - loss: 0.9864 - accuracy: 0.8095
Epoch 148/500
1/1 - 0s - loss: 0.9767 - accuracy: 0.8095
Epoch 149/500
1/1 - 0s - loss: 0.9674 - accuracy: 0.8095
Epoch 150/500
1/1 - 0s - loss: 

Epoch 277/500
1/1 - 0s - loss: 0.3221 - accuracy: 0.9524
Epoch 278/500
1/1 - 0s - loss: 0.3195 - accuracy: 0.9524
Epoch 279/500
1/1 - 0s - loss: 0.3170 - accuracy: 0.9524
Epoch 280/500
1/1 - 0s - loss: 0.3145 - accuracy: 0.9524
Epoch 281/500
1/1 - 0s - loss: 0.3121 - accuracy: 0.9524
Epoch 282/500
1/1 - 0s - loss: 0.3096 - accuracy: 0.9524
Epoch 283/500
1/1 - 0s - loss: 0.3072 - accuracy: 0.9524
Epoch 284/500
1/1 - 0s - loss: 0.3048 - accuracy: 0.9524
Epoch 285/500
1/1 - 0s - loss: 0.3025 - accuracy: 0.9524
Epoch 286/500
1/1 - 0s - loss: 0.3001 - accuracy: 0.9524
Epoch 287/500
1/1 - 0s - loss: 0.2978 - accuracy: 0.9524
Epoch 288/500
1/1 - 0s - loss: 0.2955 - accuracy: 0.9524
Epoch 289/500
1/1 - 0s - loss: 0.2932 - accuracy: 0.9524
Epoch 290/500
1/1 - 0s - loss: 0.2909 - accuracy: 0.9524
Epoch 291/500
1/1 - 0s - loss: 0.2887 - accuracy: 0.9524
Epoch 292/500
1/1 - 0s - loss: 0.2865 - accuracy: 0.9524
Epoch 293/500
1/1 - 0s - loss: 0.2842 - accuracy: 0.9524
Epoch 294/500
1/1 - 0s - loss: 

Epoch 421/500
1/1 - 0s - loss: 0.1305 - accuracy: 0.9524
Epoch 422/500
1/1 - 0s - loss: 0.1300 - accuracy: 0.9524
Epoch 423/500
1/1 - 0s - loss: 0.1295 - accuracy: 0.9524
Epoch 424/500
1/1 - 0s - loss: 0.1290 - accuracy: 0.9524
Epoch 425/500
1/1 - 0s - loss: 0.1285 - accuracy: 0.9524
Epoch 426/500
1/1 - 0s - loss: 0.1280 - accuracy: 0.9524
Epoch 427/500
1/1 - 0s - loss: 0.1275 - accuracy: 0.9524
Epoch 428/500
1/1 - 0s - loss: 0.1270 - accuracy: 0.9524
Epoch 429/500
1/1 - 0s - loss: 0.1266 - accuracy: 0.9524
Epoch 430/500
1/1 - 0s - loss: 0.1261 - accuracy: 0.9524
Epoch 431/500
1/1 - 0s - loss: 0.1256 - accuracy: 0.9524
Epoch 432/500
1/1 - 0s - loss: 0.1252 - accuracy: 0.9524
Epoch 433/500
1/1 - 0s - loss: 0.1247 - accuracy: 0.9524
Epoch 434/500
1/1 - 0s - loss: 0.1242 - accuracy: 0.9524
Epoch 435/500
1/1 - 0s - loss: 0.1238 - accuracy: 0.9524
Epoch 436/500
1/1 - 0s - loss: 0.1234 - accuracy: 0.9524
Epoch 437/500
1/1 - 0s - loss: 0.1229 - accuracy: 0.9524
Epoch 438/500
1/1 - 0s - loss: 

In [21]:
#Option 3 read three words (two in one out)

from numpy import array
from keras.preprocessing.text import Tokenizer
from keras.utils import to_categorical
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from keras.layers import Embedding

def generate_seq(model, tokenizer, max_length, seed_text, n_words):
    in_text=seed_text
    for _ in range(n_words):
        encoded=tokenizer.texts_to_sequences([in_text])[0]
        encoded=pad_sequences([encoded], maxlen=max_length, padding='pre')
        yhat=model.predict_classes(encoded, verbose=0)
        out_word=''
        for word, index in tokenizer.word_index.items():
            if index==yhat:
                out_word=word
                break
        in_text += ' ' + out_word
    return in_text

def define_model(vocab_size, max_length):
    model=Sequential()
    model.add(Embedding(vocab_size, 10, input_length=max_length-1))
    model.add(LSTM(50))
    model.add(Dense(vocab_size, activation='softmax'))
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    model.summary()
    return model

data="""Jack and Jill went up the hill\n
To fetch a pail of water\n
Jack fell down and broke his crown\n
And Jill came tumbling after\n"""

tokenizer=Tokenizer()
tokenizer.fit_on_texts([data])
encoded=tokenizer.texts_to_sequences([data])[0]
vocab_size=len(tokenizer.word_index)+1
print('Vocabulary size: %d' % vocab_size)

sequences=list()
for i in range(2, len(encoded)):
    sequence=encoded[i-2:i+1]
    sequences.append(sequence)
print('Total Sequences: %d' % len(sequences))

max_length=max([len(seq) for seq in sequences])
sequences=pad_sequences(sequences, maxlen=max_length, padding='pre')
print('Max Sequence Length: %d' % max_length)
sequences=array(sequences)

X,y=sequences[:,:-1], sequences[:,-1]
y=to_categorical(y, num_classes=vocab_size)

model=define_model(vocab_size, max_length)
model.fit(X,y, epochs=500, verbose=2)

print(generate_seq(model, tokenizer, max_length-1, 'Jack and', 5))
print(generate_seq(model, tokenizer, max_length-1, 'And Jill', 3))
print(generate_seq(model, tokenizer, max_length-1, 'fell down', 5))
print(generate_seq(model, tokenizer, max_length-1, 'pail of', 5))

Vocabulary size: 22
Total Sequences: 23
Max Sequence Length: 3
Model: "sequential_6"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_5 (Embedding)      (None, 2, 10)             220       
_________________________________________________________________
lstm_6 (LSTM)                (None, 50)                12200     
_________________________________________________________________
dense_6 (Dense)              (None, 22)                1122      
Total params: 13,542
Trainable params: 13,542
Non-trainable params: 0
_________________________________________________________________
Epoch 1/500
1/1 - 0s - loss: 3.0913 - accuracy: 0.0435
Epoch 2/500
1/1 - 0s - loss: 3.0904 - accuracy: 0.1304
Epoch 3/500
1/1 - 0s - loss: 3.0895 - accuracy: 0.1304
Epoch 4/500
1/1 - 0s - loss: 3.0886 - accuracy: 0.0870
Epoch 5/500
1/1 - 0s - loss: 3.0876 - accuracy: 0.0870
Epoch 6/500
1/1 - 0s - loss: 3.0867 - accu

Epoch 133/500
1/1 - 0s - loss: 1.7074 - accuracy: 0.6087
Epoch 134/500
1/1 - 0s - loss: 1.6854 - accuracy: 0.6087
Epoch 135/500
1/1 - 0s - loss: 1.6634 - accuracy: 0.6087
Epoch 136/500
1/1 - 0s - loss: 1.6414 - accuracy: 0.6087
Epoch 137/500
1/1 - 0s - loss: 1.6194 - accuracy: 0.6087
Epoch 138/500
1/1 - 0s - loss: 1.5975 - accuracy: 0.6087
Epoch 139/500
1/1 - 0s - loss: 1.5756 - accuracy: 0.6522
Epoch 140/500
1/1 - 0s - loss: 1.5538 - accuracy: 0.6522
Epoch 141/500
1/1 - 0s - loss: 1.5319 - accuracy: 0.6522
Epoch 142/500
1/1 - 0s - loss: 1.5101 - accuracy: 0.6957
Epoch 143/500
1/1 - 0s - loss: 1.4883 - accuracy: 0.6957
Epoch 144/500
1/1 - 0s - loss: 1.4665 - accuracy: 0.6957
Epoch 145/500
1/1 - 0s - loss: 1.4448 - accuracy: 0.6957
Epoch 146/500
1/1 - 0s - loss: 1.4230 - accuracy: 0.7391
Epoch 147/500
1/1 - 0s - loss: 1.4013 - accuracy: 0.8261
Epoch 148/500
1/1 - 0s - loss: 1.3796 - accuracy: 0.8261
Epoch 149/500
1/1 - 0s - loss: 1.3580 - accuracy: 0.8261
Epoch 150/500
1/1 - 0s - loss: 

Epoch 277/500
1/1 - 0s - loss: 0.1344 - accuracy: 0.9565
Epoch 278/500
1/1 - 0s - loss: 0.1332 - accuracy: 0.9565
Epoch 279/500
1/1 - 0s - loss: 0.1321 - accuracy: 0.9565
Epoch 280/500
1/1 - 0s - loss: 0.1310 - accuracy: 0.9565
Epoch 281/500
1/1 - 0s - loss: 0.1299 - accuracy: 0.9565
Epoch 282/500
1/1 - 0s - loss: 0.1289 - accuracy: 0.9565
Epoch 283/500
1/1 - 0s - loss: 0.1279 - accuracy: 0.9565
Epoch 284/500
1/1 - 0s - loss: 0.1269 - accuracy: 0.9565
Epoch 285/500
1/1 - 0s - loss: 0.1259 - accuracy: 0.9565
Epoch 286/500
1/1 - 0s - loss: 0.1250 - accuracy: 0.9565
Epoch 287/500
1/1 - 0s - loss: 0.1240 - accuracy: 0.9565
Epoch 288/500
1/1 - 0s - loss: 0.1231 - accuracy: 0.9565
Epoch 289/500
1/1 - 0s - loss: 0.1222 - accuracy: 0.9565
Epoch 290/500
1/1 - 0s - loss: 0.1214 - accuracy: 0.9565
Epoch 291/500
1/1 - 0s - loss: 0.1205 - accuracy: 0.9565
Epoch 292/500
1/1 - 0s - loss: 0.1197 - accuracy: 0.9565
Epoch 293/500
1/1 - 0s - loss: 0.1189 - accuracy: 0.9565
Epoch 294/500
1/1 - 0s - loss: 

Epoch 421/500
1/1 - 0s - loss: 0.0787 - accuracy: 0.9565
Epoch 422/500
1/1 - 0s - loss: 0.0786 - accuracy: 0.9565
Epoch 423/500
1/1 - 0s - loss: 0.0785 - accuracy: 0.9565
Epoch 424/500
1/1 - 0s - loss: 0.0783 - accuracy: 0.9565
Epoch 425/500
1/1 - 0s - loss: 0.0782 - accuracy: 0.9565
Epoch 426/500
1/1 - 0s - loss: 0.0781 - accuracy: 0.9565
Epoch 427/500
1/1 - 0s - loss: 0.0780 - accuracy: 0.9565
Epoch 428/500
1/1 - 0s - loss: 0.0779 - accuracy: 0.9565
Epoch 429/500
1/1 - 0s - loss: 0.0778 - accuracy: 0.9565
Epoch 430/500
1/1 - 0s - loss: 0.0777 - accuracy: 0.9565
Epoch 431/500
1/1 - 0s - loss: 0.0776 - accuracy: 0.9565
Epoch 432/500
1/1 - 0s - loss: 0.0775 - accuracy: 0.9565
Epoch 433/500
1/1 - 0s - loss: 0.0774 - accuracy: 0.9565
Epoch 434/500
1/1 - 0s - loss: 0.0773 - accuracy: 0.9565
Epoch 435/500
1/1 - 0s - loss: 0.0772 - accuracy: 0.9565
Epoch 436/500
1/1 - 0s - loss: 0.0771 - accuracy: 0.9565
Epoch 437/500
1/1 - 0s - loss: 0.0770 - accuracy: 0.9565
Epoch 438/500
1/1 - 0s - loss: 

# Chapter 20: Neural Language Model for Text Generation

In [49]:
import re
import string
from numpy import array
from keras.preprocessing.text import Tokenizer
from keras.utils import to_categorical
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from keras.layers import Embedding
from pickle import dump

In [25]:
import urllib.request
urllib.request.urlretrieve('http://www.gutenberg.org/cache/epub/1497/pg1497.txt', 'republic_clean.txt')

('republic_clean.txt', <http.client.HTTPMessage at 0x7f4f4416ca50>)

In [40]:
def load_doc(filename):
    file=open(filename, 'r')
    text=file.read()
    file.close()
    return text

in_filename='republic_clean.txt'
doc=load_doc(in_filename)
print(doc[:200])

﻿BOOK I.

I went down yesterday to the Piraeus with Glaucon the son of Ariston,
that I might offer up my prayers to the goddess (Bendis, the Thracian
Artemis.); and also because I wanted to see in wha


In [41]:
def clean_doc(doc):
    doc=doc.replace('--', ' ')
    tokens=doc.split()
    re_punc=re.compile('[%s]' % re.escape(string.punctuation))
    tokens=[re_punc.sub('',w) for w in tokens]
    tokens=[word for word in tokens if word.isalpha()]
    tokens=[word.lower() for word in tokens]
    return tokens

tokens=clean_doc(doc)
print(tokens[:200])
print('Total Tokens: %d' % len(tokens))
print('Unique Tokens: %d' % len(set(tokens)))

['i', 'i', 'went', 'down', 'yesterday', 'to', 'the', 'piraeus', 'with', 'glaucon', 'the', 'son', 'of', 'ariston', 'that', 'i', 'might', 'offer', 'up', 'my', 'prayers', 'to', 'the', 'goddess', 'bendis', 'the', 'thracian', 'artemis', 'and', 'also', 'because', 'i', 'wanted', 'to', 'see', 'in', 'what', 'manner', 'they', 'would', 'celebrate', 'the', 'festival', 'which', 'was', 'a', 'new', 'thing', 'i', 'was', 'delighted', 'with', 'the', 'procession', 'of', 'the', 'inhabitants', 'but', 'that', 'of', 'the', 'thracians', 'was', 'equally', 'if', 'not', 'more', 'beautiful', 'when', 'we', 'had', 'finished', 'our', 'prayers', 'and', 'viewed', 'the', 'spectacle', 'we', 'turned', 'in', 'the', 'direction', 'of', 'the', 'city', 'and', 'at', 'that', 'instant', 'polemarchus', 'the', 'son', 'of', 'cephalus', 'chanced', 'to', 'catch', 'sight', 'of', 'us', 'from', 'a', 'distance', 'as', 'we', 'were', 'starting', 'on', 'our', 'way', 'home', 'and', 'told', 'his', 'servant', 'to', 'run', 'and', 'bid', 'us', '

In [45]:
#sequence peparation in order to iterate over 50 words to predict next 1 word:

length=50+1
sequences=list()
for i in range(length, len(tokens)):
    seq=tokens[i-length:i]
    line=' '.join(seq)
    sequences.append(line)
print('Total Sequences: %d' % len(sequences))

def save_doc(lines, filename):
    data='\n'.join(lines)
    file=open(filename, 'w')
    file.write(data)
    file.close()
    
out_filename='republic_sequences.txt'
save_doc(sequences,out_filename)

Total Sequences: 118632


## Begin final data prep, model phase:

In [52]:
in_filename='republic_sequences.txt'
doc=load_doc(in_filename)
lines=doc.split('\n')


def define_model(vocab_size, seq_length):
    model=Sequential()
    model.add(Embedding(vocab_size, 50, input_length=seq_length))
    model.add(LSTM(100, return_sequences=True))
    model.add(LSTM(100))
    model.add(Dense(100, activation='relu'))
    model.add(Dense(vocab_size, activation='softmax'))
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    model.summary()
    return model


tokenizer=Tokenizer()
tokenizer.fit_on_texts(lines)
sequences=tokenizer.texts_to_sequences(lines)
vocab_size=len(tokenizer.word_index)+1

sequences=array(sequences)
X,y=sequences[:,:-1], sequences[:,-1]
y=to_categorical(y, num_classes=vocab_size)
seq_length=X.shape[1]

In [53]:
model=define_model(vocab_size, seq_length)
model.fit(X,y, batch_size=128, epochs=30)
model.save('Republicmodel.h5')
dump(tokenizer, open('tokenizer.pkl', 'wb'))

Model: "sequential_8"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_7 (Embedding)      (None, 50, 50)            370500    
_________________________________________________________________
lstm_9 (LSTM)                (None, 50, 100)           60400     
_________________________________________________________________
lstm_10 (LSTM)               (None, 100)               80400     
_________________________________________________________________
dense_9 (Dense)              (None, 100)               10100     
_________________________________________________________________
dense_10 (Dense)             (None, 7410)              748410    
Total params: 1,269,810
Trainable params: 1,269,810
Non-trainable params: 0
_________________________________________________________________
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30

## Generate Text from model:

In [54]:
from random import randint
from pickle import load
from keras.models import load_model
from keras.preprocessing.sequence import pad_sequences

In [55]:
def load_doc(filename):
    file=open(filename, 'r')
    text=file.read()
    file.close()
    return text

def generate_seq(model, tokenizer, seq_length, seed_text, n_words):
    result=list()
    in_text=seed_text
    for _ in range(n_words):
        encoded=tokenizer.texts_to_sequences([in_text])[0]
        encoded=pad_sequences([encoded], maxlen=seq_length, truncating='pre')
        yhat=model.predict_classes(encoded, verbose=0)
        out_word=''
        for word, index in tokenizer.word_index.items():
            if index==yhat:
                out_word=word
                break
        in_text+=' '+out_word
        result.append(out_word)
    return ' '.join(result)
    
in_filename='republic_sequences.txt'
doc=load_doc(in_filename)
lines=doc.split('\n')
seq_length=len(lines[0].split())-1
model=load_model('Republicmodel.h5')
tokenizer=load(open('tokenizer.pkl', 'rb'))

#Select seed to start prediction:
seed_text=lines[randint(0,len(lines))]
print(seed_text+'\n')
generated=generate_seq(model, tokenizer, seq_length, seed_text, 50)
print(generated)

of the pilot may be improved by a sea voyage you would not be inclined to say would you that navigation is the art of medicine at least if we are to adopt your exact use of language certainly not or because a man is in good health when he receives

reverential one who is the best of the soul and the other and the other is the reverse of the soul and the other is the reverse of the soul and the other is the reverse of the soul and the other is the reverse of the soul and the
