# Wiki Text Generator

In [1]:
from __future__ import print_function
from keras.callbacks import LambdaCallback
from keras.models import Sequential
from keras.layers import Dense, Activation
from keras.layers import LSTM
from keras.optimizers import RMSprop
import nltk
import numpy as np
import random
import sys
import io

Using TensorFlow backend.


In [2]:
import numpy as np #vectorization
import random #generate probability distribution 
import tensorflow as tf #ml
import datetime #clock training time

In [3]:
# Make sure the NLTK "webtext" corpus is available locally.
# NOTE(review): `webtext` is not referenced by any other cell visible in this
# notebook (training reads 'wiki.test.raw') — confirm this cell is still needed.
from nltk.corpus import webtext 
nltk.download('webtext')

[nltk_data] Downloading package webtext to
[nltk_data]     C:\Users\69785hsh\AppData\Roaming\nltk_data...
[nltk_data]   Package webtext is already up-to-date!


True

# Getting data

In [4]:
# Read the whole corpus into memory as one string. The context manager closes
# the file handle as soon as the read finishes instead of leaving it open
# until the garbage collector happens to reclaim it.
with open('wiki.test.raw', encoding="utf8") as corpus_file:
    text = corpus_file.read()

In [5]:
# Report the corpus size in characters (whitespace and newlines included).
text_length = len(text)
print('text length in number of characters:', text_length)

text length in number of characters: 1288556


# Map chars to integers

In [6]:
# The vocabulary is every distinct character of the corpus. Sorting gives each
# character a stable position, which the index mappings below rely on.
chars = sorted(set(text))
print('total chars: ', len(chars))

total chars:  259


In [7]:
# Two lookup tables over the same enumeration of the vocabulary:
# character -> index for encoding inputs, index -> character for decoding
# the model's predictions.
char_indices = {ch: idx for idx, ch in enumerate(chars)}
indices_char = dict(enumerate(chars))

# Split up into subsequences

In [8]:
# Slide a window of `maxlen` characters over the corpus, moving `step`
# characters at a time. Every window is one training input; the character
# immediately after the window is its prediction target.
maxlen = 40
step = 3
window_starts = range(0, len(text) - maxlen, step)
sentences = [text[start: start + maxlen] for start in window_starts]
next_chars = [text[start + maxlen] for start in window_starts]
print('nb sequences:', len(sentences))

nb sequences: 429506


In [9]:
# Peek at the first three input windows, then at their target characters.
for preview in (sentences, next_chars):
    print(preview[:3])

[' \n = Robert Boulter = \n \n Robert Boulter', '= Robert Boulter = \n \n Robert Boulter is', 'obert Boulter = \n \n Robert Boulter is an']
[' ', ' ', ' ']


In [10]:
# One-hot encode the training set.
#   x[i, t, c] is True when character index c sits at position t of window i.
#   y[i, c]    is True when character index c is the one that follows window i.
# The builtin `bool` is used as the dtype: the `np.bool` alias was deprecated
# in NumPy 1.20 and removed in NumPy 1.24, where it raises AttributeError.
x = np.zeros((len(sentences), maxlen, len(chars)), dtype=bool)
y = np.zeros((len(sentences), len(chars)), dtype=bool)
for i, sentence in enumerate(sentences):
    for t, char in enumerate(sentence):
        x[i, t, char_indices[char]] = True
    y[i, char_indices[next_chars[i]]] = True

In [11]:
# Show the first three encoded windows, then their encoded targets.
for encoded in (x, y):
    print(encoded[:3])

[[[False  True False ... False False False]
  [ True False False ... False False False]
  [False  True False ... False False False]
  ...
  [False False False ... False False False]
  [False False False ... False False False]
  [False False False ... False False False]]

 [[False False False ... False False False]
  [False  True False ... False False False]
  [False False False ... False False False]
  ...
  [False  True False ... False False False]
  [False False False ... False False False]
  [False False False ... False False False]]

 [[False False False ... False False False]
  [False False False ... False False False]
  [False False False ... False False False]
  ...
  [False  True False ... False False False]
  [False False False ... False False False]
  [False False False ... False False False]]]
[[False  True False False False False False False False False False False
  False False False False False False False False False False False False
  False False False False False Fals

# Building Model

In [12]:
# A single 128-unit LSTM reads a one-hot window of shape (maxlen, vocabulary
# size); a dense layer plus softmax turns its output into one probability per
# vocabulary character.
model = Sequential([
    LSTM(128, input_shape=(maxlen, len(chars))),
    Dense(len(chars)),
    Activation('softmax'),
])

Instructions for updating:
Colocations handled automatically by placer.


In [13]:
# RMSprop with an initial learning rate of 0.01. Categorical cross-entropy
# matches the one-hot targets built above.
optimizer = RMSprop(lr=0.01)
model.compile(optimizer=optimizer, loss='categorical_crossentropy')

### Helper Functions

In [14]:
def sample(preds, temperature=1.0):
    """Sample an index from a probability array, reshaped by a temperature.

    The probabilities are raised to the power ``1 / temperature`` and
    renormalised before a single multinomial draw. Values below 1 sharpen the
    distribution, values above 1 flatten it.

    Args:
        preds: 1-D array-like of probabilities.
        temperature: scaling factor applied to the log-probabilities. A value
            of exactly 0 is treated as greedy decoding and returns the index
            of the largest probability.

    Returns:
        The sampled index, as returned by ``np.argmax``.
    """
    # float64 keeps the renormalised probabilities accurate enough for
    # np.random.multinomial, which rejects inputs whose sum drifts above 1.
    preds = np.asarray(preds).astype('float64')
    if temperature == 0:
        # Dividing by zero below would yield NaNs; the limit of the tempered
        # distribution as temperature -> 0 is the arg-max.
        return np.argmax(preds)
    # Zero probabilities are legitimate: log(0) = -inf and exp(-inf) = 0, so
    # they stay at zero. Only the divide-by-zero RuntimeWarning is suppressed.
    with np.errstate(divide='ignore'):
        scaled_logs = np.log(preds) / temperature
    exp_preds = np.exp(scaled_logs)
    tempered = exp_preds / np.sum(exp_preds)
    probas = np.random.multinomial(1, tempered, 1)
    return np.argmax(probas)

In [15]:
def on_epoch_end(epoch, logs):
    """Print text generated by the current model once an epoch has finished.

    One random window of the corpus is picked as the seed. For each diversity
    value, 400 characters are generated from that same seed and streamed to
    stdout as they are produced.

    Args:
        epoch: number of the epoch that just ended (printed in the header).
        logs: second argument passed to the callback; not used here.
    """
    print()
    print('----- Generating text after Epoch: %d' % epoch)

    # The seed is chosen once so every diversity setting starts from the same text.
    seed_start = random.randint(0, len(text) - maxlen - 1)
    seed = text[seed_start: seed_start + maxlen]
    for diversity in (0.2, 0.5, 1.0, 1.2):
        print('----- diversity:', diversity)
        print('----- Generating with seed: "' + seed + '"')
        sys.stdout.write(seed)

        # `window` always holds the most recent `maxlen` characters.
        window = seed
        for _ in range(400):
            # One-hot encode the current window as a batch of one.
            x_pred = np.zeros((1, maxlen, len(chars)))
            for pos, ch in enumerate(window):
                x_pred[0, pos, char_indices[ch]] = 1.

            preds = model.predict(x_pred, verbose=0)[0]
            next_char = indices_char[sample(preds, diversity)]

            # Drop the oldest character and append the new one.
            window = window[1:] + next_char

            sys.stdout.write(next_char)
            sys.stdout.flush()
        print()
print_callback = LambdaCallback(on_epoch_end=on_epoch_end)

### Defining callbacks

In [16]:
from keras.callbacks import ModelCheckpoint

# Checkpoint callback: monitors the training loss in 'min' mode and is
# configured to keep only the best model, written to `filepath`.
filepath = "wiki.hdf5"
checkpoint = ModelCheckpoint(
    filepath,
    monitor='loss',
    mode='min',
    save_best_only=True,
    verbose=1,
)

In [17]:
from keras.callbacks import ReduceLROnPlateau
# Learning-rate schedule driven by the training loss: patience of one epoch,
# reduction factor 0.2, and a floor of 0.001.
reduce_lr = ReduceLROnPlateau(
    monitor='loss',
    patience=1,
    factor=0.2,
    min_lr=0.001,
)

In [18]:
# Callbacks handed to model.fit: sample-text preview, checkpointing, and
# learning-rate reduction.
callbacks = [print_callback, checkpoint, reduce_lr]

### Training

In [19]:
# Train for 5 epochs in batches of 128 windows. The callbacks print sample
# text, checkpoint the model and adjust the learning rate.
model.fit(
    x,
    y,
    epochs=5,
    batch_size=128,
    callbacks=callbacks,
)

Instructions for updating:
Use tf.cast instead.
Epoch 1/5

----- Generating text after Epoch: 0
----- diversity: 0.2
----- Generating with seed: " Jaxon Benge and original bassist Mark Y"
 Jaxon Benge and original bassist Mark Yout and and an the contract of the former to the controlled the strate of the state the state in the sens of the Roman and the Nero , and the state an an an an an an the strate of the contil state the since and the contralt and a state to be the controlled the controlled the strate to the compare to the side to the state to the since the first an an an the contralt and the second of the contralt t
----- diversity: 0.5
----- Generating with seed: " Jaxon Benge and original bassist Mark Y"
 Jaxon Benge and original bassist Mark Yout and in Charaction the straige an an the Merican state to intersifiely , they the first their intersing the central . The destrate on the reast of the Sout Helow , and an the are were not into the First an an the sent . 
 Wan , leavy in

 a handful of US navy monitors laid down and a head of the world commemorate to restrict construction . The the Triple , and force and fiction was armor in the this the second several district as the the was the early the the Manila and aother commemeral armoured by the development of the the the to the an per of the film to the character were the the entire @-@ in the sent show , were and a cyclones of the and film a provided mines , t
----- diversity: 1.0
----- Generating with seed: " a handful of US navy monitors laid down"
 a handful of US navy monitors laid down his ctusk back and odiadanitoent and spackians distrieg docthed theorgus to be total . = Brapical smality or . The distinguinarge and 9iltho , Ws It and greatly on skucker , 200 , sage of the Empara Varushly ( Maimhal Distore Oix  @-@ pragress by road , although the filter of the Lessan . sheawas at scream arpack . In the compledinion they and tous to a veated at the area said to programed 15 . S
----- diversity: 1.2
-----

  after removing the cwd from sys.path.


ided an the Philippines to the match in the formall and the said the Ball War Americal Battle . He was also even and the land , and the result to the concer . It sainer the supering to the South Athori , School in the present the were addition as a way in the British 1991 , the roles were 
----- diversity: 1.0
----- Generating with seed: "s and more flexible design , for instanc"
s and more flexible design , for instanced on High Tendron Fork and in San a textial . 
 = = = , Idaga ( 1916 ) , Plausil as through Select Chail were nembeted its caused the receme mound of the court portlock the pinder of Lick , Duter Rome munit Taft . 
 
 = = = War and a faved exittests Kchitona @-@ trmen . also wained acception di in massed vottovation and asrophing the strikes much were special rebelled dridgentuation of 64 @,@ 000
----- diversity: 1.2
----- Generating with seed: "s and more flexible design , for instanc"
s and more flexible design , for instance owned Georle on the ho memrate . broadtonn

<keras.callbacks.History at 0x1932c0daf28>

# Testing the model

In [20]:
def generate_text(length, diversity):
    """Generate text from the model, seeded with a random window of the corpus.

    Args:
        length: number of characters to generate after the seed.
        diversity: temperature forwarded to ``sample`` for every character.

    Returns:
        The ``maxlen``-character seed followed by the ``length`` generated
        characters, as one string.
    """
    # Pick a random seed window from the corpus.
    seed_start = random.randint(0, len(text) - maxlen - 1)
    window = text[seed_start: seed_start + maxlen]
    pieces = [window]
    for _ in range(length):
        # One-hot encode the current window as a batch of one.
        x_pred = np.zeros((1, maxlen, len(chars)))
        for pos, ch in enumerate(window):
            x_pred[0, pos, char_indices[ch]] = 1.

        preds = model.predict(x_pred, verbose=0)[0]
        next_char = indices_char[sample(preds, diversity)]

        pieces.append(next_char)
        # Drop the oldest character and append the new one.
        window = window[1:] + next_char
    return ''.join(pieces)

In [25]:
# 500 generated characters at diversity 0.2.
print(generate_text(length=500, diversity=0.2))

these smaller individuals were originally the strengthen , the strengthen , the strengthen of the season . The strengthen , and the strengthen of the season , the season of the the result in the considered the strengthen and the film , and the state and the season of the resulter and the season , and the the first the and the the the first and patrol and the the position of the strengthen and the the state and the and the season and the formed the season of the strengthen was the season . 
 = = = = Austrian , the period the considered


In [26]:
# 500 generated characters at diversity 0.5.
print(generate_text(length=500, diversity=0.5))

a discipline and acquired status as a lead and the music don companine three super the region of the anding in the North Korean was moder and praised on the shopted the reselved the numbers . 
 = = = = Jack On several the those the war Seneca . The exposed the New York , Powner , and concluded to a minuted his since . 
 = = = Jack . The season and Line the ships , which and the transferred to the concussion also have an the most the Temple , Lesnar used a convised the film production and on the first transverted the head , which and h


In [27]:
# 500 generated characters at diversity 1.0.
print(generate_text(length=500, diversity=1.0))

 from his post as generalissimo and appoinct throughout an awarded by Sumut , on all thick at Meynow @-@ bidgen were notiluted a poecle hiers of probleyse later and the Chi Division ( CA Miniles . The two Dublle acruded ectoners reclioned polical chorch of Poon 20 , he counked acrising the larg was exithed in Failers Ahang . 
 Althoorchhove , daid centure in The Seneca ( IS in sever " America Althoke Plan depressions whoth and stordrors of effenting valians ( . The turns of torction . Troops ays walls for Tre singan frotous the indica


In [28]:
# 500 generated characters at diversity 1.2.
print(generate_text(length=500, diversity=1.2))

n , Queensland , on 7 May 1942 . They were re @-@ storrt are Gimbatures ) , peace gentre , Franc , Ark of Net km of Eyns war sudce , zon assiblal general themes bransporening strait of inumaride derible picotylong other tond on Dagago at 150hon stimating " Heon band hervisted " , 2013 to 6eOcham Bover ( half , 
 = = 
 Sirkel were valiss tiechyvension tym 114 UAG Nm Contempl Colminall Ronycraous Massin C tin winishion of the nivers . Thd by Chartoned on @-@ displasted teke afries in kilts units counatly , Soyrhiggestle Rdrk Elmydition 
