# 🤴Anastasius

In [1]:
from keras.models import Sequential
from keras.layers import Dense, Activation
from keras.layers import LSTM
from keras.optimizers import RMSprop
from keras.utils.data_utils import get_file
from keras.callbacks import ModelCheckpoint
from keras.preprocessing import text as Text

import numpy as np
import random
import sys
import math
import os

Using TensorFlow backend.


#  Declare  text source and constants
It is important to not read the entire file into memory.

The file is far too large to keep in memory.

Instead we read portions into memory, vectorize it, pass
that data into the lstm, and repeat until we process the entire file.

In [2]:
# Path to the raw text corpus that is streamed through the notebook
data_source = 'wikitext-103-raw/wiki.test.raw'

# Determine file size in bytes
file_size = os.path.getsize(data_source)

# Fraction of the file to read into memory per slice (.1 == 10%)
data_slice_percentage = .1

# Total number of slices, rounded up to a whole number so progress
# messages read "1 of 10" instead of "1 of 10.0"
total_slices = math.ceil(1 / data_slice_percentage)

# Size of one slice, passed to file.read() by text_iter().
# NOTE: this is derived from the file size in BYTES, while read() on a
# text-mode file counts CHARACTERS. Multi-byte UTF-8 characters therefore
# make each slice cover more than data_slice_percentage of the file, so
# the file can be exhausted in fewer than total_slices reads.
data_slice_in_chars = math.ceil(file_size * data_slice_percentage)

# How much of a sentence chunk to put into the LSTM
chunk_size = 50

# How far the chunk window slides between consecutive chunks
skip = 2

# Create text generator
This function returns an ***iterator*** over the file. Each item it yields is a list of at most **data_slice_in_chars** characters (the final slice may be shorter). Every call to this function returns a fresh iterator starting from the beginning of the file.

The text returned is not processed, so we must vectorize the data, and run the data through the LSTM. 

**Note:** Mohammed said this chunk size should be just large enough to fit into the GPU. If the size is too large or too small, the application spends more time transferring the data from CPU -> GPU than actually processing it!

In [3]:
def text_iter(source=None, slice_chars=None):
    """Yield successive slices of a UTF-8 text file as lists of characters.

    The file is never read in one go: each step reads at most
    ``slice_chars`` characters, so only one slice is held in memory at a time.

    Args:
        source: Path of the file to read. Defaults to the notebook-level
            ``data_source``.
        slice_chars: Maximum number of characters per slice. Defaults to the
            notebook-level ``data_slice_in_chars``.

    Yields:
        A list of single-character strings. Every slice holds ``slice_chars``
        characters except possibly the last one, which holds the remainder.
        An empty file yields nothing.
    """
    # Fall back to the notebook configuration lazily, so callers that pass
    # explicit arguments do not depend on the globals being defined.
    path = data_source if source is None else source
    size = data_slice_in_chars if slice_chars is None else slice_chars

    with open(path, encoding='utf8') as file:
        # read() returns '' once the file is exhausted, which ends the loop.
        for text in iter(lambda: file.read(size), ''):
            yield list(text)

# Determine $\Sigma$
We need to know all the possible characters in the file (the alphabet $\Sigma$). Each of the characters is a class that the network can predict. We find $\Sigma$ before doing any heavy operations.

In [4]:
# Accumulates every distinct character seen anywhere in the file
vocab = set()

# 1-based counter used only for the progress message
slice_number = 1

for data_slice in text_iter():
    print("Finding sigma of slice ", slice_number, ' of', total_slices)
    # A set ignores duplicates, so the whole slice can be merged in one call
    vocab.update(data_slice)
    slice_number += 1

# Freeze the alphabet into a sorted list so every character has a stable position
vocab = sorted(vocab)
print(vocab)

Finding sigma of slice  1  of 10.0
Finding sigma of slice  2  of 10.0
Finding sigma of slice  3  of 10.0
Finding sigma of slice  4  of 10.0
Finding sigma of slice  5  of 10.0
Finding sigma of slice  6  of 10.0
Finding sigma of slice  7  of 10.0
Finding sigma of slice  8  of 10.0
Finding sigma of slice  9  of 10.0
Finding sigma of slice  10  of 10.0
['\n', ' ', '!', '"', '#', '$', '%', '&', "'", '(', ')', '*', '+', ',', '-', '.', '/', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ':', ';', '<', '=', '>', '?', '@', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', '[', ']', '^', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '£', '¥', '©', '°', '½', 'Á', 'Æ', 'É', '×', 'ß', 'à', 'á', 'ã', 'ä', 'å', 'æ', 'ç', 'è', 'é', 'ê', 'ë', 'í', 'î', 'ñ', 'ó', 'ô', 'ö', 'ú', 'ü', 'ć', 'č', 'ě', 'ī', 'ł', 'Ō', 'ō', 'Š', 'ū', 'ž', 'ǐ', 'ǔ

# Vectorize $\Sigma$
We cannot directly pass characters into the LSTM; we must pass in numerical data.

We map letter -> integer based on the letter's **position** in $\Sigma$. We pass this integer into the LSTM, which will output another integer in the range of $\Sigma$. We then map this integer -> letter to generate the next character in the sequence.

We also create a one-hot encoded version of each integer. That is, suppose $|\Sigma| = n$, $a \in \Sigma$ and encoder[a] = 1. Then the one-hot encoding of a is a vector of length $n$ with a 1 at index 1 and 0 everywhere else: [0, 1, 0, ..., 0].

Needs more explanation

In [5]:
# Encoder: letter -> integer (the letter's position within vocab)
encoder = {letter: position for position, letter in enumerate(vocab)}

# Decoder: the inverse lookup, integer -> letter
decoder = {position: letter for letter, position in encoder.items()}

# Identity matrix with one row per vocab entry; row k is used as the
# one-hot vector for integer k
onehot = np.eye(len(vocab))

# Train LSTM
We read in slices from the file, vectorize the slice, form our training set, and pass this set into our LSTM

In [9]:
# 1-based counter used only for the progress messages
slice_number = 1

# Reads in a single slice per loop iteration
for data_slice in text_iter():
    print("Processing Slice ", slice_number, " of ", total_slices,'...', end='')
    slice_number += 1

    print('V',end='')
    # One hot the data slice in place: each character is replaced by the
    # matching row of `onehot`. enumerate() is lazy, so no list of tuples
    # is ever materialised for the slice.
    for i, letter in enumerate(data_slice):
        data_slice[i] = onehot[encoder[letter]]

    print(u'\u2713','...CI',end='')
    # Get the starting location of all text chunks. Stopping at
    # len - chunk_size guarantees index i + chunk_size (the target) exists.
    chunk_indices = range(0, len(data_slice) - chunk_size, skip)

    print(u'\u2713','...X',end='')
    # Take chunk sized lengths from the current slice.
    # (Previously indexed the undefined name `data`, which raised NameError
    # on a fresh kernel.)
    X = [data_slice[i : i + chunk_size] for i in chunk_indices]

    print(u'\u2713','...Y',end='')
    # Store the next letter after the chunk sized length
    Y = [data_slice[i + chunk_size] for i in chunk_indices]

    print(u'\u2713')
    # Data slice is now vectorized
    # Use it or lose it! X and Y are overwritten on the next iteration.

Vectorizing Slice  1  of  10.0 ...    ✓
Vectorizing Slice  2  of  10.0 ...    ✓
Vectorizing Slice  3  of  10.0 ...    ✓
Vectorizing Slice  4  of  10.0 ...    ✓
Vectorizing Slice  5  of  10.0 ...    ✓
Vectorizing Slice  6  of  10.0 ...    ✓
Vectorizing Slice  7  of  10.0 ...    ✓
Vectorizing Slice  8  of  10.0 ...    ✓
Vectorizing Slice  9  of  10.0 ...    ✓
Vectorizing Slice  10  of  10.0 ...    ✓


## Meta-Data

In [None]:
# NOTE(review): `data` is not assigned by any cell visible above this one;
# it must be defined before this cell can run — confirm where it comes from.

# The distinct characters within the data set, in sorted order
uniqueLetters = sorted(set(data))
print(uniqueLetters)

# Encoder: letter -> integer (position within uniqueLetters).
# This rebinds the `encoder` created earlier in the notebook.
encoder = {letter: letterPosition for letterPosition, letter in enumerate(uniqueLetters)}

# Decoder: the inverse lookup, integer -> letter (rebinds `decoder` as well)
decoder = {letterPosition: letter for letter, letterPosition in encoder.items()}

# Amount of text, in characters, fed into the network per sample
text_chunk_len = 50

# How far we 'slide' down the text per chunk.
# This overrides the `skip` value set in the configuration cell.
skip = 20

## Vectorize data

In [None]:
# One hot encode the unique letters: row k of this boolean identity matrix
# is the one-hot vector for integer k
onehot = np.eye(len(uniqueLetters), dtype=bool)
print(onehot)

# Transform char -> int -> onehot, replacing each element of `data` in place.
# This makes the cell non-idempotent: after one run `data` holds vectors,
# so running the cell a second time fails on the encoder lookup.
for i, letter in enumerate(data):
    data[i] = onehot[encoder[letter]]

# Get the starting location of all text chunks.
# (Previously measured the undefined name `data_vector`; the vectorized
# sequence lives in `data`.)
chunk_indices = range(0, len(data) - text_chunk_len, skip)

# Take chunk sized lengths from the data vector
X = [data[i : i + text_chunk_len] for i in chunk_indices]

# Store the next letter after the chunk sized length
Y = [data[i + text_chunk_len] for i in chunk_indices]

# NOTE(review): X and Y are plain Python lists here; model.fit presumably
# needs them as numpy arrays — confirm before training.
print("Finished vectorizing.")

## Build Model

In [None]:
# Build the model using Keras.
# Layer stack: a 128-unit LSTM over (text_chunk_len, alphabet size) inputs,
# a Dense layer with one output per letter, then a softmax activation.
model = Sequential([
    LSTM(128, input_shape=(text_chunk_len, len(uniqueLetters))),
    Dense(len(uniqueLetters)),
    Activation('softmax'),
])

# Compile with RMSprop and categorical cross-entropy; `optimizer` stays at
# notebook level because a later cell reuses it.
optimizer = RMSprop(lr=0.01)
model.compile(optimizer=optimizer, loss='categorical_crossentropy')

print("Finished building model.")

##  Train model

In [None]:
# Train the model

# Save the model to the checkpoint file during training.
# The notebook imports ModelCheckpoint directly and never binds the bare
# name `keras`, so the class is referenced without a module prefix.
chkpt_filepath = "/checkpoint/weights.best.hdf5"
checkpointer = ModelCheckpoint(chkpt_filepath,
                               monitor='val_loss',
                               verbose=0,
                               save_best_only=False,
                               save_weights_only=False,
                               mode='auto',
                               period=1)

# Each iteration runs a single epoch over X, Y; the checkpoint callback
# is attached to every fit call.
for iteration in range(1, 60):
    print()
    print('-' * 50)
    print('Iteration', iteration)
    model.fit(X, Y,
              batch_size=128,
              epochs=1,
              callbacks=[checkpointer])  # save model


    

## Test Model

In [None]:
def sample(preds, temperature=1.0):
    """Draw one class index from a probability vector after temperature scaling.

    Args:
        preds: 1-D sequence of class probabilities (the softmax output).
        temperature: Values below 1 sharpen the distribution (safer picks),
            values above 1 flatten it (more surprising picks).

    Returns:
        The sampled class index as a plain int.
    """
    preds = np.asarray(preds).astype('float64')
    preds = np.log(preds) / temperature
    exp_preds = np.exp(preds)
    preds = exp_preds / np.sum(exp_preds)
    probas = np.random.multinomial(1, preds, 1)
    return int(np.argmax(probas))


# Sampling temperature used for generation (tune to taste)
diversity = 0.5

# Recreate the trained architecture, then load the saved weights into it.
# Weights cannot be loaded into an empty Sequential(): the layers must
# exist first and match the ones used in the build cell.
model = Sequential()
model.add(LSTM(128, input_shape=(text_chunk_len, len(uniqueLetters))))
model.add(Dense(len(uniqueLetters)))
model.add(Activation('softmax'))
model.load_weights("/checkpoint/weights.best.hdf5")
model.compile(loss='categorical_crossentropy', optimizer=optimizer)

# Read only the first slice of raw text to pick a seed from, instead of
# loading the whole file into memory.
with open(data_source, encoding='utf8') as file:
    raw_data = file.read(data_slice_in_chars)

# get random seed sentence from raw_data
start_index = random.randint(0, len(raw_data) - text_chunk_len - 1)

print('\n----- diversity:', diversity)

generated = ''
sentence = raw_data[start_index: start_index + text_chunk_len]
generated += sentence
print('----- Generating with seed: "' + sentence + '"')
sys.stdout.write(generated)

for i in range(400):
    # One-hot encode the current window of text_chunk_len characters
    x = np.zeros((1, text_chunk_len, len(uniqueLetters)))
    for t, char in enumerate(sentence):
        x[0, t, encoder[char]] = 1.

    # Predict once per generated character, after the whole window is
    # encoded (this step used to sit inside the per-character loop above).
    preds = model.predict(x, verbose=0)[0]
    next_index = sample(preds, diversity)
    next_char = decoder[next_index]

    generated += next_char
    # Slide the window: drop the oldest character, append the new one
    sentence = sentence[1:] + next_char

    sys.stdout.write(next_char)
    sys.stdout.flush()
print()