# 🤴Anastasius

In [None]:
from keras.models import Sequential
from keras.models import load_model
from keras.layers import Dense, Activation
from keras.layers import LSTM
from keras.optimizers import RMSprop
from keras.utils.data_utils import get_file
from keras.callbacks import ModelCheckpoint
from keras.preprocessing import text as Text
from keras.layers import Dropout

import numpy as np
import random
import sys
import math
import os
import pandas as pd
import h5py

# Declare text source and constants
It is important to not read the entire file into memory.

The file is far too large to keep in memory.

Instead, we read one portion of the file into memory at a time, vectorize it,
pass that data into the LSTM, and repeat until we have processed the entire file.

In [None]:
# Input files, all relative to the working directory.
data_source = 'wikitext-103-raw/wiki.train.vector'        # ';'-separated training rows
data_len_path = 'wikitext-103-raw/wiki.train.vector.len'  # row count of data_source
vocab_path = 'wikitext-103-raw/wiki.train.vocab'          # characters the network can predict
decoder_path = 'wikitext-103-raw/wiki.train.decoder.npy'  # saved index -> character mapping

# Number of complete passes made over data_source while training
numRuns = 10

# Number of csv rows held in memory per chunk
chunk_size = 100000

# Load Processed data
We need to know all the possible characters in the file. Each character is a class that the network can predict. We find this alphabet, $\Sigma$, before doing any heavy operations.

In [None]:
# Load Vocab: every character of the vocab file, in sorted order.
with open(vocab_path, "r") as vocab_file:
    vocab = sorted(vocab_file.read())

# Load Decoder (index -> character).
# The loaded array is unwrapped with .item() and then used as a dict, which
# implies a pickled object array. NumPy >= 1.16.3 refuses to load those
# unless allow_pickle=True is passed explicitly.
# NOTE(security): unpickling can execute arbitrary code -- only load decoder
# files that were produced locally by a trusted preprocessing step.
decoder = np.load(decoder_path, allow_pickle=True).item()

# Build Encoder (character -> index) by inverting the decoder.
encoder = {char: index for index, char in decoder.items()}

# Load CSV Row count (the file holds a single integer).
with open(data_len_path, "r") as len_file:
    data_row_count = int(len_file.read())

print("VOCAB:")
print(vocab)

print("\nDECODER:")
print(decoder)

print("\nENCODER:")
print(encoder)

# One hot encode vocab
Passing raw integer codes into an LSTM is apparently a bad idea, because the numbers imply an 'order' or 'weight' among the letters that we don't want. Instead, we pass in one-hot encoded values, which treat every letter as an independent class.

In [None]:
# One-hot lookup table: row i is the encoding of class index i,
# for i = 0 .. len(vocab) - 1.
onehot = np.identity(len(vocab), dtype=bool)
print(onehot)

# Construct LSTM


In [None]:
# X Dimension = (Total sentences) x (Sentence Length) x (Character Vector dimension)
try:
    # Resume from the checkpoint file if one can be read.
    model = load_model("ckpt/model.h5py")
    print("Using saved model")
except OSError:
    # No readable checkpoint: build a fresh network.
    layerSize = 128                  # number of LSTM units
    input_shape = (50, len(vocab))   # (sentence length, one-hot width)

    model = Sequential()
    model.add(LSTM(layerSize, input_shape=input_shape))
    # One output per vocab entry, turned into a probability distribution.
    model.add(Dense(len(vocab)))
    model.add(Activation('softmax'))

    optimizer = RMSprop(lr=0.001)
    model.compile(loss='categorical_crossentropy', optimizer=optimizer)

    print("Using new model")
    # summary() prints the table itself and returns None, so wrapping it in
    # print() would emit a stray "None" line.
    model.summary()


# Train LSTM
We read slices from the file, vectorize each slice to form a training set, and pass that set into our LSTM.

In [None]:
total_chunks = math.ceil(data_row_count / chunk_size)
chunk_processed = 0

# Make sure the checkpoint directory exists before the first save is attempted.
os.makedirs("ckpt", exist_ok=True)

# Saves the entire model (not only the weights) after every epoch.
# save_best_only is False, so every save overwrites the same file.
checkpointer = ModelCheckpoint(
    filepath="ckpt/model.h5py",
    verbose=1,
    save_best_only=False,
    save_weights_only=False,
    monitor='val_loss',
    mode='min',
    period=1)

callbacks = [checkpointer]


# Each run streams the whole csv again, chunk_size rows at a time.
# NOTE(review): reading 'Y' as int8 caps class indices at 127 -- confirm
# that len(vocab) never exceeds 128.
for file_epocs in range(numRuns):
    for chunk in pd.read_csv(data_source, chunksize=chunk_size, sep=';', dtype={'Y':np.int8}):
        if chunk.empty:
            # Nothing to train on in this chunk.
            continue

        # Get input vectors: each 'X' cell is a space-separated list of
        # character indices; 'Y' is the index of the character to predict.
        print("Processing chunk", chunk_processed + 1," of ", total_chunks * numRuns)
        rawX = np.stack(
            chunk['X'].map(lambda x: np.fromstring(x, dtype=np.uint8, sep=" ")).values)
        rawY = chunk['Y'].values

        # One-hot encode by indexing the lookup table. The arrays are sized
        # by the rows actually read, so the (shorter) final chunk is not
        # padded with all-zero samples that would otherwise be trained on.
        X = onehot[rawX]   # (rows, sentence length, len(vocab))
        Y = onehot[rawY]   # (rows, len(vocab))

        # Train model on X and Y
        print("Training Model on chunk", chunk_processed + 1," of ", total_chunks * numRuns)
        model.fit(X, Y, batch_size=200, epochs=5, verbose=1, callbacks=callbacks)
        print("\n")
        chunk_processed += 1


print("Model Training Completed")



# Test Model

In [None]:
def sample(preds, temperature=1.0):
    """Randomly pick one index from a probability array.

    The probabilities are re-weighted as ``exp(log(p) / temperature)`` and
    renormalised before a single multinomial draw, so a temperature below 1
    sharpens the distribution and a temperature above 1 flattens it.

    Args:
        preds: array-like of class probabilities.
        temperature: divisor applied to the log-probabilities.

    Returns:
        The index selected by the draw.
    """
    probabilities = np.asarray(preds).astype('float64')
    scaled_log_probs = np.log(probabilities) / temperature
    weights = np.exp(scaled_log_probs)
    distribution = weights / np.sum(weights)
    # One trial, one experiment: the result is a 1 x len(preds) indicator row.
    draw = np.random.multinomial(1, distribution, 1)
    return np.argmax(draw)

In [None]:
sentence_raw = "Hello, this text needs to be at MIN 50 characters HELLO."

# Window length fed to the network; must equal the sentence length the LSTM
# was built with (50).
seq_len = 50

# Encode the seed text as a list of character indices.
sentence = [encoder[letter] for letter in sentence_raw]
if len(sentence) < seq_len:
    raise ValueError(
        "seed text must contain at least %d characters, got %d"
        % (seq_len, len(sentence)))

x = np.zeros((1, seq_len, len(vocab)))
print(sentence_raw, end='')
for _ in range(500):
    # Always condition on the most recent seq_len characters. A window that
    # starts at the beginning of the seed would lag behind the generated
    # text whenever the seed is longer than seq_len.
    x[0] = onehot[sentence[-seq_len:]]

    preds = model.predict(x, verbose=0)[0]
    predicted_index = sample(preds, 0.2)
    sentence.append(predicted_index)
    print(decoder[predicted_index], end="")
