In [93]:
import time
from collections import namedtuple
import numpy as np
import tensorflow as tf
import nltk
from nltk import word_tokenize
import re

# Load the raw corpus. A forward-slash path works on Windows as well as on
# POSIX systems and matches how the checkpoint paths are written in this notebook.
with open('episodes/HP1.txt', 'r') as f:
    text = f.read()

# Word-level vocabulary: lower-cased tokens that start with a letter, with
# punctuation and digits stripped. The pattern is a raw string so that `\]`
# reaches the regex engine as written instead of being parsed as an (invalid)
# string escape sequence.
vocab_list = sorted({re.sub(r'[!"#$%&()*+,-./:;<=>?@[\]^_`{|}~0-9]', '', i)
                     for i in word_tokenize(text.lower()) if i[0].isalpha()})
# Drops the first entry in sort order.
# NOTE(review): the reason for discarding it is not evident from this cell -- confirm.
vocab_list.pop(0)

# Character-level vocabulary and the lookup tables in both directions.
vocab = sorted(set(text))
vocab_to_int = {c: i for i, c in enumerate(vocab)}
int_to_vocab = dict(enumerate(vocab))
# The whole text as an int32 array holding one character id per character.
encoded = np.array([vocab_to_int[c] for c in text], dtype=np.int32)

#print(vocab)
#print(vocab_to_int)
#print(int_to_vocab)

#encoded contains the entire text, encoded character by character. Example: "MONICA" -> 29 56 ... etc., where 29 stands for M and 56 for O (example values only)
#print(encoded)

def get_batches(arr, batch_size, n_steps):
    """Yield (inputs, targets) mini-batches cut from an encoded sequence.

    Arguments
    ---------
    arr: 1-D numpy array of encoded tokens
    batch_size: Number of rows (parallel sequences) in every batch
    n_steps: Number of columns (time steps) in every batch

    Trailing elements that do not fill a complete batch are discarded. Each
    yielded pair is (x, y), both of shape (batch_size, n_steps), where y is x
    shifted one position to the left.
    """
    block = batch_size * n_steps
    usable = (len(arr) // block) * block

    # One row per parallel sequence, holding only the part that fills whole batches.
    grid = arr[:usable].reshape((batch_size, -1))
    width = grid.shape[1]

    for start in range(0, width, n_steps):
        stop = start + n_steps
        x = grid[:, start:stop]
        shifted = grid[:, start + 1:stop + 1]

        # The final window has no "next" column, so `shifted` comes out one
        # column short. Targets are therefore written into a zero-filled array
        # of the input's shape, leaving a single padded 0 in the last batch.
        y = np.zeros(x.shape, dtype=x.dtype)
        y[:, :shifted.shape[1]] = shifted

        yield x, y


# Disabled smoke test: pull a single batch from the generator and inspect it.
#batches = get_batches(encoded, 10, 50)
#x,y = next(batches)

#print(x,y)
# Report how many entries the word-level vocabulary (vocab_list) holds.
print(len(vocab_list))

5855


In [94]:
def build_inputs(batch_size, num_steps):
    ''' Create the placeholders that are fed into the graph.

        Arguments
        ---------
        batch_size: Number of sequences per batch
        num_steps: Number of sequence steps per batch

        Returns the tuple (inputs, targets, keep_prob): two int32 placeholders
        of shape [batch_size, num_steps] and one float32 placeholder holding
        the keep probability used by the dropout layers.
    '''
    shape = [batch_size, num_steps]

    token_ph = tf.placeholder(tf.int32, shape, name='inputs')
    label_ph = tf.placeholder(tf.int32, shape, name='targets')

    # No shape is given for the dropout keep probability.
    keep_prob_ph = tf.placeholder(tf.float32, name='keep_prob')

    return token_ph, label_ph, keep_prob_ph

In [95]:
def build_lstm(lstm_size, num_layers, batch_size, keep_prob):
    ''' Build a stacked LSTM cell with dropout on every layer's output.

        Arguments
        ---------
        lstm_size: Size of the hidden layers in the LSTM cells
        num_layers: Number of LSTM layers
        batch_size: Batch size used to build the zero initial state
        keep_prob: Keep probability applied to each layer's outputs

        Returns the tuple (cell, initial_state).
    '''
    layers = []
    for _ in range(num_layers):
        # A basic LSTM cell wrapped so that dropout is applied to its outputs.
        base_cell = tf.contrib.rnn.BasicLSTMCell(lstm_size)
        layers.append(tf.contrib.rnn.DropoutWrapper(base_cell, output_keep_prob=keep_prob))

    # Stack the layers into a single multi-layer cell.
    cell = tf.contrib.rnn.MultiRNNCell(layers)
    initial_state = cell.zero_state(batch_size, tf.float32)

    return cell, initial_state

In [96]:
def build_output(lstm_output, in_size, out_size):
    ''' Build a softmax layer on top of the RNN outputs.

        Arguments
        ---------

        lstm_output: Output tensor of the RNN layers
        in_size: Size of the last dimension of lstm_output, for example,
                 size of the LSTM cells
        out_size: Size of this softmax layer

        Returns the tuple (out, logits): the softmax probabilities and the
        logits they were computed from.

    '''

    # Flatten to one row per step per sequence, with in_size columns.
    flat = tf.reshape(tf.concat(lstm_output, axis=1), [-1, in_size])

    # Weights are created before the bias, as in the original layout; both
    # variables are unnamed, so creation order is kept unchanged.
    with tf.variable_scope('softmax'):
        weights = tf.Variable(tf.truncated_normal((in_size, out_size), stddev=0.1))
        bias = tf.Variable(tf.zeros(out_size))

    # One row of logits for every row of RNN output.
    scores = tf.matmul(flat, weights) + bias

    # Probabilities over the out_size classes.
    probabilities = tf.nn.softmax(scores, name='predictions')

    return probabilities, scores

In [97]:
def build_loss(logits, targets, lstm_size, num_classes):
    ''' Compute the mean softmax cross-entropy between logits and targets.

        Arguments
        ---------
        logits: Logits from final fully connected layer
        targets: Targets for supervised learning
        lstm_size: Number of LSTM hidden units (not used by this function)
        num_classes: Number of classes in targets

        Returns a scalar loss tensor.

    '''

    # One-hot encode the targets and give them the same shape as the logits.
    labels = tf.reshape(tf.one_hot(targets, num_classes), logits.get_shape())

    # Cross-entropy per row, averaged over all rows.
    per_row = tf.nn.softmax_cross_entropy_with_logits(logits=logits, labels=labels)
    return tf.reduce_mean(per_row)

In [98]:
def build_optimizer(loss, learning_rate, grad_clip):
    ''' Build the training op: Adam with gradients clipped by global norm.

        Arguments:
        loss: Network loss
        learning_rate: Learning rate for optimizer
        grad_clip: Maximum global norm the gradients are clipped to

        Returns the op that applies the clipped gradients.

    '''

    params = tf.trainable_variables()

    # Clip the gradients by their global norm to control exploding gradients.
    raw_grads = tf.gradients(loss, params)
    clipped_grads, _ = tf.clip_by_global_norm(raw_grads, grad_clip)

    adam = tf.train.AdamOptimizer(learning_rate)
    return adam.apply_gradients(zip(clipped_grads, params))

In [99]:
class CharRNN:
    """Character-level LSTM network assembled from the notebook's build_* helpers.

    Constructing an instance resets the default TensorFlow graph and builds a
    new one. The tensors and ops needed for training and sampling are exposed
    as attributes:

    inputs, targets, keep_prob: placeholders returned by build_inputs
    initial_state, final_state: RNN state before and after a batch
    prediction, logits: softmax output and the logits it was computed from
    loss, optimizer: training loss and the op that applies one update
    """

    def __init__(self, num_classes, batch_size=64, num_steps=50,
                 lstm_size=128, num_layers=2, learning_rate=0.001,
                 grad_clip=5, sampling=False):
        """Build the graph.

        Arguments
        ---------
        num_classes: Number of distinct tokens (one-hot depth and output size)
        batch_size: Number of sequences per batch
        num_steps: Number of sequence steps per batch
        lstm_size: Size of the hidden layers in the LSTM cells
        num_layers: Number of LSTM layers
        learning_rate: Learning rate for the optimizer
        grad_clip: Global norm the gradients are clipped to
        sampling: When true, batch_size and num_steps are both forced to 1
        """
        # When sampling, characters are passed in one at a time, so the
        # requested batch dimensions are overridden.
        if sampling:
            batch_size, num_steps = 1, 1

        tf.reset_default_graph()

        # Build the input placeholder tensors
        self.inputs, self.targets, self.keep_prob = build_inputs(batch_size, num_steps)

        # Build the LSTM cell
        cell, self.initial_state = build_lstm(lstm_size, num_layers, batch_size, self.keep_prob)

        ### Run the data through the RNN layers
        # First, one-hot encode the input tokens
        x_one_hot = tf.one_hot(self.inputs, num_classes)

        # Run each sequence step through the RNN and collect the outputs
        outputs, state = tf.nn.dynamic_rnn(cell, x_one_hot, initial_state=self.initial_state)
        self.final_state = state

        # Get softmax predictions and logits
        self.prediction, self.logits = build_output(outputs, lstm_size, num_classes)

        # Loss and optimizer (with gradient clipping)
        self.loss = build_loss(self.logits, self.targets, lstm_size, num_classes)
        self.optimizer = build_optimizer(self.loss, learning_rate, grad_clip)

In [117]:
epochs = 200
# Print losses every N iterations
print_every_n = 50

# Save every N iterations
save_every_n = 200

batch_size = 128
num_steps = 50
lstm_size = 128
num_layers = 2
learning_rate = 0.001

model = CharRNN(len(vocab), batch_size=batch_size, num_steps=num_steps,
                lstm_size=lstm_size, num_layers=num_layers,
                learning_rate=learning_rate)

# Saving a checkpoint fails when the parent directory of the checkpoint path
# is missing, so make sure it exists before training starts. MakeDirs
# succeeds if the directory is already there.
tf.gfile.MakeDirs('checkpoints')

saver = tf.train.Saver(max_to_keep=100)
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())

    # Use the line below to load a checkpoint and resume training
    #saver.restore(sess, 'checkpoints/______.ckpt')
    counter = 0
    for e in range(epochs):
        # Train network; the recurrent state starts from zeros every epoch and
        # is then carried over from one batch to the next.
        new_state = sess.run(model.initial_state)
        for x, y in get_batches(encoded, batch_size, num_steps):
            counter += 1
            start = time.time()
            feed = {model.inputs: x,
                    model.targets: y,
                    model.keep_prob: 0.6,
                    model.initial_state: new_state}
            batch_loss, new_state, _ = sess.run([model.loss,
                                                 model.final_state,
                                                 model.optimizer],
                                                feed_dict=feed)
            if (counter % print_every_n == 0):
                # Time reported is for the batch that was just run.
                end = time.time()
                print('Epoch: {}/{}... '.format(e+1, epochs),
                      'Training Step: {}... '.format(counter),
                      'Training loss: {:.4f}... '.format(batch_loss),
                      '{:.4f} sec/batch'.format((end-start)))

            if (counter % save_every_n == 0):
                saver.save(sess, "checkpoints/i{}_l{}.ckpt".format(counter, lstm_size))

    # Always save the final state of the model once training has finished.
    saver.save(sess, "checkpoints/i{}_l{}.ckpt".format(counter, lstm_size))

Epoch: 1/200...  Training Step: 50...  Training loss: 3.1779...  0.1000 sec/batch
Epoch: 2/200...  Training Step: 100...  Training loss: 3.1416...  0.1000 sec/batch
Epoch: 3/200...  Training Step: 150...  Training loss: 3.1237...  0.1370 sec/batch
Epoch: 3/200...  Training Step: 200...  Training loss: 2.9573...  0.1000 sec/batch
Epoch: 4/200...  Training Step: 250...  Training loss: 2.7559...  0.1000 sec/batch
Epoch: 5/200...  Training Step: 300...  Training loss: 2.6160...  0.1000 sec/batch
Epoch: 6/200...  Training Step: 350...  Training loss: 2.6230...  0.1020 sec/batch
Epoch: 6/200...  Training Step: 400...  Training loss: 2.4779...  0.1020 sec/batch
Epoch: 7/200...  Training Step: 450...  Training loss: 2.4507...  0.1010 sec/batch
Epoch: 8/200...  Training Step: 500...  Training loss: 2.3909...  0.0990 sec/batch
Epoch: 9/200...  Training Step: 550...  Training loss: 2.3812...  0.0990 sec/batch
Epoch: 9/200...  Training Step: 600...  Training loss: 2.3517...  0.1000 sec/batch
Epoch

Epoch: 45/200...  Training Step: 3000...  Training loss: 1.7675...  0.0970 sec/batch
Epoch: 45/200...  Training Step: 3050...  Training loss: 1.8069...  0.1010 sec/batch
Epoch: 46/200...  Training Step: 3100...  Training loss: 1.7515...  0.1020 sec/batch
Epoch: 47/200...  Training Step: 3150...  Training loss: 1.7672...  0.0990 sec/batch
Epoch: 48/200...  Training Step: 3200...  Training loss: 1.7489...  0.0990 sec/batch
Epoch: 48/200...  Training Step: 3250...  Training loss: 1.7464...  0.1000 sec/batch
Epoch: 49/200...  Training Step: 3300...  Training loss: 1.7704...  0.1000 sec/batch
Epoch: 50/200...  Training Step: 3350...  Training loss: 1.7309...  0.1000 sec/batch
Epoch: 50/200...  Training Step: 3400...  Training loss: 1.8050...  0.1000 sec/batch
Epoch: 51/200...  Training Step: 3450...  Training loss: 1.7553...  0.1020 sec/batch
Epoch: 52/200...  Training Step: 3500...  Training loss: 1.7196...  0.1010 sec/batch
Epoch: 53/200...  Training Step: 3550...  Training loss: 1.7143..

Epoch: 116/200...  Training Step: 7850...  Training loss: 1.5540...  0.0989 sec/batch
Epoch: 117/200...  Training Step: 7900...  Training loss: 1.5043...  0.0990 sec/batch
Epoch: 117/200...  Training Step: 7950...  Training loss: 1.5246...  0.1000 sec/batch
Epoch: 118/200...  Training Step: 8000...  Training loss: 1.5410...  0.1000 sec/batch
Epoch: 119/200...  Training Step: 8050...  Training loss: 1.5035...  0.1000 sec/batch
Epoch: 120/200...  Training Step: 8100...  Training loss: 1.4969...  0.1000 sec/batch
Epoch: 120/200...  Training Step: 8150...  Training loss: 1.5309...  0.0620 sec/batch
Epoch: 121/200...  Training Step: 8200...  Training loss: 1.4935...  0.1010 sec/batch
Epoch: 122/200...  Training Step: 8250...  Training loss: 1.5184...  0.1000 sec/batch
Epoch: 123/200...  Training Step: 8300...  Training loss: 1.4855...  0.0990 sec/batch
Epoch: 123/200...  Training Step: 8350...  Training loss: 1.5054...  0.0990 sec/batch
Epoch: 124/200...  Training Step: 8400...  Training lo

Epoch: 186/200...  Training Step: 12600...  Training loss: 1.4023...  0.1010 sec/batch
Epoch: 187/200...  Training Step: 12650...  Training loss: 1.4200...  0.1010 sec/batch
Epoch: 187/200...  Training Step: 12700...  Training loss: 1.4602...  0.0990 sec/batch
Epoch: 188/200...  Training Step: 12750...  Training loss: 1.4252...  0.1000 sec/batch
Epoch: 189/200...  Training Step: 12800...  Training loss: 1.4101...  0.1000 sec/batch
Epoch: 189/200...  Training Step: 12850...  Training loss: 1.4347...  0.1010 sec/batch
Epoch: 190/200...  Training Step: 12900...  Training loss: 1.4522...  0.0980 sec/batch
Epoch: 191/200...  Training Step: 12950...  Training loss: 1.4448...  0.1000 sec/batch
Epoch: 192/200...  Training Step: 13000...  Training loss: 1.4209...  0.1010 sec/batch
Epoch: 192/200...  Training Step: 13050...  Training loss: 1.4245...  0.1000 sec/batch
Epoch: 193/200...  Training Step: 13100...  Training loss: 1.4212...  0.0990 sec/batch
Epoch: 194/200...  Training Step: 13150... 

In [118]:
# Show the checkpoint state recorded for the 'checkpoints' directory: the
# latest checkpoint path and the list of all checkpoint paths.
tf.train.get_checkpoint_state('checkpoints')

model_checkpoint_path: "checkpoints\\i13600_l128.ckpt"
all_model_checkpoint_paths: "checkpoints\\i200_l128.ckpt"
all_model_checkpoint_paths: "checkpoints\\i400_l128.ckpt"
all_model_checkpoint_paths: "checkpoints\\i600_l128.ckpt"
all_model_checkpoint_paths: "checkpoints\\i800_l128.ckpt"
all_model_checkpoint_paths: "checkpoints\\i1000_l128.ckpt"
all_model_checkpoint_paths: "checkpoints\\i1200_l128.ckpt"
all_model_checkpoint_paths: "checkpoints\\i1400_l128.ckpt"
all_model_checkpoint_paths: "checkpoints\\i1600_l128.ckpt"
all_model_checkpoint_paths: "checkpoints\\i1800_l128.ckpt"
all_model_checkpoint_paths: "checkpoints\\i2000_l128.ckpt"
all_model_checkpoint_paths: "checkpoints\\i2200_l128.ckpt"
all_model_checkpoint_paths: "checkpoints\\i2400_l128.ckpt"
all_model_checkpoint_paths: "checkpoints\\i2600_l128.ckpt"
all_model_checkpoint_paths: "checkpoints\\i2800_l128.ckpt"
all_model_checkpoint_paths: "checkpoints\\i3000_l128.ckpt"
all_model_checkpoint_paths: "checkpoints\\i3200_l128.ckpt"
all_m

In [119]:
def pick_top_n(preds, vocab_size, top_n=5):
    """Sample a class index from the `top_n` most probable predictions.

    Arguments
    ---------
    preds: Array of class probabilities; singleton dimensions are squeezed away
    vocab_size: Number of classes; np.random.choice draws from range(vocab_size),
        so it must equal the length of the squeezed probability vector
    top_n: How many of the highest-probability classes remain eligible

    Returns the sampled class index. `preds` itself is left unmodified.
    """
    # Work on a copy: np.squeeze returns a view of its input, so zeroing
    # entries in place would silently overwrite the caller's predictions.
    p = np.squeeze(preds).copy()
    # Zero everything except the top_n largest entries, then renormalise.
    p[np.argsort(p)[:-top_n]] = 0
    p = p / np.sum(p)
    return np.random.choice(vocab_size, 1, p=p)[0]

In [120]:
def sample(checkpoint, n_samples, lstm_size, vocab_size, prime="The "):
    """Generate text from a trained CharRNN checkpoint.

    The network is rebuilt in sampling mode (one character per step), restored
    from `checkpoint`, run over `prime` to set up its state, and then run one
    character at a time, feeding each sampled character back in as the next
    input.

    Arguments
    ---------
    checkpoint: Path of the checkpoint to restore
    n_samples: Number of sampling iterations run after the prime. One extra
        character is sampled from the prime's final prediction first, so the
        result holds len(prime) + n_samples + 1 characters.
    lstm_size: Size of the LSTM hidden layers (expected to match the checkpoint)
    vocab_size: Number of distinct characters; used as the network's number of
        classes and as the range characters are sampled from
    prime: Non-empty seed text; every character must be a key of vocab_to_int

    Returns the prime followed by the generated characters, as one string.

    Raises ValueError if `prime` is empty, because there would be no
    prediction to start sampling from.
    """
    if not prime:
        raise ValueError("prime must contain at least one character")

    samples = list(prime)
    model = CharRNN(vocab_size, lstm_size=lstm_size, sampling=True)
    saver = tf.train.Saver()
    with tf.Session() as sess:
        saver.restore(sess, checkpoint)
        new_state = sess.run(model.initial_state)

        # Single-character feed buffer, typed to match the int32 inputs placeholder.
        x = np.zeros((1, 1), dtype=np.int32)

        # Run the prime through the network, carrying the state forward.
        for c in prime:
            x[0, 0] = vocab_to_int[c]
            feed = {model.inputs: x,
                    model.keep_prob: 1.,
                    model.initial_state: new_state}
            preds, new_state = sess.run([model.prediction, model.final_state],
                                        feed_dict=feed)

        # First generated character comes from the prime's last prediction.
        c = pick_top_n(preds, vocab_size)
        samples.append(int_to_vocab[c])

        # Feed every sampled character back in to predict the next one.
        for _ in range(n_samples):
            x[0, 0] = c
            feed = {model.inputs: x,
                    model.keep_prob: 1.,
                    model.initial_state: new_state}
            preds, new_state = sess.run([model.prediction, model.final_state],
                                        feed_dict=feed)

            c = pick_top_n(preds, vocab_size)
            samples.append(int_to_vocab[c])
    return ''.join(samples)

In [121]:
# Show the path of the most recent checkpoint in the 'checkpoints' directory.
tf.train.latest_checkpoint('checkpoints')

'checkpoints\\i13600_l128.ckpt'

In [122]:
def editDistDP(str1, str2, m, n):
    """Return the edit distance between str1[:m] and str2[:n].

    Insertions, deletions and substitutions each cost 1. The distance is
    computed bottom-up in an (m+1) x (n+1) table whose cell [i][j] holds the
    distance between the first i characters of str1 and the first j
    characters of str2.
    """
    table = [[0] * (n + 1) for _ in range(m + 1)]

    # Borders: turning an empty prefix into a prefix of length k costs k edits.
    for col in range(n + 1):
        table[0][col] = col
    for row in range(m + 1):
        table[row][0] = row

    # Interior cells, filled row by row.
    for row in range(1, m + 1):
        above, current = table[row - 1], table[row]
        for col in range(1, n + 1):
            if str1[row - 1] == str2[col - 1]:
                # Matching characters add no cost.
                current[col] = above[col - 1]
            else:
                # Cheapest of insert, delete and substitute.
                current[col] = 1 + min(current[col - 1], above[col], above[col - 1])

    return table[m][n]

In [123]:
def jaccard_similarity(word1, word2):
    """Return the Jaccard similarity of the element sets of two words.

    Each argument is turned into a set of its elements (characters, for
    strings); the result is |intersection| / |union|, a float between 0 and 1.
    Two empty inputs are treated as identical and give 1.0 instead of raising
    ZeroDivisionError.
    """
    chars1, chars2 = set(word1), set(word2)
    union = chars1 | chars2
    if not union:
        # Both inputs are empty, so the ratio would be 0 / 0.
        return 1.0
    return len(chars1 & chars2) / len(union)

In [124]:
def strcmp(word1, word2):
    """Return the absolute difference between the lengths of the two words."""
    delta = len(word1) - len(word2)
    return -delta if delta < 0 else delta

In [125]:
def min_edit_wrapper(word):
    """Rank vocabulary words by how closely they match `word`.

    Punctuation and digits are stripped from `word`, which is then compared
    with every entry of `vocab_list`. Only entries whose length differs from
    the cleaned word by less than 3 are kept.

    Returns a list of tuples
    (length difference, edit distance, Jaccard similarity, vocabulary word, cleaned word)
    ordered best match first: smaller length difference, then smaller edit
    distance, then larger Jaccard similarity, then alphabetical order. The
    list is empty when no vocabulary word passes the length filter.
    """
    # Raw string so that `\]` reaches the regex engine as written instead of
    # being parsed as an (invalid) string escape sequence.
    word = re.sub(r'[!"#$%&()*+,-./:;<=>?@[\]^_`{|}~0-9]', '', word)

    candidates = []
    for candidate in vocab_list:
        length_gap = strcmp(candidate, word)
        # Apply the cheap length filter first so the edit distance is only
        # computed for words that can appear in the result.
        if length_gap >= 3:
            continue
        edit_dist = editDistDP(candidate, word, len(candidate), len(word))
        candidates.append((length_gap, edit_dist,
                           jaccard_similarity(candidate, word), candidate, word))

    # Similarity is negated in the key: a higher Jaccard score means a closer
    # match, so it has to sort before a lower one when the distances tie.
    return sorted(candidates,
                  key=lambda entry: (entry[0], entry[1], -entry[2], entry[3], entry[4]))



In [126]:
def untokenize(words):
    """
    Join tokens back into running text, undoing the tokenizing operation:
    quotes, brackets, punctuation and contractions are re-attached where
    people expect them to be. Ideally `untokenize(tokenize(text))` equals
    `text`, except for line breaks.
    """
    result = ' '.join(words)

    # Quote marks, ellipses and brackets.
    for old, new in (("`` ", '"'), (" ''", '"'), ('. . .', '...'),
                     (" ( ", " ("), (" ) ", ") ")):
        result = result.replace(old, new)

    # Pull punctuation back onto the preceding word, both inside the text and
    # at its very end.
    result = re.sub(r' ([.,:;?!%]+)([ \'"`])', r"\1\2", result)
    result = re.sub(r' ([.,:;?!%]+)$', r"\1", result)

    # Apostrophes, contractions and a lone backtick.
    for old, new in ((" '", "'"), (" n't", "n't"), ("can not", "cannot"),
                     (" ` ", " '")):
        result = result.replace(old, new)

    return result.strip()

In [128]:
# Restore the most recent checkpoint in 'checkpoints' and generate text from
# it, priming the network with "THE MIRROR OF ERISED" and passing 10000 as
# the n_samples argument.
checkpoint = tf.train.latest_checkpoint('checkpoints')
samp = sample(checkpoint, 10000, lstm_size, len(vocab), prime="THE MIRROR OF ERISED")

INFO:tensorflow:Restoring parameters from checkpoints\i13600_l128.ckpt


In [129]:
# Save the raw generated sample. The context manager closes the file even if
# the write raises.
with open("expected_output4","w") as fp:
    fp.write(samp)

In [130]:
import copy  # no longer needed by this cell; kept in case other cells rely on it

# Snap generated words that are not in the word-level vocabulary onto their
# closest vocabulary word.
known_words = set(vocab_list)  # set membership instead of a list scan per token

# Strings are immutable, so the sample can be tokenized directly.
backupsamp = word_tokenize(samp)
for i, token in enumerate(backupsamp):
    lowered = token.lower()
    # Only word-like tokens are candidates for correction. The vocabulary was
    # built from tokens that start with a letter, so quote marks, ellipses and
    # clitics such as "'s" can never be in it and must not be replaced.
    if len(lowered) <= 1 or not lowered[0].isalpha() or lowered in known_words:
        continue
    matches = min_edit_wrapper(lowered)
    # Keep the original token when no vocabulary word is close enough in length.
    if matches:
        # Second-to-last field of the best-ranked tuple is the vocabulary word.
        backupsamp[i] = matches[0][-2]


        

In [131]:

# Rebuild running text from the corrected tokens and save it. The context
# manager closes the file even if the write raises.
backupsamp2 = untokenize(backupsamp)
with open("output4","w") as fp:
    fp.write(backupsamp2)