In [39]:
import numpy as np
from sklearn.neighbors import NearestNeighbors
import collections
import re
import tensorflow as tf
from datetime import datetime

In [2]:
# Load the pre-trained GloVe embeddings into three parallel structures:
#   glove_dict  : word -> vector lookup
#   glove_words : words in file order
#   glove_vecs  : 2-D array whose row i is the vector of glove_words[i]
vec_file = "C:/Users/tug64107/Desktop/AI/_data/glove.6B/glove.6B.50d.txt"
glove_dict = {}
glove_words = []
glove_vecs = []
# The context manager closes the file even if a malformed row raises.
with open(vec_file, encoding="utf-8") as fp:
    # Iterate the handle directly so the file is streamed line by line
    # instead of being materialised in memory by readlines().
    for counter, row in enumerate(fp):
        if counter % 40000 == 0:
            print(counter)  # coarse progress indicator
        # Split on a single ASCII space only: field 0 is the token, the rest
        # are the vector components.
        fields = row.rstrip("\n").split(" ")
        word = fields[0]
        vec = np.asarray([float(value) for value in fields[1:]])
        glove_words.append(word)
        glove_vecs.append(vec)
        glove_dict[word] = vec
glove_vecs = np.asarray(glove_vecs)
print("loaded word vecs")

0
40000
80000
120000
160000
200000
240000
280000
320000
360000
loaded word vecs


In [22]:
# Presumably a sanity check: load_data's regex emits "'" as a standalone token,
# so the apostrophe needs its own entry in the embedding table.
print(glove_dict["'"])

[-0.039369  1.2036    0.35401  -0.55999  -0.52078  -0.66988  -0.75417
 -0.6534   -0.23246   0.58686  -0.40797   1.2057   -1.11      0.51235
  0.1246    0.05306   0.61041  -1.1295   -0.11834   0.26311  -0.72112
 -0.079739  0.75497  -0.023356 -0.56079  -2.1037   -1.8793   -0.179
 -0.14498  -0.63742   3.181     0.93412  -0.6183    0.58116   0.58956
 -0.19806   0.42181  -0.85674   0.33207   0.020538 -0.60141   0.50403
 -0.083316  0.20239   0.443    -0.060769 -0.42807  -0.084135  0.49164
  0.085654]


In [35]:
# load data
def load_data(file_name, encoding=None):
    """Read a text file and split it into lower-cased tokens.

    Every line is lower-cased and scanned with a regex that keeps runs of
    word characters and the single punctuation marks ``. , ! ? ; & ' :`` as
    tokens. Anything else (whitespace, other symbols) is discarded.

    Args:
        file_name: Path of the text file to read.
        encoding: Text encoding handed to ``open``. The default ``None``
            keeps the platform default, which is what the function used
            before this parameter existed.

    Returns:
        1-D numpy array of token strings in file order. An empty file yields
        an empty *string* array (not a float array).
    """
    tokens = []
    # The context manager closes the handle even if reading raises.
    with open(file_name, encoding=encoding) as fp:
        for row in fp:
            # Raw string: "\w" in a plain literal is an invalid escape sequence.
            tokens.extend(re.findall(r"\w+|[.,!?;&':]", row.lower()))
    # dtype=str keeps the result a string array when there are no tokens.
    return np.asarray(tokens, dtype=str)


# Tokenize the Shakespeare corpus, then report a preview, the token count
# and the vocabulary size.
text_file = "C:/Users/tug64107/Desktop/AI/_data/Shakespeare/Shakespeare.txt"
text = load_data(text_file)
vocab = set(text)  # distinct tokens occurring in the corpus

print(text[:10])
print(text.shape)
print("Number of different words: {}".format(len(vocab)))

['first' 'citizen' ':' 'before' 'we' 'proceed' 'any' 'further' ',' 'hear']
(261029,)
Number of different words: 11464


In [40]:
# Sequential hold-out split: the first `train_size` tokens are used for
# training, everything after them for testing (no shuffling).
train_size = 200000
text_train = text[:train_size]
text_test = text[train_size:]
test_size = len(text_test)

In [42]:
def fetch_batch(seq_length, batch_size, text, dictionary):
    """Sample random token windows and their one-step-shifted targets.

    For each of the ``batch_size`` samples a start position is drawn
    uniformly at random. The input is the ``seq_length`` tokens starting
    there, and the target is the same window shifted right by one token.
    Tokens are replaced by their vectors via ``dictionary[token]``.

    Args:
        seq_length: Number of tokens per window.
        batch_size: Number of windows to sample.
        text: 1-D array of tokens (must expose ``shape[0]`` and slicing).
        dictionary: Mapping from token to its vector.

    Returns:
        Tuple ``(X, y)`` of arrays built with ``np.asarray`` from
        ``batch_size`` lists of ``seq_length`` vectors each.

    Raises:
        ValueError: If ``text`` has fewer than ``seq_length + 1`` tokens.
        KeyError: If a sampled token is missing from ``dictionary`` (when it
            is a plain dict).
    """
    # A sample needs seq_length inputs plus one extra token for the final
    # target, so valid starts are 0 .. len(text) - seq_length - 1 inclusive.
    # randint's upper bound is exclusive, hence no additional "- 1".
    n_starts = text.shape[0] - seq_length
    if n_starts < 1:
        raise ValueError(
            "text has {} tokens but seq_length={} needs at least {}".format(
                text.shape[0], seq_length, seq_length + 1))
    X = []
    y = []
    for _ in range(batch_size):
        start = np.random.randint(n_starts)
        # Look up the seq_length + 1 tokens once; input and target are the
        # two overlapping views of that window.
        window = [dictionary[w] for w in text[start:start + seq_length + 1]]
        X.append(window[:-1])
        y.append(window[1:])
    return np.asarray(X), np.asarray(y)

In [None]:
# get log file path
# UTC timestamp (yymmdd-HHMM) so each run gets its own directories.
# utcnow must be *called*; the bare attribute is a method with no strftime.
now = datetime.utcnow().strftime("%y%m%d-%H%M")
run_name = "run"
root_logdir = "/logs"
# Separate summary directories for the test and train curves.
logdir_test = "{}/{}_{}_test".format(root_logdir,run_name,now)
logdir_train = "{}/{}_{}_train".format(root_logdir,run_name,now)
root_savedir = "/checkpoints"
savedir = "{}/{}_{}".format(root_savedir,run_name,now)

# hyper parameters
vec_size = 50  # last dimension of the X / y placeholders; the embeddings come from glove.6B.50d
seq_length = 64  # tokens per sampled window
learning_rate = 0.1  # NOTE(review): handed to AdamOptimizer below; much larger than Adam's usual 1e-3 — confirm intended
batch_size = 20  # windows drawn per fetch_batch call in the training loop
n_epochs = 30
# Batches per epoch, sized so one epoch draws roughly train_size tokens.
n_batches = int(train_size/batch_size/seq_length)
print("seq_length:",seq_length)
print("learning_rate:",learning_rate)
print("batch_size:",batch_size)
print("n_epochs:",n_epochs)
print("n_batches:",n_batches)

# Start from an empty default graph so re-running the cell does not stack
# duplicate nodes on top of the previous build.
tf.reset_default_graph()
X = tf.placeholder(tf.float32,[None, seq_length,vec_size]) #None*seq_length*vec_size
# Targets are real-valued word vectors, so the placeholder must be float32;
# an int32 placeholder would not match the float vectors fed into it.
y = tf.placeholder(tf.float32, [None, seq_length, vec_size]) #None*seq_length*vec_size

with tf.variable_scope("forward"):
    cell_1 = tf.nn.rnn_cell.LSTMCell(64,use_peepholes=True,name="cell_1")
    cell_2 = tf.nn.rnn_cell.LSTMCell(32,use_peepholes=True,name="cell_2")

    multi_layer_cell = tf.nn.rnn_cell.MultiRNNCell([cell_1,cell_2])
    # h_states: top-layer outputs, None*seq_length*32.
    # fin_state: final state of the stacked cell (one entry per layer).
    h_states, fin_state = tf.nn.dynamic_rnn(multi_layer_cell,X,dtype=tf.float32)
    # Project every time step back into word-vector space.
    outputs = tf.layers.dense(h_states,vec_size,name="dense") #None*seq_length*vec_size

with tf.variable_scope("training"):
    # Regression onto the next word's embedding (placeholder is `y`, lower case).
    loss = tf.losses.mean_squared_error(y,outputs)
    optimizer = tf.train.AdamOptimizer(learning_rate)
    training_op = optimizer.minimize(loss)

with tf.variable_scope("logging"):
    # One writer per split, each in its own directory, so the two loss
    # curves are stored as separate runs.
    writer_test = tf.summary.FileWriter(logdir_test,tf.get_default_graph())
    writer_train = tf.summary.FileWriter(logdir_train,tf.get_default_graph())
    loss_summary = tf.summary.scalar("loss_summary",loss)
    summary = tf.summary.merge_all()
    saver = tf.train.Saver()

# Nearest-neighbour index over the GloVe matrix, used to map predicted
# vectors back to words when sampling text. Built once, outside the loop.
word_index = NearestNeighbors(n_neighbors=1).fit(glove_vecs)
story_length = 500  # number of words to generate per sample

#train
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    for epoch in range(n_epochs):
        for batch in range(n_batches):
            X_train, y_train = fetch_batch(seq_length, batch_size, text_train, glove_dict)
            # Run the optimizer step; evaluating `loss` alone never updates
            # the weights.
            sess.run(training_op, feed_dict={X:X_train,y:y_train})
        if(epoch%5==0):
            # Log the loss on a large random sample of each split.
            X_test, y_test = fetch_batch(seq_length, int(test_size/seq_length), text_test, glove_dict)
            loss_test, summary_test =  sess.run([loss,summary],feed_dict={X:X_test,y:y_test})
            writer_test.add_summary(summary_test,epoch)

            X_train, y_train = fetch_batch(seq_length, int(train_size/seq_length), text_train, glove_dict)
            loss_train, summary_train = sess.run([loss, summary],feed_dict={X:X_train, y:y_train})
            writer_train.add_summary(summary_train,epoch)
            print("epoch {}: train loss {:.4f}, test loss {:.4f}".format(epoch, loss_train, loss_test))

        # Also checkpoint on the last epoch: with n_epochs < 100 the modulo
        # test alone only fires at epoch 0.
        if(epoch%100==0 or epoch==n_epochs-1):
            # save
            # NOTE(review): `savedir` is computed earlier in this cell but the
            # checkpoint still goes to this fixed path — confirm which is wanted.
            save_path = saver.save(sess, "./checkpoints_LSTM/model.ckpt")
            print("Model saved in path: %s" % save_path)
            # generate sequence
            # Start from an all-zero context and repeatedly feed the model its
            # own last prediction, sliding the window one step each time.
            context = np.zeros([seq_length, vec_size])
            generated = []
            for i_word in range(story_length):
                X_batch = context.reshape([1, seq_length, vec_size])
                predicted = sess.run(outputs, feed_dict={X: X_batch})
                next_vec = predicted[0, -1, :]
                generated.append(next_vec)
                context = np.vstack([context[1:], next_vec])

            # print result:
            # Decode only the generated vectors (the zero seed has no meaning
            # as words), all in one batched nearest-neighbour query.
            _, nearest = word_index.kneighbors(np.asarray(generated))
            story = " ".join(glove_words[i] for i in nearest[:, 0])
            print(story)
    # Make sure the logged summaries reach disk before the session ends.
    writer_test.flush()
    writer_train.flush()




