In [40]:
import numpy as np
from sklearn.neighbors import NearestNeighbors
import collections
import re
import tensorflow as tf
from datetime import datetime
from time import time

In [41]:
vec_file = "C:/Users/tug64107/Desktop/AI/_data/glove.6B/glove.6B.50d.txt"
fp = open(vec_file,encoding="utf-8")
glove_dict = {}
glove_words = []
glove_vecs = []
counter = 0
for row in fp.readlines():
    if counter%40000==0: print(counter)
    counter+=1
    row = row.split(" ")
    word = row[0]
    vec = np.asarray([float(i) for i in row[1:]])
    glove_words.append(word)
    glove_vecs.append(vec)
    glove_dict[word] = vec
glove_vecs = np.asarray(glove_vecs)
fp.close()
print("loaded word vecs")

0
40000
80000
120000
160000
200000
240000
280000
320000
360000
loaded word vecs


In [42]:
# tree to find nearest word
tree = NearestNeighbors(2,algorithm="kd_tree")
tree.fit(glove_vecs)
tree.kneighbors(glove_dict["cat"].reshape([1,-1]),2,False)

array([[5450, 2926]], dtype=int64)

In [43]:
print(glove_words[2926])

dog


KeyError: 'twere'

In [None]:
# load data
def load_data(file_name):
    fp = open(file_name)
    content = []
    for row in fp.readlines():
        content+=re.findall("\w+|[.,!?;&':]",row.lower())
    content = np.asarray(content)
    return content

def mark_unknown(word_list, dictionary):
    count = 0
    for i in range(len(word_list)):
        if word_list[i] not in dictionary:
            count+=1
            word_list[i] = "unk"
            print(i)
    print("number of unknown words:",count)

text_file = "C:/Users/tug64107/Desktop/AI/_data/Shakespeare/Shakespeare.txt"
text = load_data(text_file)
mark_unknown(text,glove_dict)
print(text[:10])
vocab = set(text)
print(text.shape)
print("Number of different words: {}".format(len(vocab)))

In [None]:
print(text[260913])

In [None]:
# train test split
train_size = 200000
text_train, text_test = text[:train_size], text[train_size:]
test_size = text_test.shape[0]

In [None]:
def fetch_batch(seq_length, batch_size, text, dictionary):
    X = []
    y = []
    for i in range(batch_size):
        index = np.random.randint(text.shape[0]-seq_length-1)
        X_words = text[index:index+seq_length]
        y_words = text[index+1:index+1+seq_length]
        X_vecs = [dictionary[w] for w in X_words]
        y_vecs = [dictionary[w] for w in y_words]
        X.append(X_vecs)
        y.append(y_vecs)
    return np.asarray(X), np.asarray(y)

In [None]:
def vec_to_word(vec,tree,glove_words):
    indice = tree.kneighbors(vec.reshape([1,-1]),1,False)
    return glove_words[indice[0,0]]


In [48]:
# get log file path
now = datetime.utcnow().strftime("%y%m%d-%H%M")
run_name = "run"
root_logdir = "logs"
logdir_test = "{}/{}_{}_test".format(root_logdir,run_name,now)
logdir_train = "{}/{}_{}_train".format(root_logdir,run_name,now)
root_savedir = "checkpoints"
savedir = "{}/{}_{}".format(root_savedir,run_name,now)

# hyper parameters
vec_size = 50
seq_length = 32
learning_rate = 1
batch_size = 20
n_epochs = 30
n_batches = int(train_size/batch_size/seq_length)
print("seq_length:",seq_length)
print("learning_rate:",learning_rate)
print("batch_size:",batch_size)
print("n_epochs:",n_epochs)
print("n_batches:",n_batches)

# construction phase
tf.reset_default_graph()
X = tf.placeholder(tf.float32,[None, seq_length,vec_size])
y = tf.placeholder(tf.float32, [None, seq_length, vec_size]) #None*seq_length*vec_size

with tf.variable_scope("forward"):
    cell_1 = tf.contrib.rnn.LSTMCell(128, use_peepholes=True)
    cell_2 = tf.contrib.rnn.LSTMCell(64, use_peepholes=True)
    cell_3 = tf.contrib.rnn.LSTMCell(64, use_peepholes=True)

    multi_layer_cell = tf.nn.rnn_cell.MultiRNNCell([cell_1,cell_2,cell_3])
    h_states, fin_state = tf.nn.dynamic_rnn(multi_layer_cell,X,dtype=tf.float32) #h_states: None*seq_length*32, fin_state: tupple of last c and h states[None*32, None*32]
    outputs = tf.layers.dense(h_states,vec_size,name="dense") #None*seq_length*vec_size

with tf.variable_scope("training"):
    loss = tf.losses.mean_squared_error(y,outputs)
    optimizer = tf.train.AdamOptimizer(learning_rate)
    training_op = optimizer.minimize(loss)
    
with tf.variable_scope("logging"):
    writer_test = tf.summary.FileWriter(logdir_test,tf.get_default_graph())
    writer_train = tf.summary.FileWriter(logdir_train,tf.get_default_graph())
    loss_summary = tf.summary.scalar("loss_summary",loss)
    summary = tf.summary.merge_all()
    saver = tf.train.Saver()

# execution phase
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    for epoch in range(n_epochs):
        if(epoch%5==0):
            start = time()
            X_test, y_test = fetch_batch(seq_length, int(test_size/seq_length), text_test, glove_dict)
            loss_test, summary_test =  sess.run([loss,summary],feed_dict={X:X_test,y:y_test})
            writer_test.add_summary(summary_test,epoch)
            
            X_train, y_train = fetch_batch(seq_length, int(train_size/seq_length), text_train, glove_dict)
            loss_train, summary_train = sess.run([loss, summary],feed_dict={X:X_train, y:y_train})
            writer_train.add_summary(summary_train,epoch)
            
            saver.save(sess,savedir+"/model.ckpt")
            
            print("Epoch: %d  loss test: %.4f  loss train: %.4f"%(epoch,loss_test,loss_train))
            
            # generate sequence
            seq = np.zeros([seq_length, vec_size])
            story_length = 200
            for i_char in range(story_length):
                X_batch = np.reshape(seq[-seq_length:], [1, seq_length, vec_size])
                gen = sess.run(outputs, feed_dict={X: X_batch})
                seq = np.append(seq,gen[0, -1, :].reshape([1, -1]), axis=0)

            # print result:
            story = ""
            for i in range(seq.shape[0]):
                story += " " + vec_to_word(seq[i],tree,glove_words)
            print(story)
            print("time for logging and generating: %d"%(time()-start))
        for batch in range(n_batches):
            X_train, y_train = fetch_batch(seq_length, batch_size, text_train, glove_dict)
            sess.run(training_op, feed_dict={X:X_train,y:y_train})

seq_length: 32
learning_rate: 1
batch_size: 20
n_epochs: 30
n_batches: 312
Epoch: 0  loss test: 0.5015  loss train: 0.5010
 k978-1 k978-1 k978-1 k978-1 k978-1 k978-1 k978-1 k978-1 k978-1 k978-1 k978-1 k978-1 k978-1 k978-1 k978-1 k978-1 k978-1 k978-1 k978-1 k978-1 k978-1 k978-1 k978-1 k978-1 k978-1 k978-1 k978-1 k978-1 k978-1 k978-1 k978-1 k978-1 k978-1 k978-1 k978-1 k978-1 k978-1 k978-1 k978-1 k978-1 k978-1 k978-1 k978-1 k978-1 k978-1 k978-1 k978-1 k978-1 k978-1 k978-1 k978-1 k978-1 k978-1 k978-1 k978-1 k978-1 k978-1 k978-1 k978-1 k978-1 k978-1 k978-1 k978-1 k978-1 k978-1 k978-1 k978-1 k978-1 k978-1 k978-1 k978-1 k978-1 k978-1 k978-1 k978-1 k978-1 k978-1 k978-1 k978-1 k978-1 k978-1 k978-1 k978-1 k978-1 k978-1 k978-1 k978-1 k978-1 k978-1 k978-1 k978-1 k978-1 k978-1 k978-1 k978-1 k978-1 k978-1 k978-1 k978-1 k978-1 k978-1 k978-1 k978-1 k978-1 k978-1 k978-1 k978-1 k978-1 k978-1 k978-1 k978-1 k978-1 k978-1 k978-1 k978-1 k978-1 k978-1 k978-1 k978-1 k978-1 k978-1 k978-1 k978-1 k978-1 k978-1 k