In [1]:
import config
import tensorflow as tf
tf.app.flags.DEFINE_string('f', '', 'kernel')
from collections import deque
import model
from dataUtils import *
from logger import MyLogger
import sys
import PTB_data_reader
import time
import numpy as np
import lstm_char_cnn
import pickle
import dataloader
tf.logging.set_verbosity(tf.logging.ERROR)


logger = MyLogger("RDMTrain")

# load twitter data
# load_data(FLAGS.data_file_path)
load_data_fast()

#load PTB data
# word_vocab, char_vocab, word_tensors, char_tensors, max_word_length = \
#     PTB_data_reader.load_data(FLAGS.data_dir, FLAGS.max_word_length, char_vocab, eos=FLAGS.EOS)
word_vocab, char_vocab, word_tensors, char_tensors = \
    PTB_data_reader.load_data_fast()
max_word_length = FLAGS.max_word_length
train_reader = PTB_data_reader.DataReader(word_tensors['train'], char_tensors['train'],
                          FLAGS.batch_size, FLAGS.max_sent_len) 

#load sentiment analysis data
sentiReader = dataloader.SentiDataLoader(
                                        dirpath = '/home/hadoop/trainingandtestdata',
                                        trainfile = 'training.1600000.processed.noemoticon.csv', 
                                        testfile = 'testdata.manual.2009.06.14.csv', 
                                        charVocab = char_vocab
                        )
# sentiReader.load_data()
sentiReader.load_data_fast(
                        '/home/hadoop/ERD/data/senti_train_data.pickle',
                        '/home/hadoop/ERD/data/senti_train_label.pickle',
                        '/home/hadoop/ERD/data/senti_test_data.pickle',
                        '/home/hadoop/ERD/data/senti_test_label.pickle'
                          )


# (self, input_dim, hidden_dim, max_seq_len, max_word_num, class_num, action_num):
print(  FLAGS.embedding_dim, FLAGS.hidden_dim, 
            FLAGS.max_seq_len, FLAGS.max_sent_len, 
                FLAGS.class_num, FLAGS.action_num   )
logger.info(    (FLAGS.embedding_dim, FLAGS.hidden_dim, 
                    FLAGS.max_seq_len, FLAGS.max_sent_len, 
                        FLAGS.class_num, FLAGS.action_num)  )

print(get_curtime() + " Data loaded.")
logger.info(get_curtime() + " Data loaded.")

# # save the Twitter data
# data = get_data()
# with open('data/data_dict.txt', 'wb') as handle:
#     pickle.dump(data, handle, protocol=pickle.HIGHEST_PROTOCOL)

# # save the PTB data
# with open('data/char_tensors.txt', 'wb') as handle:
#     pickle.dump(char_tensors, handle, protocol=pickle.HIGHEST_PROTOCOL)

# with open('data/word_tensors.txt', 'wb') as handle:
#     pickle.dump(word_tensors, handle, protocol=pickle.HIGHEST_PROTOCOL)
    
# with open('data/char_vocab.txt', 'wb') as handle:
#     pickle.dump(char_vocab, handle, protocol=pickle.HIGHEST_PROTOCOL)

# with open('data/word_vocab.txt', 'wb') as handle:
#     pickle.dump(word_vocab, handle, protocol=pickle.HIGHEST_PROTOCOL)

# save the senti data
# with open('data/senti_train_data.pickle', 'wb') as handle:
#     pickle.dump(sentiReader.train_data, handle, protocol=pickle.HIGHEST_PROTOCOL)

# with open('data/senti_train_label.pickle', 'wb') as handle:
#     pickle.dump(sentiReader.train_label, handle, protocol=pickle.HIGHEST_PROTOCOL)
    
# with open('data/senti_test_data.pickle', 'wb') as handle:
#     pickle.dump(sentiReader.test_data, handle, protocol=pickle.HIGHEST_PROTOCOL)

# with open('data/senti_test_label.pickle', 'wb') as handle:
#     pickle.dump(sentiReader.test_label, handle, protocol=pickle.HIGHEST_PROTOCOL)



from model import adict


def InferRDMTrainGraph(char_model, lm, senti_model, rdm_model, batchsize,
                            max_seq_len, max_word_num, max_char_num, 
                                hidden_dim, embedding_dim, class_num):
    input_x = tf.placeholder(
                        tf.int32, 
                        shape = [
                                 batchsize, 
                                 max_seq_len, 
                                 max_word_num, 
                                 max_char_num
                                 ], 
                        name="input_x"
                    )
    input_y = tf.placeholder(
                        tf.float32, 
                        shape = [batchsize, class_num], 
                        name="input_y"
                    )
    x_len = tf.placeholder(
                        tf.int32, 
                        [batchsize], 
                        name="x_len"
                    )
    init_states = tf.placeholder(
                        tf.float32, 
                        [batchsize, hidden_dim], 
                        name="init_states"
                    )
    x_reshape = tf.reshape(
                        input_x, 
                        [
                         batchsize*max_seq_len, 
                         max_word_num, 
                         max_char_num
                        ]
                    )
    print("x_reshape:", x_reshape)
    x_embedding = char_model(x_reshape)
    print("x_embedding:", x_embedding)
    cnn_outs = tf.reshape(
                        x_embedding, 
                        [
                         batchsize*max_seq_len, 
                         max_word_num, 
                         sum(char_model.kernel_features)
                        ]
                    )
    print("cnn_outs:", cnn_outs)
    # words_embedding, sentence_embedding = lm(cnn_outs)
    cnn_outs_list = [tf.squeeze(x, [1]) 
    for x in tf.split(cnn_outs, max_word_num, 1)]
    rdm_init_state = lm.cell.zero_state(
                            batchsize*max_seq_len, 
                            dtype=tf.float32
                        )
    words_embedding, sentence_embedding = tf.contrib.rnn.static_rnn(
                                        lm.cell, 
                                        cnn_outs_list,
                                        initial_state=rdm_init_state, 
                                        dtype=tf.float32
                                    )     
    words_embedding = tf.identity(words_embedding, 
                                    "rnn_out_puts")
    words_embedding = tf.transpose(words_embedding, 
                                        [1, 0, 2])
    print("RDM words_embedding:", words_embedding)
#     x_senti = senti_model(words_embedding)
    words_feature = tf.math.reduce_max( words_embedding , axis=1)
    
    with tf.variable_scope("Train_RDM", reuse=tf.AUTO_REUSE):
        fcn_layer = tf.layers.Dense(hidden_dim, activation=tf.compat.v1.keras.activations.sigmoid)
        x_senti =  fcn_layer(sentence_embedding[-1][-1] + words_feature )
        print("x_senti:", x_senti)
        RDM_Input = tf.reshape(
                            x_senti, 
                            [
                             batchsize, 
                             max_seq_len, 
                             hidden_dim
                            ]
                        )  
        df_outputs, df_last_state = rdm_model(RDM_Input, x_len, init_states)
        
        l2_loss = tf.constant(0.0)
        w_ps = tf.Variable(tf.truncated_normal([hidden_dim, class_num], stddev=0.1)) #
        b_ps = tf.Variable(tf.constant(0.01, shape=[class_num])) #
        l2_loss += tf.nn.l2_loss(w_ps) 
        l2_loss += tf.nn.l2_loss(b_ps) 

        pre_scores = tf.nn.xw_plus_b(df_last_state, w_ps, b_ps, name="p_scores")
        predictions = tf.argmax(pre_scores, 1, name="predictions")

        r_outputs = tf.reshape(df_outputs, [-1, hidden_dim]) #[batchsize*max_seq_len, output_dim]
        scores_seq = tf.nn.softmax(tf.nn.xw_plus_b(r_outputs, w_ps, b_ps)) # [batchsize * max_seq_len, class_num] 
        out_seq = tf.reshape(scores_seq, [-1, max_seq_len, class_num], name="out_seq") #[batchsize, max_seq_len, class_num]

        df_losses = tf.nn.softmax_cross_entropy_with_logits_v2(logits=pre_scores, labels=input_y)
        loss = tf.reduce_mean(df_losses) + 0.1 * l2_loss

        correct_predictions = tf.equal(predictions, tf.argmax(input_y, 1))
        accuracy = tf.reduce_mean(tf.cast(correct_predictions, "float"), name="accuracy")
        
    df_global_step = tf.Variable(0, name="global_step", trainable=False)
    df_train_op = tf.train.AdamOptimizer(0.01).minimize(loss, df_global_step)
    return adict(
                lm_drop_out = lm.drop_out,
                dropout_keep_prob = rdm_model.dropout_keep_prob,
                input_x = input_x,
                input_y = input_y,
                x_len = x_len,
                init_states = init_states,
                pre_scores = pre_scores,
                predictions = predictions,
                r_outputs = r_outputs,
                scores_seq = scores_seq,
                out_seq = out_seq,
                df_losses = df_losses,
                loss = loss,
                correct_predictions = correct_predictions,
                accuracy = accuracy,
                df_global_step = df_global_step,
                df_train_op = df_train_op
            )



W0901 12:17:45.989879 139700702275392 deprecation_wrapper.py:119] From /home/hadoop/ERD/model.py:6: The name tf.losses.Reduction is deprecated. Please use tf.compat.v1.losses.Reduction instead.

Using TensorFlow backend.


max_sent: 31 ,  max_seq_len: 101
5802 data loaded


I0901 12:18:02.012879 139700702275392 logger.py:24] (300, 64, 101, 31, 2, 2)
I0901 12:18:02.013772 139700702275392 logger.py:24] 2019-09-01 12:18:02 Data loaded.


300 64 101 31 2 2
2019-09-01 12:18:02 Data loaded.


In [2]:
def TrainRDMModel(sess, saver, summary_writter, logger, mm, batch_size, t_acc, t_steps, model_dir, new_data_len=[]):
    sum_loss = 0.0
    sum_acc = 0.0
    ret_acc = 0.0
    init_states = np.zeros([batch_size, FLAGS.hidden_dim], dtype=np.float32)

    for i in range(t_steps):
        if len(new_data_len) > 0:
            x, x_len, y = get_df_batch(i, batch_size, new_data_len)
        else:
            x, x_len, y = get_df_batch(i, batch_size)
        feed_dic = {
                        mm.input_x: x, 
                        mm.x_len: x_len, 
                        mm.input_y: y, 
                        mm.init_states: init_states, 
                        mm.dropout_keep_prob: 0.8,
                        mm.lm_drop_out: 0.8
        }
        _, step, loss, acc = sess.run([mm.df_train_op, mm.df_global_step, mm.loss, mm.accuracy], feed_dic)
        
        summary = tf.Summary(value=[
                tf.Summary.Value(tag="step_train_loss", simple_value=loss),
                tf.Summary.Value(tag="step_train_acc", simple_value=acc),
            ])
        
        summary_writer.add_summary(summary, step)    
        sum_loss += loss
        sum_acc += acc

        if i % 10 == 9:
            sum_loss = sum_loss / 10
            sum_acc = sum_acc / 10
            ret_acc = sum_acc
            print(get_curtime() + " Step: " + str(step) + " Training loss: " + str(sum_loss) + " accuracy: " + str(sum_acc))
            logger.info(get_curtime() + " Step: " + str(step) + " Training loss: " + str(sum_loss) + " accuracy: " + str(sum_acc))
            if sum_acc > t_acc:
                break
            sum_acc = 0.0
            sum_loss = 0.0
        if i % 1000 == 999:
            save_as = '%s/epoch%03d_%.4f.model' % (model_dir, epoch, avg_train_loss)
            saver.save(session, save_as)
            print('Saved char model', save_as)
    print(get_curtime() + " Train df Model End.")
    logger.info(get_curtime() + " Train df Model End.")
    return ret_acc        

In [3]:
# reuse model to train RDMModel
gpu_config = tf.ConfigProto(allow_soft_placement=True, log_device_placement=True)
# gpu_config.gpu_options.per_process_gpu_memory_fraction = 0.8
device = "/CPU:0"
# device = "/GPU:0"
with tf.Graph().as_default() as g:
    with tf.Session(graph=g, config=gpu_config) as sess:
        with tf.device('/GPU:0'):
            w2v = lstm_char_cnn.WordEmbedding(
                            max_word_length = FLAGS.max_char_num , 
                            char_vocab_size = char_vocab.size, 
                            char_embed_size = FLAGS.char_embed_size, 
                            kernels = eval(FLAGS.kernels), 
                            kernel_features = eval(FLAGS.kernel_features), 
                            num_highway_layers = FLAGS.highway_layers,
                            embedding_dim = FLAGS.embedding_dim
                        )
            lstm_lm = lstm_char_cnn.LSTM_LM(
                        batch_size = FLAGS.batch_size, 
                        num_unroll_steps = FLAGS.max_sent_len, 
                        rnn_size = FLAGS.embedding_dim, 
                        num_rnn_layers = FLAGS.rnn_layers, 
                        word_vocab_size = word_vocab.size
                    )

            char_train_graph = lstm_char_cnn.infer_train_model(
                                w2v, lstm_lm, 
                                batch_size = FLAGS.batch_size, 
                                num_unroll_steps = FLAGS.max_sent_len, 
                                max_word_length = FLAGS.max_char_num, 
                                learning_rate = FLAGS.learning_rate,
                                max_grad_norm = FLAGS.max_grad_norm
                             )
#             s_model = model.SentiModel(FLAGS.hidden_dim, 5)
#             senti_train_graph = model.InferSentiTrainGraph(
#                                     w2v, 
#                                     lstm_lm, 
#                                     s_model, 
#                                     batchsize=20,
#                                     max_word_num = sentiReader.max_sent_len, 
#                                     max_char_num = FLAGS.max_char_num, 
#                                     hidden_dim = FLAGS.hidden_dim, 
#                                     sent_num = FLAGS.sent_num,
#                                     embedding_dim = FLAGS.embedding_dim
#                                 )
            val_list1 = tf.global_variables()
            saver = tf.train.Saver(val_list1, max_to_keep=4)
            sess.run(tf.variables_initializer(val_list1))
            checkpoint = tf.train.get_checkpoint_state("lstmCharCNNModel/")
            if checkpoint and checkpoint.model_checkpoint_path:
                saver.restore(sess, checkpoint.model_checkpoint_path)
            #RDMModel
            rdm_model = model.RDM_Model(
                    max_seq_len = FLAGS.max_seq_len, 
                    max_word_num = FLAGS.max_sent_len, 
                    embedding_dim = FLAGS.embedding_dim, 
                    hidden_dim = FLAGS.hidden_dim
                )
            rdm_train_graph = InferRDMTrainGraph(
                            w2v, lstm_lm, None, rdm_model, 
                            batchsize=5,
                            max_seq_len = FLAGS.max_seq_len, 
                            max_word_num = FLAGS.max_sent_len, 
                            max_char_num = FLAGS.max_char_num, 
                            hidden_dim = FLAGS.hidden_dim, 
                            embedding_dim = FLAGS.embedding_dim,
                            class_num = FLAGS.class_num
                    )
            val_list2 = tf.global_variables()
            saver2 = tf.train.Saver(val_list2, max_to_keep=4)
            uninitialized_vars = list( filter(lambda var: var not in val_list1, val_list2) )
#             print("uninitialized_vars:", uninitialized_vars)
            sess.run(tf.variables_initializer(uninitialized_vars))
            sess.run(tf.global_variables_initializer())
        summary_writer = tf.summary.FileWriter("RDMGPUTrain/", graph=sess.graph)
        TrainRDMModel(sess, saver, summary_writer, logger, rdm_train_graph, 5, 0.9, 100000, "RDMGPUTrain/", new_data_len=[])


input_: Tensor("input:0", shape=(20, 31, 21), dtype=int32, device=/device:GPU:0)
input_cnn: Tensor("Embedding_1/CNN_OUT/add_7:0", shape=(620, 1100), dtype=float32, device=/device:GPU:0)
x_reshape: Tensor("Reshape_1:0", shape=(505, 31, 21), dtype=int32, device=/device:GPU:0)
input_: Tensor("Reshape_1:0", shape=(505, 31, 21), dtype=int32, device=/device:GPU:0)
input_cnn: Tensor("Embedding_2/CNN_OUT/add_7:0", shape=(15655, 1100), dtype=float32, device=/device:GPU:0)
x_embedding: Tensor("Embedding_2/CNN_OUT/add_7:0", shape=(15655, 1100), dtype=float32, device=/device:GPU:0)
cnn_outs: Tensor("Reshape_2:0", shape=(505, 31, 1100), dtype=float32, device=/device:GPU:0)
RDM words_embedding: Tensor("transpose:0", shape=(505, 31, 300), dtype=float32, device=/device:GPU:0)
x_senti: Tensor("Train_RDM/dense/Sigmoid:0", shape=(505, 64), dtype=float32, device=/device:GPU:0)
Unknown char: 😞
Word: crash😞thoughts
################step 0: Tensor("input_x:0", shape=(5, 101, 31, 21), dtype=int32, device=/devi

I0901 12:18:23.005650 139700702275392 logger.py:24] 2019-09-01 12:18:23 Step: 10 Training loss: 0.7795779228210449 accuracy: 0.5600000143051147


2019-09-01 12:18:23 Step: 10 Training loss: 0.7795779228210449 accuracy: 0.5600000143051147
################step 10: Tensor("input_x:0", shape=(5, 101, 31, 21), dtype=int32, device=/device:GPU:0)
Unknown char: é
Word: gérard
################step 11: Tensor("input_x:0", shape=(5, 101, 31, 21), dtype=int32, device=/device:GPU:0)
Unknown char: é
Word: l'amérique
################step 12: Tensor("input_x:0", shape=(5, 101, 31, 21), dtype=int32, device=/device:GPU:0)
################step 13: Tensor("input_x:0", shape=(5, 101, 31, 21), dtype=int32, device=/device:GPU:0)
################step 14: Tensor("input_x:0", shape=(5, 101, 31, 21), dtype=int32, device=/device:GPU:0)
Unknown char: é
Word: béni
Unknown char: ê
Word: même
################step 15: Tensor("input_x:0", shape=(5, 101, 31, 21), dtype=int32, device=/device:GPU:0)
Unknown char: ã
Word: pensã©es
Unknown char: ©
Word: pensã©es
################step 16: Tensor("input_x:0", shape=(5, 101, 31, 21), dtype=int32, device=/device:GPU:0)
Un

I0901 12:18:26.484260 139700702275392 logger.py:24] 2019-09-01 12:18:26 Step: 20 Training loss: 0.7528831422328949 accuracy: 0.5000000104308129


2019-09-01 12:18:26 Step: 20 Training loss: 0.7528831422328949 accuracy: 0.5000000104308129
################step 20: Tensor("input_x:0", shape=(5, 101, 31, 21), dtype=int32, device=/device:GPU:0)
################step 21: Tensor("input_x:0", shape=(5, 101, 31, 21), dtype=int32, device=/device:GPU:0)
Unknown char: …
Word: me…thats
################step 22: Tensor("input_x:0", shape=(5, 101, 31, 21), dtype=int32, device=/device:GPU:0)
################step 23: Tensor("input_x:0", shape=(5, 101, 31, 21), dtype=int32, device=/device:GPU:0)
Unknown char: é
Word: engagée
Unknown char: é
Word: décrit
Unknown char: è
Word: très
Unknown char: é
Word: occupée
################step 24: Tensor("input_x:0", shape=(5, 101, 31, 21), dtype=int32, device=/device:GPU:0)
Unknown char: é
Word: l'amérique
Unknown char: ü
Word: dürfte
Unknown char: ü
Word: für
Unknown char: ô
Word: tôt
Unknown char: è
Word: hypothèse
Unknown char: é
Word: crée
Unknown char: é
Word: déj
Unknown char: è
Word: après
Unknown char: 

I0901 12:18:29.898418 139700702275392 logger.py:24] 2019-09-01 12:18:29 Step: 30 Training loss: 0.6584962010383606 accuracy: 0.5800000101327896


2019-09-01 12:18:29 Step: 30 Training loss: 0.6584962010383606 accuracy: 0.5800000101327896
Unknown char: í
Word: magníficas
################step 30: Tensor("input_x:0", shape=(5, 101, 31, 21), dtype=int32, device=/device:GPU:0)
Unknown char: é
Word: crémer
################step 31: Tensor("input_x:0", shape=(5, 101, 31, 21), dtype=int32, device=/device:GPU:0)
Unknown char: ü
Word: gülen's
Unknown char: ̇
Word: i̇srailli
Unknown char: ü
Word: tümgeneral
Unknown char: ç
Word: koçavi
Unknown char: ü
Word: türkiye’de
Unknown char: ü
Word: türkiye’de
Unknown char: ş
Word: işi̇d
Unknown char: ̇
Word: işi̇d
Unknown char: ö
Word: teröristlerini
Unknown char: ğ
Word: eğiten
Unknown char: ı
Word: sayıda
Unknown char: ̇
Word: i̇slam
Unknown char: ı
Word: adına
Unknown char: ö
Word: terörün
Unknown char: ü
Word: terörün
Unknown char: ı
Word: saldırısında
Unknown char: ı
Word: saldırısında
Unknown char: ı
Word: saldırısında
Unknown char: ı
Word: hayatın
Unknown char: ü
Word: müslüman
Unknown char: 

I0901 12:18:33.354232 139700702275392 logger.py:24] 2019-09-01 12:18:33 Step: 40 Training loss: 0.6275402277708053 accuracy: 0.7400000154972076


2019-09-01 12:18:33 Step: 40 Training loss: 0.6275402277708053 accuracy: 0.7400000154972076
Unknown char: í
Word: vía
################step 40: Tensor("input_x:0", shape=(5, 101, 31, 21), dtype=int32, device=/device:GPU:0)
Unknown char: é
Word: l'amérique
################step 41: Tensor("input_x:0", shape=(5, 101, 31, 21), dtype=int32, device=/device:GPU:0)
################step 42: Tensor("input_x:0", shape=(5, 101, 31, 21), dtype=int32, device=/device:GPU:0)
################step 43: Tensor("input_x:0", shape=(5, 101, 31, 21), dtype=int32, device=/device:GPU:0)
################step 44: Tensor("input_x:0", shape=(5, 101, 31, 21), dtype=int32, device=/device:GPU:0)
Unknown char: í
Word: vía
Unknown char: ó
Word: situación
Unknown char: í
Word: parís
Unknown char: é
Word: rehén
Unknown char: í
Word: estadísticas
Unknown char: ó
Word: discriminación
Unknown char: á
Word: bán
################step 45: Tensor("input_x:0", shape=(5, 101, 31, 21), dtype=int32, device=/device:GPU:0)
Unknown char:

KeyboardInterrupt: 