In [8]:
import tensorflow as tf
import numpy as np
import os
import data_helpers
from tensorflow.contrib import learn
import csv
import codecs
import jieba

In [2]:
# Parameters
# ==================================================
flags = tf.flags  # short alias for the flag-definition helpers

# Data parameters: one text file per class.
flags.DEFINE_string("positive_data_file", "./data/target.txt", "Data source for the positive data.")
flags.DEFINE_string("negative_data_file", "./data/non_target.txt", "Data source for the negative data.")

# Evaluation parameters.
flags.DEFINE_integer("batch_size", 64, "Batch Size (default: 64)")
flags.DEFINE_string("checkpoint_dir", "", "Checkpoint directory from training run")
flags.DEFINE_boolean("eval_train", False, "Evaluate on all training data")

# Session / device parameters.
flags.DEFINE_boolean("allow_soft_placement", True, "Allow device soft device placement")
flags.DEFINE_boolean("log_device_placement", False, "Log placement of ops on devices")

# NOTE(review): presumably absorbs the `-f <connection file>` argument that
# Jupyter passes to the kernel, so flag parsing does not reject it — confirm.
flags.DEFINE_string('f', '', 'kernel')

FLAGS = flags.FLAGS

In [3]:
def _keywords_per_line(data_file):
    """Return one keyword list per line of the UTF-8 text file `data_file`."""
    # Imported here because a bare `import jieba` does not guarantee that the
    # `analyse` submodule has been loaded.
    import jieba.analyse

    # `with` closes the file handle even if keyword extraction raises.
    with codecs.open(data_file, "r", "utf-8") as handle:
        lines = handle.readlines()
    return [
        list(jieba.analyse.extract_tags(line, withWeight=False, topK=20,
                                        allowPOS=('n', 'v', 'nt', 'vn')))
        for line in lines
    ]


def load_data_and_labels(positive_data_file, negative_data_file):
    """
    Loads the positive and negative corpora, reduces every line to its jieba
    keywords and generates one-hot labels.

    Args:
        positive_data_file: path to a UTF-8 text file, one positive example per line.
        negative_data_file: path to a UTF-8 text file, one negative example per line.

    Returns:
        [x_text, y] where x_text is a list of keyword lists (positives first,
        then negatives; at most 20 keywords each) and y is an integer array of
        shape (len(x_text), 2) holding [0, 1] for positive examples and
        [1, 0] for negative examples.
    """
    positive_examples = _keywords_per_line(positive_data_file)
    negative_examples = _keywords_per_line(negative_data_file)

    # Combine lists
    x_text = positive_examples + negative_examples

    # Generate labels. Building one array and reshaping (instead of
    # concatenating two lists) also works when one of the files is empty.
    labels = [[0, 1]] * len(positive_examples) + [[1, 0]] * len(negative_examples)
    y = np.array(labels, dtype=int).reshape(-1, 2)
    return [x_text, y]

In [4]:
# NOTE(review): this prints the bound method `FLAGS.append_flag_values` itself
# (it is never called) — looks like leftover debugging.
print(FLAGS.append_flag_values)

<bound method FlagValues.append_flag_values of <absl.flags._flagvalues.FlagValues object at 0x123488d68>>


In [132]:
%%time
# Extract keywords and labels for both corpora. The recorded run took about
# 34 minutes of wall time, so avoid re-running this cell unnecessarily.
x_raw, y = load_data_and_labels(FLAGS.positive_data_file,FLAGS.negative_data_file)

CPU times: user 33min 18s, sys: 45.4 s, total: 34min 3s
Wall time: 34min 43s


In [133]:
# Spot-check: keyword list extracted for the second example (index 1).
x_raw[1]

['返佣',
 '平台',
 '个点',
 '点回',
 '回本',
 '批文',
 '模板',
 '成本低',
 '手续费',
 '金评',
 '参返',
 '亮点',
 '杠杆',
 '开户',
 '大图',
 '交易所',
 '国务院',
 '轮播',
 '推荐',
 '添加']

In [134]:
# One-hot labels: positive rows ([0, 1]) come first, then negative rows ([1, 0]).
y

array([[0, 1],
       [0, 1],
       [0, 1],
       ...,
       [1, 0],
       [1, 0],
       [1, 0]])

In [135]:
# Vocabulary file inside the hard-coded run directory.
# NOTE(review): the evaluation cell hard-codes the same run id for its checkpoint path.
vocab_path = os.path.join("./runs/1531822384/", "vocab")

In [39]:
# Confirm the resolved vocabulary path.
print(vocab_path)

./runs/1531822384/vocab


In [136]:
# Restore the saved VocabularyProcessor from vocab_path.
# NOTE(review): presumably the one written by the training run whose checkpoint
# is evaluated below, so that token ids match — confirm.
vocab_processor = learn.preprocessing.VocabularyProcessor.restore(vocab_path)

In [66]:
# Word -> id mapping of the restored vocabulary.
# NOTE(review): `_mapping` is a private attribute; this assignment is repeated in later cells.
vocab_dict = vocab_processor.vocabulary_._mapping

In [72]:
# Collapse each one-hot label row to its class index (1 = positive, 0 = negative).
y_test = y.argmax(axis=1)

In [73]:
# Display the class-index labels derived from y.
y_test

array([1, 1, 1, ..., 0, 0, 0])

## Load word2vector

In [97]:
def loadWord2Vector(filename):
    '''
    Load word vectors from a UTF-8 text file with one `word v1 v2 ... vN`
    entry per line (fields separated by single spaces).

    A first line consisting of exactly two integers is treated as a
    word2vec-style `<vocab_size> <dimension>` header and skipped, so it does
    not end up in the vocabulary as a bogus one-dimensional vector.
    Blank lines are ignored.

    Returns a tuple of:
    Word2Vector_vocab – list of the words that we now have embeddings for
    Word2Vector_embed – list of lists containing the embedding vectors
    embedding_dict – dictionary where the words are the keys and the embeddings are the values
    '''

    Word2Vector_vocab = []
    Word2Vector_embed = []
    embedding_dict = {}

    # Explicit UTF-8: the platform default encoding is not guaranteed to be
    # able to decode non-ASCII words.
    with open(filename, 'r', encoding='utf-8') as file:
        # Iterate lazily instead of readlines() to avoid holding the whole
        # file in memory in addition to the parsed vectors.
        for line_no, line in enumerate(file):
            row = line.strip().split(' ')
            if row == ['']:
                continue  # blank line
            if line_no == 0 and len(row) == 2 and row[0].isdigit() and row[1].isdigit():
                continue  # header line, not an embedding
            vocab_word = row[0]
            # `if i` drops empty fields produced by doubled spaces.
            embed_vector = [float(i) for i in row[1:] if i]
            Word2Vector_vocab.append(vocab_word)
            embedding_dict[vocab_word] = embed_vector
            Word2Vector_embed.append(embed_vector)

    print('Word2Vector Loaded Successfully')
    return Word2Vector_vocab, Word2Vector_embed, embedding_dict

In [99]:
%%time
# Parse the embedding file into a word list, a vector list and a word -> vector dict.
# The recorded run took roughly 1.5 minutes of wall time.
Word2Vector_vocab,Word2Vector_embed,embedding_dict = loadWord2Vector(filename = "./embd/sgns.sogounews.bigram-char")

Word2Vector Loaded Successfully
CPU times: user 55.1 s, sys: 29.9 s, total: 1min 25s
Wall time: 1min 38s


In [94]:
# Word -> id mapping of the restored vocabulary (same assignment as in an earlier cell).
vocab_dict = vocab_processor.vocabulary_._mapping

In [95]:
# Spot-check: vocabulary id assigned to one word.
vocab_dict['消费']

782

In [103]:
# Build Embedding array: one row per vocabulary id, in id order.
EMBEDDING_DIM = 300  # length of the random vectors used for words without an embedding
doc_vocab_size = len(vocab_processor.vocabulary_)

# Extract word:id mapping from the object.
vocab_dict = vocab_processor.vocabulary_._mapping

# Sort the vocabulary dictionary on the basis of values(id).
dict_as_list = sorted(vocab_dict.items(), key=lambda x: x[1])

embeddings_tmp = []

for i in range(doc_vocab_size):
    item = dict_as_list[i][0]
    # Look the word up in the dict (hash lookup) rather than scanning the
    # Word2Vector_vocab list; both hold exactly the same words.
    if item in embedding_dict:
        embeddings_tmp.append(embedding_dict[item])
    else:
        # No embedding for this word: fall back to a small random vector.
        rand_num = np.random.uniform(low=-0.2, high=0.2, size=EMBEDDING_DIM)
        embeddings_tmp.append(rand_num)

In [105]:
# Stack the per-word vectors into a single array (one row per vocabulary id).
# NOTE(review): `embedding` is not referenced by the evaluation cell further down.
embedding = np.asarray(embeddings_tmp)

In [106]:
# Display the assembled embedding array.
embedding

array([[-0.02892246, -0.12089643, -0.1180916 , ...,  0.09872085,
        -0.09633021, -0.19369231],
       [-0.631278  ,  0.521143  , -1.210023  , ..., -0.019455  ,
         0.128484  ,  0.576405  ],
       [ 0.10895765, -0.16969516,  0.04487347, ..., -0.18535981,
         0.14603186,  0.09791655],
       ...,
       [-0.12706099,  0.11840921, -0.16086174, ..., -0.18615561,
         0.0952969 ,  0.05164323],
       [-0.12267822,  0.00447117, -0.17404047, ..., -0.02802657,
         0.17591435, -0.11371115],
       [-0.11995046, -0.18648953, -0.14091931, ..., -0.02468655,
        -0.16428729, -0.18802503]])

In [137]:
# Re-join every keyword list into one space-separated string, then let the
# restored vocabulary processor map each string to a row of word ids.
text_list = [' '.join(text) for text in x_raw]
id_rows = vocab_processor.transform(text_list)
x_test = np.array(list(id_rows))

In [138]:
# Number of keywords kept for the second example. `x_raw` is the notebook-level
# name; `x_text` exists only inside load_data_and_labels.
len(x_raw[1])

20

In [140]:
# Spot-check: word-id row produced for the third example (index 2).
x_test[2]

array([41, 42, 43, 22, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 17, 54, 55,
       56, 57, 58])

In [129]:
def batch_iter(data, batch_size, num_epochs, shuffle=True):
    """
    Generates a batch iterator for a dataset.

    Args:
        data: sequence of examples; converted to a NumPy array once up front.
        batch_size: maximum number of examples per batch.
        num_epochs: number of full passes over `data`.
        shuffle: if True, draw a fresh random permutation at the start of
            every epoch; if False, keep the original order.

    Yields:
        Consecutive slices of at most `batch_size` examples. The last slice of
        an epoch may be shorter. Nothing is yielded when `data` is empty.
    """
    data = np.array(data)
    data_size = len(data)
    # Integer ceiling division: exact for any size and 0 for empty input
    # (the float-based form produced one spurious empty batch in that case).
    num_batches_per_epoch = (data_size + batch_size - 1) // batch_size
    for epoch in range(num_epochs):
        # Shuffle the data at each epoch
        if shuffle:
            shuffle_indices = np.random.permutation(np.arange(data_size))
            shuffled_data = data[shuffle_indices]
        else:
            shuffled_data = data
        for batch_num in range(num_batches_per_epoch):
            start_index = batch_num * batch_size
            end_index = min((batch_num + 1) * batch_size, data_size)
            yield shuffled_data[start_index:end_index]

In [143]:
# Restore the latest checkpoint of the run and predict a class for every row of x_test.
checkpoint_file = tf.train.latest_checkpoint("./runs/1531822384/checkpoints")
if checkpoint_file is None:
    # latest_checkpoint returns None when nothing is found; fail with a clear
    # message instead of trying to import "None.meta".
    raise FileNotFoundError("No checkpoint found in ./runs/1531822384/checkpoints")
graph = tf.Graph()
with graph.as_default():
    session_conf = tf.ConfigProto(
      allow_soft_placement=FLAGS.allow_soft_placement,
      log_device_placement=FLAGS.log_device_placement)
    # Used as a context manager so the session is closed (and its resources
    # released) once prediction has finished.
    with tf.Session(config=session_conf) as sess:
        # Load the saved meta graph and restore variables
        saver = tf.train.import_meta_graph("{}.meta".format(checkpoint_file))
        saver.restore(sess, checkpoint_file)

        # Get the placeholders from the graph by name
        input_x = graph.get_operation_by_name("input_x").outputs[0]
        dropout_keep_prob = graph.get_operation_by_name("dropout_keep_prob").outputs[0]

        # Tensors we want to evaluate
        predictions = graph.get_operation_by_name("output/predictions").outputs[0]

        # Generate batches for one epoch, in the original order so that the
        # predictions line up with y_test and x_raw.
        batches = batch_iter(list(x_test), FLAGS.batch_size, 1, shuffle=False)

        # Collect the per-batch predictions and join them once afterwards,
        # rather than re-copying the growing array on every batch.
        batch_outputs = []
        for x_test_batch in batches:
            batch_predictions = sess.run(predictions, {input_x: x_test_batch, dropout_keep_prob: 1.0})
            batch_outputs.append(batch_predictions)

# The leading empty array reproduces the dtype promotion of the original
# `[]`-seeded concatenation and keeps this valid when there are no batches.
all_predictions = np.concatenate([np.array([])] + batch_outputs)

# Print accuracy if y_test is defined
if y_test is not None:
    correct_predictions = float(np.sum(all_predictions == y_test))
    print("Total number of test examples: {}".format(len(y_test)))
    print("Accuracy: {:g}".format(correct_predictions/float(len(y_test))))

# Save the evaluation to a csv
predictions_human_readable = np.column_stack((np.array(x_raw), all_predictions))
out_path = os.path.join(FLAGS.checkpoint_dir, ".", "prediction.csv")
print("Saving evaluation to {0}".format(out_path))
# newline='' is what the csv module expects of its file object; UTF-8 is set
# explicitly because the rows contain non-ASCII keywords.
with open(out_path, 'w', newline='', encoding='utf-8') as f:
    csv.writer(f).writerows(predictions_human_readable)

INFO:tensorflow:Restoring parameters from /Users/zhangnan/Rebate/runs/1531822384/checkpoints/model-27400
Total number of test examples: 10922
Accuracy: 0.996704
Saving evaluation to ./prediction.csv


In [144]:
# Show where the prediction CSV was written.
print(out_path)

./prediction.csv


In [146]:
# Sanity check: one prediction per evaluated example.
all_predictions.shape

(10922,)