In [1]:
import tensorflow as tf
import numpy as np
import os
import time
import datetime
import helper
from text_cnn import TextCNN
from tensorflow.contrib import learn
import csv
import sys
from sklearn import metrics
import yaml
import pandas as pd


In [2]:
# Parameters
# ==================================================

# Data Parameters
#tf.flags.DEFINE_string("positive_data_file", "./data/rt-polaritydata/rt-polarity.pos", "Data source for the positive data.")
#tf.flags.DEFINE_string("negative_data_file", "./data/rt-polaritydata/rt-polarity.neg", "Data source for the negative data.")

# Eval Parameters
#tf.flags.DEFINE_integer("batch_size", 64, "Batch Size (default: 64)") 
tf.flags.DEFINE_string("checkpoint_dir", "./runs/1545086818/checkpoints/", "Checkpoint directory from training run")
tf.flags.DEFINE_boolean("eval_train", True, "Evaluate on all training data")
tf.flags.DEFINE_integer("embedding_dim", 128, "Dimensionality of character embedding (default: 128)")
tf.flags.DEFINE_integer("num_filters", 128, "Number of filters per filter size (default: 128)")
tf.flags.DEFINE_string("filter_sizes", "3,4,5", "Comma-separated filter sizes (default: '3,4,5')")
tf.flags.DEFINE_boolean("enable_word_embeddings", True, "Enable/disable the word embedding (default: True)")

# Misc Parameters
tf.flags.DEFINE_boolean("allow_soft_placement", True, "Allow device soft device placement")
tf.flags.DEFINE_boolean("log_device_placement", False, "Log placement of ops on devices")


tf.app.flags.DEFINE_string('f', '', 'kernel')
FLAGS = tf.flags.FLAGS
FLAGS(sys.argv)
print("\nParameters:")

for attr, value in sorted(FLAGS.__flags.items()):
    print("{}={}".format(attr.upper(), value))
print("")

#load config file
with open("config.yml", 'r') as ymlfile:
    cfg = yaml.load(ymlfile)


if FLAGS.enable_word_embeddings and cfg['word_embeddings']['default'] is not None:
    embedding_name = cfg['word_embeddings']['default']
    embedding_dimension = cfg['word_embeddings'][embedding_name]['dimension']
else:
    embedding_dimension = FLAGS.embedding_dim



Parameters:
ALLOW_SOFT_PLACEMENT=<absl.flags._flag.BooleanFlag object at 0x7f1906e3d4e0>
CHECKPOINT_DIR=<absl.flags._flag.Flag object at 0x7f1906e3d1d0>
EMBEDDING_DIM=<absl.flags._flag.Flag object at 0x7f1906e3d320>
ENABLE_WORD_EMBEDDINGS=<absl.flags._flag.BooleanFlag object at 0x7f1906e3d400>
EVAL_TRAIN=<absl.flags._flag.BooleanFlag object at 0x7f1906e3d160>
F=<absl.flags._flag.Flag object at 0x7f1906e3d668>
FILTER_SIZES=<absl.flags._flag.Flag object at 0x7f1906e3d470>
LOG_DEVICE_PLACEMENT=<absl.flags._flag.BooleanFlag object at 0x7f1906e3d550>
NUM_FILTERS=<absl.flags._flag.Flag object at 0x7f1906e3d3c8>



In [3]:
def softmax(x):
    """Compute softmax values for each sets of scores in x."""
    if x.ndim == 1:
        x = x.reshape((1, -1))
    max_x = np.max(x, axis=1).reshape((-1, 1))
    exp_x = np.exp(x - max_x)
    return exp_x / np.sum(exp_x, axis=1).reshape((-1, 1))

In [4]:
datasets = None
# CHANGE THIS: Load data. Load your own data here
dataset_name = cfg["datasets"]["default"]
if FLAGS.eval_train:
    #x_raw, y_test = helper.load_data_and_labels(FLAGS.positive_data_file, FLAGS.negative_data_file)
    if dataset_name == "mrpolarity":
        datasets = helper.get_datasets_mrpolarity(cfg["datasets"][dataset_name]["positive_data_file"]["path"],
                                                        cfg["datasets"][dataset_name]["negative_data_file"]["path"])
    elif dataset_name == "20newsgroup":
        datasets = helper.get_datasets_20newsgroup(subset="test",
                                              categories=cfg["datasets"][dataset_name]["categories"],
                                              shuffle=cfg["datasets"][dataset_name]["shuffle"],
                                              random_state=cfg["datasets"][dataset_name]["random_state"])
    x_raw, y_test = helper.load_data_labels(datasets)
    num_categories = y_test.shape[1]
    y_test = np.argmax(y_test, axis=1)
    
else:
    #x_raw = ["a masterpiece four years in the making", "everything is off."]
    #y_test = [1, 0]
    if dataset_name == "mrpolarity":
        datasets = {"target_names": ['positive_examples', 'negative_examples']}
        x_raw = ["a masterpiece four years in the making", "everything is off."]
        y_test = [1, 0]
        num_categories = 2
    else:
        datasets = {"target_names": ['alt.atheism', 'comp.graphics', 'sci.med', 'soc.religion.christian']}
        x_raw = ["The number of reported cases of gonorrhea in Colorado increased",
                 "I am in the market for a 24-bit graphics card for a PC"]
        y_test = [2, 1]
        num_categories = 4

# Map data into vocabulary
vocab_path = os.path.join(FLAGS.checkpoint_dir, "..", "vocab")
vocab_processor = learn.preprocessing.VocabularyProcessor.restore(vocab_path)
x_test = np.array(list(vocab_processor.transform(x_raw)))


Instructions for updating:
Please use tensorflow/transform or tf.data.


In [5]:
print("\nEvaluating...\n")

# Evaluation
# ==================================================
checkpoint_file = tf.train.latest_checkpoint(FLAGS.checkpoint_dir)
graph = tf.Graph()
gpu_options=tf.GPUOptions()
gpu_options.allow_growth = True
with graph.as_default():
    session_conf = tf.ConfigProto(
      allow_soft_placement=FLAGS.allow_soft_placement,
      log_device_placement=FLAGS.log_device_placement,
      gpu_options=gpu_options)
    sess = tf.Session(config=session_conf)
    with sess.as_default():
        # Load the saved meta graph and restore variables
        saver = tf.train.import_meta_graph("{}.meta".format(checkpoint_file))
        saver.restore(sess, checkpoint_file)

        # Get the placeholders from the graph by name
        input_x = graph.get_operation_by_name("input_x").outputs[0]
        input_y = graph.get_operation_by_name("input_y").outputs[0]
        dropout_keep_prob = graph.get_operation_by_name("dropout_keep_prob").outputs[0]

        # Tensors we want to evaluate
        predictions = graph.get_operation_by_name("output/predictions").outputs[0]
        scores = graph.get_operation_by_name("output/scores").outputs[0]
        embeddings = graph.get_operation_by_name("embedding/embedded_chars").outputs[0]
        pool_convs = graph.get_operation_by_name("pool/h_pool_flat").outputs[0]

        # Generate batches for one epoch
        batches = helper.batch_iter(list(x_test), FLAGS.batch_size, 1, shuffle=False)

        # Collect the predictions here
        total_filters = len(FLAGS.filter_sizes.split(',')) * FLAGS.num_filters
        all_predictions = []
        data_out=np.empty([64,3])
        all_probabilities = None
      
        all_scores = np.zeros((FLAGS.batch_size, num_categories))
        all_embeddings = np.zeros((FLAGS.batch_size, int(input_x.shape[1] * embedding_dimension)))
        all_pool_convs = np.zeros((FLAGS.batch_size, total_filters))

        #pd_data = pd.DataFrame(columns=['embeddings', 'pool_convs','scores'])
        pd_data = pd.DataFrame()
        out_dir = os.path.abspath(os.path.join(os.path.curdir, "output"))
        if not os.path.exists(out_dir):
            os.makedirs(out_dir)
            
        print("Ready for the run...")    

        for x_test_batch in batches:
            batch_predictions, batch_scores, batch_embeddings, batch_pool_convs = sess.run([predictions, scores, embeddings, pool_convs], {input_x: x_test_batch, dropout_keep_prob: 1.0})
            
            #flatten arrays            
            batch_embeddings_flat = batch_embeddings.reshape(batch_embeddings.shape[0],-1)
            #print("embeddings size:",  batch_embeddings_flat.shape)
            batch_scores_flat = batch_scores.reshape(batch_scores.shape[0],-1)
            #print("scores size:",  batch_scores_flat.shape)
            batch_pool_convs_flat = batch_pool_convs.reshape(batch_pool_convs.shape[0],-1)
            #print("pool_convs size:",  batch_pool_convs_flat.shape)
            
            #Padding
            pad = np.zeros((batch_embeddings_flat.shape))
            pool_conv_padded = pad
            pool_conv_padded[0:batch_pool_convs_flat.shape[0],0:batch_pool_convs_flat.shape[1]] = batch_pool_convs_flat
            scores_padded = pad
            scores_padded[0:batch_scores_flat.shape[0],0:batch_scores_flat.shape[1]] = batch_scores_flat
            
            
            #Write to batch file .csv
            time_str = datetime.datetime.now().isoformat()
            file_name = "output_data" + time_str + ".csv"
            out_file_path = os.path.join(out_dir, file_name)
            data = np.hstack((batch_embeddings_flat, pool_conv_padded, scores_padded))
            print(data.shape)
            #pd_data = pd.DataFrame.from_records(data, columns=["embeddings", "pool_conv", "scores"])
            #data_dict = {'embeddings': batch_embeddings_flat,'pool_convs': pool_conv_padded,
            #                                      'scores': scores_padded}
            #pd_data= pd.DataFrame.from_dict(data_dict)
            
            #pd_data.to_csv(path_or_buf=out_file_path, sep=',', float_format=None, header=True, index=True, 
            #               mode='w', line_terminator='\n')
            np.savetxt(out_file_path, data, delimiter=',')
            
            all_predictions = np.concatenate([all_predictions, batch_predictions])
            probabilities = softmax(batch_scores)
            
            if all_probabilities is not None:
                all_probabilities = np.concatenate([all_probabilities, probabilities])
            else:
                all_probabilities = probabilities



Evaluating...



NotFoundError: /home/maria/TextGAN/cnn-text-classification/My_Text_Classification_v3/runs/1545086818/checkpoints; No such file or directory

In [None]:
print(len(data_out))

In [None]:
# Print accuracy if y_test is defined
if y_test is not None:
    correct_predictions = float(sum(all_predictions == y_test))
    print("Total number of test examples: {}".format(len(y_test)))
    print("Accuracy: {:g}".format(correct_predictions/float(len(y_test))))
    print(metrics.classification_report(y_test, all_predictions, target_names=datasets['target_names']))
    print(metrics.confusion_matrix(y_test, all_predictions))

# Save the evaluation to a csv
# predictions_human_readable = np.column_stack((np.array(x_raw), all_predictions))
predictions_human_readable = np.column_stack((np.array(x_raw),
                                              [int(prediction) for prediction in all_predictions],
                                              [ "{}".format(probability) for probability in all_probabilities]))
out_path = os.path.join(FLAGS.checkpoint_dir, "..", "prediction.csv")
print("Saving evaluation to {0}".format(out_path))
with open(out_path, 'w') as f:
    csv.writer(f).writerows(predictions_human_readable)