In [1]:
################################################################################
################################################################################
##

# https://github.com/dennybritz/cnn-text-classification-tf/blob/master/text_cnn.py

################################################################################
################################################################################
## plan

# 1. open dataset
# 2. clean dataset
# 3. munge dataset
# 4. make test train
# 5. make constants
# 6. make placeholders
# 7. define network
# 8. define loss
# 9. define train_op
# 10. define development_op
# 11. seesion: epochs, batches


In [2]:
######################################################
######################################################
## 0. admin

from datetime import datetime

################################################################################
################################################################################
##

now = datetime.utcnow().strftime("%Y%m%d%H%M%S")

ROOT_LOG_DIRECTORY = "../output/tf-logs"
LOG_DIRECTORY = "{}/run-{}/".format(ROOT_LOG_DIRECTORY, now)
print("Writing to {}\n".format(LOG_DIRECTORY))

################################################################################
################################################################################
##

def make_idx_batches(data, batch_size):
    idx_array = np.arange(data.shape[0])
    n = len(idx_array)
    res = []
    for i in range(0, n, batch_size):
        res.append(idx_array[i:(i+batch_size)])
    return res

Writing to ../output/tf-logs/run-20180127111344/



In [3]:
################################################################################
################################################################################
## 1. open dataset

import pandas as pd

################################################################################
################################################################################
##

response_df = pd.read_pickle("../output/response_df.pkl")
train_df = pd.read_pickle("../output/train_df.pkl")


In [4]:
################################################################################
################################################################################
## 2. clean dataset
## 3. munge dataset

import re
import numpy as np

################################################################################
################################################################################
##

def clean_string(string):
    string = re.sub(r"[^A-Za-z0-9(),!?\'\`]", " ", string)
    string = re.sub(r"\'s", " \'s", string)
    string = re.sub(r"\'m", " \'m", string)
    string = re.sub(r"\'ve", " \'ve", string)
    string = re.sub(r"n\'t", " n\'t", string)
    string = re.sub(r"\'re", " \'re", string)
    string = re.sub(r"\'d", " \'d", string)
    string = re.sub(r"\'ll", " \'ll", string)
    string = re.sub(r",", " , ", string)
    string = re.sub(r"!", " ! ", string)
    string = re.sub(r"\(", " \( ", string)
    string = re.sub(r"\)", " \) ", string)
    string = re.sub(r"\?", " \? ", string)
    string = re.sub(r",", "", string)
    string = re.sub(r"\s{2,}", " ", string)
    return string.strip()

comment_text_list = train_df['comment_text'].str.lower().tolist()
comment_text_list_clean = [clean_string(x) for x in comment_text_list]

toxic_list = train_df['toxic'].tolist()
toxic_labels = np.array([[0, 1] if x == 1 else [1, 0] for x in toxic_list])

severe_toxic_list = train_df['severe_toxic'].tolist()
severe_toxic_labels = np.array([[0, 1] if x == 1 else [1, 0] for x in severe_toxic_list])

obscene_list = train_df['obscene'].tolist()
obscene_labels = np.array([[0, 1] if x == 1 else [1, 0] for x in obscene_list])

threat_list = train_df['threat'].tolist()
threat_labels = np.array([[0, 1] if x == 1 else [1, 0] for x in threat_list])

insult_list = train_df['insult'].tolist()
insult_labels = np.array([[0, 1] if x == 1 else [1, 0] for x in insult_list])

identity_hate_list = train_df['identity_hate'].tolist()
identity_hate_labels = np.array([[0, 1] if x == 1 else [1, 0] for x in identity_hate_list])


In [5]:
################################################################################
################################################################################
## 

max_document_length = max([len(x.split(" ")) for x in comment_text_list_clean])
MAX_DOCUMENT_LENGTH = int(max_document_length/7)
MIN_FREQUENCY = 10

################################################################################
################################################################################

In [6]:
################################################################################
################################################################################
## 

from tensorflow.contrib.learn import preprocessing

################################################################################
################################################################################
## 

vocabulary_processor = preprocessing.VocabularyProcessor(
    max_document_length = MAX_DOCUMENT_LENGTH,
    min_frequency = MIN_FREQUENCY
)

x_comment_text = np.array(list(vocabulary_processor.fit_transform(comment_text_list_clean)))


  from ._conv import register_converters as _register_converters


In [7]:
################################################################################
################################################################################
## 4. make test train

np.random.seed(10)

train_proportion = 0.8
shuffled_idx = np.random.permutation(np.arange(len(x_comment_text)))

train_idx = np.random.choice(shuffled_idx, size = int(train_proportion * len(shuffled_idx)), replace = False)
development_idx = np.array([x for x in shuffled_idx if not x in train_idx])

x_comment_text_train = x_comment_text[train_idx]

toxic_labels_train = toxic_labels[train_idx]
severe_toxic_labels_train = severe_toxic_labels[train_idx]
obscene_labels_train = obscene_labels[train_idx]
threat_labels_train = threat_labels[train_idx]
insult_labels_train = insult_labels[train_idx]
identity_hate_labels_train = identity_hate_labels[train_idx]

x_comment_text_development = x_comment_text[development_idx]

toxic_labels_development = toxic_labels[development_idx]
severe_toxic_labels_development = severe_toxic_labels[development_idx]
obscene_labels_development = obscene_labels[development_idx]
threat_labels_development = threat_labels[development_idx]
insult_labels_development = insult_labels[development_idx]
identity_hate_labels_development = identity_hate_labels[development_idx]


In [8]:
################################################################################
################################################################################
## constants

SEQUENCE_LENGTH = MAX_DOCUMENT_LENGTH
NUM_CLASSES = toxic_labels_train.shape[1]

################################################################################
################################################################################

In [9]:
################################################################################
################################################################################
## 

import tensorflow as tf

################################################################################
################################################################################
## 6. make placeholders

tf.reset_default_graph()
graph = tf.Graph()

with graph.as_default():
    
    comment_text_placeholder = tf.placeholder(tf.int32, shape = (None, SEQUENCE_LENGTH))

    toxic_placeholder = tf.placeholder(dtype = tf.float32, shape = (None, NUM_CLASSES))
    severe_toxic_placeholder = tf.placeholder(dtype = tf.float32, shape = (None, NUM_CLASSES))
    obscene_placeholder = tf.placeholder(dtype = tf.float32, shape = (None, NUM_CLASSES))
    threat_placeholder = tf.placeholder(dtype = tf.float32, shape = (None, NUM_CLASSES))
    insult_placeholder = tf.placeholder(dtype = tf.float32, shape = (None, NUM_CLASSES))
    identity_hate_placeholder = tf.placeholder(dtype = tf.float32, shape = (None, NUM_CLASSES))

################################################################################
################################################################################

In [10]:
################################################################################
################################################################################
## train parameters

BATCH_SIZE = 50
NUM_EPOCHS = 100
NUM_DISPLAY_STEPS = 100
LEARNING_RATE = 0.001

################################################################################
################################################################################

In [11]:
################################################################################
################################################################################
##

VOCABULARY_SIZE = len(vocabulary_processor.vocabulary_)
EMBEDDING_SIZE = 50
NUM_FILTERS = 10
KERNEL_SIZE = 1
FILTER_SHAPE = [KERNEL_SIZE, EMBEDDING_SIZE, NUM_FILTERS]
POOLING_SIZE = SEQUENCE_LENGTH - KERNEL_SIZE + 1
STRIDE = 1
NUM_HIDDEN_1 = int(NUM_FILTERS/2)
NUM_CLASSES = 2

################################################################################
################################################################################

In [12]:

################################################################################
################################################################################
## 7. define network

def make_hidden_1(inputs, name):
     return tf.layers.dense(
        inputs = inputs, 
        units = NUM_UNITS_HIDDEN_1,
        name = name
     )
    
with graph.as_default():

    with tf.variable_scope("network", reuse = tf.AUTO_REUSE):

        embedding_coefficients = tf.Variable(
            initial_value = tf.random_uniform([VOCABULARY_SIZE, EMBEDDING_SIZE], -1.0, 1.0), 
            name = "embedding_coefficients"
        )
        word_embeddings = tf.nn.embedding_lookup(embedding_coefficients, comment_text_placeholder)
        # shape: [batch, SEQUENCE_LENGTH, EMBEDDING_SIZE]

        filter_1_coefficients = tf.Variable(
            initial_value = tf.truncated_normal(FILTER_SHAPE, stddev = 0.1), 
            name = "filter_1_coefficients"
        )
        filter_1 = tf.nn.conv1d(
            value = word_embeddings,
            filters = filter_1_coefficients,
            stride = STRIDE,
            padding = "VALID",
            name = "filter_1"
        )

        filter_1_bias = tf.Variable(tf.constant(0.1, shape = [NUM_FILTERS]), name = "filter_1_bias")
        # initialiser?
        convolution_1 = tf.nn.relu(tf.nn.bias_add(filter_1, filter_1_bias), name = "convolution_1")
        
        # print(convolution_1.shape) # [batch, SEQUENCE_LENGTH - KERNEL_SIZE + 1, NUM_FILTERS]
        
        max_pooling1d = tf.nn.pool(
            input = convolution_1,
            window_shape = [POOLING_SIZE],
            pooling_type = "MAX",
            strides = [1],
            padding = 'VALID',
            name = "max_pooling1d"
        )
        # print(max_pooling1d.shape) # [batch, 1, NUM_FILTERS]
        
        max_pooling1d_flattened = tf.contrib.layers.flatten(max_pooling1d)
        # print(max_pooling1d_flattened.shape) # [batch, 1, NUM_FILTERS]
        
        #toxic_dense = tf.layers.dense(
        #    inputs = max_pooling1d_flattened, 
        #    units = NUM_HIDDEN_1,
        #    activation = tf.nn.relu
        #)
        
        #toxic_dense = make_hidden_1(inputs = max_pooling1d_flattened, name = "toxic_dense")
        #severe_toxic_dense = make_hidden_1(inputs = max_pooling1d_flattened, name = "severe_toxic_dense")
        #obscene_dense = make_hidden_1(inputs = max_pooling1d_flattened, name = "obscene_dense")
        #threat_dense = make_hidden_1(inputs = max_pooling1d_flattened, name = "threat_dense")
        #insult_dense = make_hidden_1(inputs = max_pooling1d_flattened, name = "insult_dense")
        #identity_hate_dense = make_hidden_1(inputs = max_pooling1d_flattened, name = "identity_hate_dense")

        # initialiser?
        toxic_logits =  tf.layers.dense(
            inputs = max_pooling1d_flattened, 
            units = NUM_CLASSES,
            name = "toxic_logits"
        )
        #severe_toxic_logits = make_logits(inputs = toxic_dense, name = "severe_toxic_logits")
        #obscene_logits = make_logits(inputs = toxic_dense, name = "obscene_logits")
        #threat_logits = make_logits(inputs = toxic_dense, name = "threat_logits")
        #insult_logits = make_logits(inputs = toxic_dense, name = "insult_logits")
        #identity_hate_logits = make_logits(inputs = toxic_dense, name = "identity_hate_logits")
    

In [13]:
################################################################################
################################################################################
## 8. define loss

def make_single_loss(logits, placeholder, name):
    sigmoid_cross_entropy_op = tf.nn.sigmoid_cross_entropy_with_logits(
            logits = logits, 
            labels = placeholder
        )
    return tf.reduce_mean(sigmoid_cross_entropy_op, name = name)

with graph.as_default():

    with tf.name_scope("loss"):

        toxic_loss_op = make_single_loss(toxic_logits, toxic_placeholder, "toxic_loss_op")
        #severe_toxic_loss_op = make_single_loss(severe_toxic_logits, severe_toxic_placeholder, "severe_toxic_loss_op")
        #obscene_loss_op = make_single_loss(obscene_logits, obscene_placeholder, "obscene_loss_op")
        #threat_loss_op = make_single_loss(threat_logits, threat_placeholder, "threat_loss_op")
        #insult_loss_op = make_single_loss(insult_logits, insult_placeholder, "insult_loss_op")
        #identity_hate_loss_op = make_single_loss(identity_hate_logits, identity_hate_placeholder, "identity_hate_loss_op")

        loss_op = tf.reduce_mean([toxic_loss_op])
                                  #, severe_toxic_loss_op, obscene_loss_op, 
        #                          threat_loss_op, insult_loss_op, identity_hate_loss_op])

        toxic_loss_summary_op = tf.summary.scalar("toxic_loss_op", toxic_loss_op)
        #severe_toxic_loss_op_summary = tf.summary.scalar("severe_toxic_loss_op", severe_toxic_loss_op)
        #obscene_loss_op_summary = tf.summary.scalar("obscene_loss_op", obscene_loss_op)
        #threat_loss_op_summary = tf.summary.scalar("threat_loss_op", threat_loss_op)
        #insult_loss_op_summary = tf.summary.scalar("insult_loss_op", insult_loss_op)
        #identity_hate_loss_op_summary = tf.summary.scalar("identity_hate_loss_op", identity_hate_loss_op)

        loss_summary_op = tf.summary.scalar("loss_op", loss_op)



In [14]:
################################################################################
################################################################################
## 

import time, os

################################################################################
################################################################################
## 9. define train_op

with graph.as_default():

    with tf.variable_scope("train", reuse = tf.AUTO_REUSE):

        global_step = tf.Variable(0, name = "global_step", trainable = False)
        optimizer = tf.train.AdamOptimizer(learning_rate = LEARNING_RATE)
        grads_and_vars = optimizer.compute_gradients(loss_op)
        train_op = optimizer.apply_gradients(grads_and_vars, global_step = global_step)
        
        grad_summaries = []
        for g, v in grads_and_vars:
            if g is not None:
                grad_hist_summary = tf.summary.histogram("{}/grad/hist".format(v.name), g)
                sparsity_summary = tf.summary.scalar("{}/grad/sparsity".format(v.name), tf.nn.zero_fraction(g))
                grad_summaries.append(grad_hist_summary)
                grad_summaries.append(sparsity_summary)
        grad_summaries_merged = tf.summary.merge(grad_summaries)

        # Train Summaries
        train_summary_op = tf.summary.merge([toxic_loss_summary_op, loss_summary_op, grad_summaries_merged])
        train_summary_dir = os.path.join(LOG_DIRECTORY, "summaries", "train")
        train_summary_writer = tf.summary.FileWriter(train_summary_dir, tf.get_default_graph())

        # Dev summaries
        development_summary_op = tf.summary.merge([toxic_loss_summary_op, loss_summary_op])
        development_summary_dir = os.path.join(LOG_DIRECTORY, "summaries", "development")
        development_summary_writer = tf.summary.FileWriter(development_summary_dir, tf.get_default_graph())
    
################################################################################
################################################################################

INFO:tensorflow:Summary name network/embedding_coefficients:0/grad/hist is illegal; using network/embedding_coefficients_0/grad/hist instead.
INFO:tensorflow:Summary name network/embedding_coefficients:0/grad/sparsity is illegal; using network/embedding_coefficients_0/grad/sparsity instead.
INFO:tensorflow:Summary name network/filter_1_coefficients:0/grad/hist is illegal; using network/filter_1_coefficients_0/grad/hist instead.
INFO:tensorflow:Summary name network/filter_1_coefficients:0/grad/sparsity is illegal; using network/filter_1_coefficients_0/grad/sparsity instead.
INFO:tensorflow:Summary name network/filter_1_bias:0/grad/hist is illegal; using network/filter_1_bias_0/grad/hist instead.
INFO:tensorflow:Summary name network/filter_1_bias:0/grad/sparsity is illegal; using network/filter_1_bias_0/grad/sparsity instead.
INFO:tensorflow:Summary name network/toxic_logits/kernel:0/grad/hist is illegal; using network/toxic_logits/kernel_0/grad/hist instead.
INFO:tensorflow:Summary name

In [15]:
################################################################################
################################################################################
##

from timeit import default_timer

################################################################################
################################################################################
## session

with graph.as_default():

    with tf.Session() as session:

        session.run(tf.global_variables_initializer())
        
        start_of_step_time = default_timer()
        
        patience_counter = 0
        
        for epoch in range(NUM_EPOCHS):

            idx_batches = make_idx_batches(x_comment_text_train, BATCH_SIZE)
            
            development_loss_last = 100

            for idx_batch in idx_batches:

                x_comment_text_batch = x_comment_text_train[idx_batch]

                toxic_labels_batch = toxic_labels_train[idx_batch]
                #severe_toxic_labels_batch = severe_toxic_labels_train[idx_batch]
                #obscene_labels_batch = obscene_labels_train[idx_batch]
                #threat_labels_batch = threat_labels_train[idx_batch]
                #insult_labels_batch = insult_labels_train[idx_batch]
                #identity_hate_labels_batch = identity_hate_labels_train[idx_batch]

                # train
                train_feed_dict = {
                    comment_text_placeholder: x_comment_text_batch,
                    toxic_placeholder: toxic_labels_batch,
                 #   severe_toxic_placeholder: severe_toxic_labels_batch, 
                 #   obscene_placeholder: obscene_labels_batch,
                 #   threat_placeholder: threat_labels_batch,
                 #   insult_placeholder: insult_labels_batch,
                 #   identity_hate_placeholder: identity_hate_labels_batch
                }

                _, train_loss, train_summary = session.run([train_op, loss_op, train_summary_op], feed_dict = train_feed_dict)

                current_step = tf.train.global_step(session, global_step)
                
                #train_summary_writer.add_summary(train_summary, current_step)

                if current_step % NUM_DISPLAY_STEPS == 0:
                    
                    took = default_timer() - start_of_step_time
                    start_of_step_time = default_timer()

                    development_feed_dict = {
                        comment_text_placeholder: x_comment_text_development,
                        toxic_placeholder: toxic_labels_development,
                    #    severe_toxic_placeholder: severe_toxic_labels_development, 
                    #    obscene_placeholder: obscene_labels_development,
                    #    threat_placeholder: threat_labels_development,
                    #    insult_placeholder: insult_labels_development,
                    #    identity_hate_placeholder: identity_hate_labels_development
                    }

                    development_loss, development_summary = session.run(
                        fetches = [loss_op, development_summary_op], 
                        feed_dict = development_feed_dict
                    )
                    
                    print("train_loss {:g}, development_loss {:g}, current_step {:g}, took {:g}".format(
                        train_loss, development_loss, current_step, took))
                        
                    if development_loss - development_loss_last > 0:
                        if patience_counter > 10: 
                            break
                        patience_counter = patience_counter + 1
                    else:
                        patience_counter = 0
                        
                    development_loss_last = development_loss
                    
                    #development_summary_writer.add_summary(development_summary, current_step)
                    
            if development_loss - development_loss_last > 0: 
                break
                    
                    

                

train_loss 0.333555, development_loss 0.306562, current_step 100, took 4.00971
train_loss 0.269191, development_loss 0.296797, current_step 200, took 49.0388
train_loss 0.294905, development_loss 0.288521, current_step 300, took 14.1569
train_loss 0.34962, development_loss 0.279104, current_step 400, took 12.2418
train_loss 0.377331, development_loss 0.265336, current_step 500, took 11.4824
train_loss 0.195575, development_loss 0.248661, current_step 600, took 8.6897
train_loss 0.120533, development_loss 0.229447, current_step 700, took 8.65263
train_loss 0.222935, development_loss 0.212975, current_step 800, took 9.19015
train_loss 0.242483, development_loss 0.198044, current_step 900, took 7.69971
train_loss 0.152757, development_loss 0.185029, current_step 1000, took 8.98394
train_loss 0.0855937, development_loss 0.176606, current_step 1100, took 8.11932
train_loss 0.201223, development_loss 0.169881, current_step 1200, took 7.86171
train_loss 0.0567818, development_loss 0.164634, c

train_loss 0.0390506, development_loss 0.122527, current_step 10400, took 6.85071
train_loss 0.083262, development_loss 0.122244, current_step 10500, took 7.32902
train_loss 0.233549, development_loss 0.122661, current_step 10600, took 7.3616
train_loss 0.0609427, development_loss 0.122744, current_step 10700, took 7.33846
train_loss 0.0723832, development_loss 0.123439, current_step 10800, took 9.63326
train_loss 0.0886111, development_loss 0.125043, current_step 10900, took 9.10025
train_loss 0.0562639, development_loss 0.123709, current_step 11000, took 6.93142
train_loss 0.059766, development_loss 0.124836, current_step 11100, took 8.07963
train_loss 0.169509, development_loss 0.123941, current_step 11200, took 7.61744
train_loss 0.0857117, development_loss 0.124443, current_step 11300, took 7.20486
train_loss 0.0880783, development_loss 0.124571, current_step 11400, took 8.33859
train_loss 0.0284383, development_loss 0.123836, current_step 11500, took 9.27781
train_loss 0.0220443,

train_loss 0.110299, development_loss 0.127041, current_step 20500, took 7.69723
train_loss 0.17482, development_loss 0.126083, current_step 20600, took 7.76685
train_loss 0.153302, development_loss 0.128197, current_step 20700, took 8.61365
train_loss 0.164496, development_loss 0.125952, current_step 20800, took 9.03077
train_loss 0.0179986, development_loss 0.126064, current_step 20900, took 9.39474
train_loss 0.0736792, development_loss 0.126945, current_step 21000, took 7.5707
train_loss 0.0607111, development_loss 0.130311, current_step 21100, took 7.02543
train_loss 0.0315047, development_loss 0.128306, current_step 21200, took 7.03947
train_loss 0.0709904, development_loss 0.127133, current_step 21300, took 7.1373
train_loss 0.111335, development_loss 0.127741, current_step 21400, took 7.33971
train_loss 0.064588, development_loss 0.127528, current_step 21500, took 11.9886
train_loss 0.0178389, development_loss 0.12869, current_step 21600, took 8.70554
train_loss 0.113009, devel

ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.



Traceback (most recent call last):
  File "/opt/conda/lib/python3.6/site-packages/IPython/core/interactiveshell.py", line 2862, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-15-b500f190d9df>", line 49, in <module>
    _, train_loss, train_summary = session.run([train_op, loss_op, train_summary_op], feed_dict = train_feed_dict)
  File "/opt/conda/lib/python3.6/site-packages/tensorflow/python/client/session.py", line 895, in run
    run_metadata_ptr)
  File "/opt/conda/lib/python3.6/site-packages/tensorflow/python/client/session.py", line 1128, in _run
    feed_dict_tensor, options, run_metadata)
  File "/opt/conda/lib/python3.6/site-packages/tensorflow/python/client/session.py", line 1344, in _do_run
    options, run_metadata)
  File "/opt/conda/lib/python3.6/site-packages/tensorflow/python/client/session.py", line 1350, in _do_call
    return fn(*args)
  File "/opt/conda/lib/python3.6/site-packages/tensorflow/python/client/session.py", line 13

KeyboardInterrupt: 