In [82]:
################################################################################
################################################################################
##

# https://github.com/aymericdamien/TensorFlow-Examples/blob/master/examples/3_NeuralNetworks/convolutional_network.py

################################################################################
################################################################################
## plan

# 1. open dataset
# 2. clean dataset
# 3. munge dataset
# 4. make test train
# 5. make constants
# 6. make placeholders
# 7. define network
# 8. define loss
# 9. define train_op
# 10. define development_op
# 11. seesion: epochs, batches


In [83]:
######################################################
######################################################
## 5. make constants

from datetime import datetime

################################################################################
################################################################################
##

now = datetime.utcnow().strftime("%Y%m%d%H%M%S")

ROOT_LOG_DIRECTORY = "../output/tf-logs"
LOG_DIRECTORY = "{}/run-{}/".format(ROOT_LOG_DIRECTORY, now)

################################################################################
################################################################################
##

def make_idx_batches(data, batch_size):
    idx_array = np.arange(data.shape[0])
    n = len(idx_array)
    res = []
    for i in range(0, n, batch_size):
        res.append(idx_array[i:(i+batch_size)])
    return res

In [84]:
################################################################################
################################################################################
## 1. open dataset

import pandas as pd

################################################################################
################################################################################
##

response_df = pd.read_pickle("../output/response_df.pkl")
train_df = pd.read_pickle("../output/train_df.pkl")


In [85]:
################################################################################
################################################################################
## 2. clean dataset
## 3. munge dataset

import re
import numpy as np

################################################################################
################################################################################
##

def clean_string(string):
    string = re.sub(r"[^A-Za-z0-9(),!?\'\`]", " ", string)
    string = re.sub(r"\'s", " \'s", string)
    string = re.sub(r"\'m", " \'m", string)
    string = re.sub(r"\'ve", " \'ve", string)
    string = re.sub(r"n\'t", " n\'t", string)
    string = re.sub(r"\'re", " \'re", string)
    string = re.sub(r"\'d", " \'d", string)
    string = re.sub(r"\'ll", " \'ll", string)
    string = re.sub(r",", " , ", string)
    string = re.sub(r"!", " ! ", string)
    string = re.sub(r"\(", " \( ", string)
    string = re.sub(r"\)", " \) ", string)
    string = re.sub(r"\?", " \? ", string)
    string = re.sub(r",", "", string)
    string = re.sub(r"\s{2,}", " ", string)
    return string.strip()

comment_text_list = train_df['comment_text'].str.lower().tolist()
comment_text_list_clean = [clean_string(x) for x in comment_text_list]

toxic_list = train_df['toxic'].tolist()
toxic_labels = np.array([[0, 1] if x == 1 else [1, 0] for x in toxic_list])

severe_toxic_list = train_df['severe_toxic'].tolist()
severe_toxic_labels = np.array([[0, 1] if x == 1 else [1, 0] for x in severe_toxic_list])

obscene_list = train_df['obscene'].tolist()
obscene_labels = np.array([[0, 1] if x == 1 else [1, 0] for x in obscene_list])

threat_list = train_df['threat'].tolist()
threat_labels = np.array([[0, 1] if x == 1 else [1, 0] for x in threat_list])

insult_list = train_df['insult'].tolist()
insult_labels = np.array([[0, 1] if x == 1 else [1, 0] for x in insult_list])

identity_hate_list = train_df['identity_hate'].tolist()
identity_hate_labels = np.array([[0, 1] if x == 1 else [1, 0] for x in identity_hate_list])


In [86]:
################################################################################
################################################################################
## 

MAX_DOCUMENT_LENGTH = 2000
MIN_FREQUENCY = 10

################################################################################
################################################################################

In [87]:
################################################################################
################################################################################
## 

from tensorflow.contrib.learn import preprocessing

################################################################################
################################################################################
## 

max_document_length = max([len(x.split(" ")) for x in comment_text_list_clean])
vocabulary_processor = preprocessing.VocabularyProcessor(
    max_document_length = MAX_DOCUMENT_LENGTH,
    min_frequency = MIN_FREQUENCY
)

x_comment_text = np.array(list(vocabulary_processor.fit_transform(comment_text_list_clean)))


In [88]:
################################################################################
################################################################################
## 4. make test train

np.random.seed(10)

train_proportion = 0.8
shuffled_idx = np.random.permutation(np.arange(len(x_comment_text)))

train_idx = np.random.choice(shuffled_idx, size = int(train_proportion * len(shuffled_idx)), replace = False)
development_idx = np.array([x for x in shuffled_idx if not x in train_idx])

x_comment_text_train = x_comment_text[train_idx]

toxic_labels_train = toxic_labels[train_idx]
severe_toxic_labels_train = severe_toxic_labels[train_idx]
obscene_labels_train = obscene_labels[train_idx]
threat_labels_train = threat_labels[train_idx]
insult_labels_train = insult_labels[train_idx]
identity_hate_labels_train = identity_hate_labels[train_idx]

x_comment_text_development = x_comment_text[development_idx]

toxic_labels_development = toxic_labels[development_idx]
severe_toxic_labels_development = severe_toxic_labels[development_idx]
obscene_labels_development = obscene_labels[development_idx]
threat_labels_development = threat_labels[development_idx]
insult_labels_development = insult_labels[development_idx]
identity_hate_labels_development = identity_hate_labels[development_idx]


In [103]:
################################################################################
################################################################################
## constants

SEQUENCE_LENGTH = MAX_DOCUMENT_LENGTH
NUM_CLASSES = toxic_labels_train.shape[1]

VOCABULARY_SIZE = len(vocabulary_processor.vocabulary_)
EMBEDDING_SIZE = 10

NUM_FILTERS = 8
KERNEL_SIZE = 3
FILTER_SHAPE = [KERNEL_SIZE, EMBEDDING_SIZE, NUM_FILTERS]

STRIDE = 1

POOLING_SIZE = SEQUENCE_LENGTH - KERNEL_SIZE + 1
POOLING_WINDOW_SHAPE = [1, POOLING_SIZE, 1]

NUM_UNITS_HIDDEN_1 = 8
NUM_CLASSES = 2


In [104]:
################################################################################
################################################################################
## 

import tensorflow as tf

################################################################################
################################################################################
## 6. make placeholders

tf.reset_default_graph()

comment_text_placeholder = tf.placeholder(tf.int32, shape = (None, SEQUENCE_LENGTH))

toxic_placeholder = tf.placeholder(dtype = tf.float32, shape = (None, NUM_CLASSES))
severe_toxic_placeholder = tf.placeholder(dtype = tf.float32, shape = (None, NUM_CLASSES))
obscene_placeholder = tf.placeholder(dtype = tf.float32, shape = (None, NUM_CLASSES))
threat_placeholder = tf.placeholder(dtype = tf.float32, shape = (None, NUM_CLASSES))
insult_placeholder = tf.placeholder(dtype = tf.float32, shape = (None, NUM_CLASSES))
identity_hate_placeholder = tf.placeholder(dtype = tf.float32, shape = (None, NUM_CLASSES))

################################################################################
################################################################################

In [105]:

################################################################################
################################################################################
## 7. define network

def make_hidden_1(inputs, name):
     return tf.layers.dense(
        inputs = inputs, 
        units = NUM_UNITS_HIDDEN_1,
        name = name
     )
    
def make_logits(inputs, name):
    return tf.layers.dense(
        inputs = inputs, 
        units = NUM_CLASSES,
        name = name
     )

with tf.name_scope("network"):
        
    embedding_coefficients = tf.Variable(
        tf.random_uniform([VOCABULARY_SIZE, EMBEDDING_SIZE], -1.0, 1.0), 
        name = "embedding_coefficients"
    )
    word_embeddings = tf.nn.embedding_lookup(embedding_coefficients, comment_text_placeholder)
    # shape: [batch, SEQUENCE_LENGTH, EMBEDDING_SIZE]
    
    conv1d = tf.layers.conv1d(
        inputs = word_embeddings, 
        filters = NUM_FILTERS,
        kernel_size = KERNEL_SIZE,
        strides = 1,
        padding = 'valid',
        activation = tf.nn.relu
    )

    max_pooling1d = tf.layers.max_pooling1d(
        inputs = conv1d, 
        pool_size = POOLING_SIZE,
        strides = 1,
        padding = 'valid',
    )

    # Flatten the data to a 1-D vector for the fully connected layer
    max_pooling1d_flattened = tf.contrib.layers.flatten(max_pooling1d)

    toxic_dense = make_hidden_1(inputs = max_pooling1d_flattened, name = "toxic_dense")
    #severe_toxic_dense = make_hidden_1(inputs = max_pooling1d_flattened, name = "severe_toxic_dense")
    #obscene_dense = make_hidden_1(inputs = max_pooling1d_flattened, name = "obscene_dense")
    #threat_dense = make_hidden_1(inputs = max_pooling1d_flattened, name = "threat_dense")
    #insult_dense = make_hidden_1(inputs = max_pooling1d_flattened, name = "insult_dense")
    #identity_hate_dense = make_hidden_1(inputs = max_pooling1d_flattened, name = "identity_hate_dense")
    
    toxic_logits = make_logits(inputs = toxic_dense, name = "toxic_logits")
    #severe_toxic_logits = make_logits(inputs = toxic_dense, name = "severe_toxic_logits")
    #obscene_logits = make_logits(inputs = toxic_dense, name = "obscene_logits")
    #threat_logits = make_logits(inputs = toxic_dense, name = "threat_logits")
    #insult_logits = make_logits(inputs = toxic_dense, name = "insult_logits")
    #identity_hate_logits = make_logits(inputs = toxic_dense, name = "identity_hate_logits")
    

In [106]:
################################################################################
################################################################################
## 8. define loss

def make_single_loss(logits, placeholder, name):
    sigmoid_cross_entropy_op = tf.nn.sigmoid_cross_entropy_with_logits(
            logits = logits, 
            labels = placeholder
        )
    return tf.reduce_mean(sigmoid_cross_entropy_op, name = name)

with tf.name_scope("loss"):
    
    toxic_loss_op = make_single_loss(toxic_logits, toxic_placeholder, "toxic_loss_op")
    #severe_toxic_loss_op = make_single_loss(severe_toxic_logits, severe_toxic_placeholder, "severe_toxic_loss_op")
    #obscene_loss_op = make_single_loss(obscene_logits, obscene_placeholder, "obscene_loss_op")
    #threat_loss_op = make_single_loss(threat_logits, threat_placeholder, "threat_loss_op")
    #insult_loss_op = make_single_loss(insult_logits, insult_placeholder, "insult_loss_op")
    #identity_hate_loss_op = make_single_loss(identity_hate_logits, identity_hate_placeholder, "identity_hate_loss_op")
    
    #loss_op = tf.reduce_mean([toxic_loss_op, severe_toxic_loss_op, obscene_loss_op, 
    #                          threat_loss_op, insult_loss_op, identity_hate_loss_op])
    loss_op = toxic_loss_op
    
    #toxic_loss_summary_op = tf.summary.scalar("toxic_loss_op", toxic_loss_op)
    #severe_toxic_loss_op_summary = tf.summary.scalar("severe_toxic_loss_op", severe_toxic_loss_op)
    #obscene_loss_op_summary = tf.summary.scalar("obscene_loss_op", obscene_loss_op)
    #threat_loss_op_summary = tf.summary.scalar("threat_loss_op", threat_loss_op)
    #insult_loss_op_summary = tf.summary.scalar("insult_loss_op", insult_loss_op)
    #identity_hate_loss_op_summary = tf.summary.scalar("identity_hate_loss_op", identity_hate_loss_op)

    #loss_op_summary = tf.summary.scalar("loss_op", loss_op)

#file_writer = tf.summary.FileWriter(LOG_DIRECTORY, tf.get_default_graph())


In [116]:
################################################################################
################################################################################
## train parameters

BATCH_SIZE = 2
NUM_EPOCHS = 1
NUM_DISPLAY_STEPS = 1
LEARNING_RATE = 0.001

################################################################################
################################################################################

In [117]:
################################################################################
################################################################################
## 9. define train_op

with tf.name_scope("train"):

    global_step = tf.Variable(0, name = "global_step", trainable = False)
    optimizer = tf.train.AdamOptimizer(learning_rate = LEARNING_RATE)
    grads_and_vars = optimizer.compute_gradients(loss_op)
    train_op = optimizer.apply_gradients(grads_and_vars, global_step = global_step)
    
################################################################################
################################################################################

In [None]:
################################################################################
################################################################################
##

from timeit import default_timer

################################################################################
################################################################################
## session

with tf.Session() as session:
    
    session.run(tf.global_variables_initializer())
    
    start_of_step_time = default_timer()
    
    for epoch in range(NUM_EPOCHS):
    
        idx_batches = make_idx_batches(x_comment_text_train, BATCH_SIZE)
    
        for idx_batch in idx_batches:

            x_comment_text_batch = x_comment_text_train[idx_batch]
            
            toxic_labels_batch = toxic_labels_train[idx_batch]
            #severe_toxic_labels_batch = severe_toxic_labels_train[idx_batch]
            #obscene_labels_batch = obscene_labels_train[idx_batch]
            #threat_labels_batch = threat_labels_train[idx_batch]
            #insult_labels_batch = insult_labels_train[idx_batch]
            #identity_hate_labels_batch = identity_hate_labels_train[idx_batch]
        
            # train
            train_feed_dict = {
                comment_text_placeholder: x_comment_text_batch,
                toxic_placeholder: toxic_labels_batch,
             #   severe_toxic_placeholder: severe_toxic_labels_batch, 
             #   obscene_placeholder: obscene_labels_batch,
             #   threat_placeholder: threat_labels_batch,
             #   insult_placeholder: insult_labels_batch,
             #   identity_hate_placeholder: identity_hate_labels_batch
            }
            
            _, train_loss = session.run([train_op, loss_op], feed_dict = train_feed_dict)
                                
            current_step = tf.train.global_step(session, global_step)
            
            if current_step % NUM_DISPLAY_STEPS == 0:
                
                took = default_timer() - start_of_step_time
                start_of_step_time = default_timer()

                development_feed_dict = {
                    comment_text_placeholder: x_comment_text_development,
                    toxic_placeholder: toxic_labels_development,
                #    severe_toxic_placeholder: severe_toxic_labels_development, 
                #    obscene_placeholder: obscene_labels_development,
                #    threat_placeholder: threat_labels_development,
                #    insult_placeholder: insult_labels_development,
                #    identity_hate_placeholder: identity_hate_labels_development
                }

                development_loss = session.run(fetches = loss_op, feed_dict = development_feed_dict)
                print("train_loss {:g}, development_loss {:g}, step {:g}, took {:g}".format(
                    train_loss, development_loss, current_step, took))

                

train_loss 0.353208, development_loss 0.531299, step 1, took 0.068475
train_loss 0.783387, development_loss 0.524968, step 2, took 3.4726
train_loss 0.396906, development_loss 0.516493, step 3, took 3.39062
train_loss 0.399177, development_loss 0.507225, step 4, took 3.38571
train_loss 0.340869, development_loss 0.49754, step 5, took 3.39116
train_loss 0.354436, development_loss 0.488027, step 6, took 3.44742
train_loss 0.382612, development_loss 0.47846, step 7, took 3.41393
train_loss 0.431709, development_loss 0.468877, step 8, took 3.35485
train_loss 0.888589, development_loss 0.461235, step 9, took 3.4642
train_loss 0.34681, development_loss 0.453595, step 10, took 3.40583
train_loss 0.397027, development_loss 0.446271, step 11, took 3.36699
train_loss 0.400394, development_loss 0.439066, step 12, took 3.5083
train_loss 0.436558, development_loss 0.431869, step 13, took 3.4356
train_loss 0.217016, development_loss 0.424947, step 14, took 3.40635
train_loss 0.592164, development_lo

train_loss 0.157385, development_loss 0.322877, step 119, took 4.09421
train_loss 0.78329, development_loss 0.324036, step 120, took 4.07587
train_loss 0.103062, development_loss 0.325087, step 121, took 4.08535
train_loss 0.075869, development_loss 0.326009, step 122, took 4.02704
train_loss 0.139356, development_loss 0.326745, step 123, took 4.10466
train_loss 0.102967, development_loss 0.327323, step 124, took 4.06037
train_loss 0.0962384, development_loss 0.327753, step 125, took 4.09651
train_loss 0.113105, development_loss 0.328032, step 126, took 4.04493
train_loss 0.214304, development_loss 0.328149, step 127, took 4.02875
train_loss 0.110691, development_loss 0.328138, step 128, took 4.04864
train_loss 0.0968806, development_loss 0.328016, step 129, took 4.04356
train_loss 1.18191, development_loss 0.328408, step 130, took 4.08666
train_loss 0.135343, development_loss 0.328629, step 131, took 4.06604
train_loss 0.150517, development_loss 0.328675, step 132, took 4.04341
train_