In [1]:
import os
import sys
import json
import time
import logging
import data_helper
import numpy as np
import tensorflow as tf
from text_cnn import TextCNN
from tensorflow.contrib import learn
from sklearn.model_selection import train_test_split

In [2]:
logging.getLogger().setLevel(logging.INFO)

In [3]:
# python3 train.py ./data/consumer_complaints.csv.zip ./parameters.json

"""Step 0: load sentences, labels, and training parameters"""
train_file = "./data/consumer_complaints.csv.zip"
x_raw, y_raw, df, labels = data_helper.load_data_and_labels(train_file)

parameter_file = "./parameters.json"
params = json.loads(open(parameter_file).read())

  if self.run_code(code, result):


In [4]:
x_raw

["they 've called me xxx times in 14 days and its still xxx so , i 'm sure i 'll get a xxx more today",
 "i received a letter from synchrony bank \\( formerly ge financial \\) in the letter , it lists reasons why they reduced my credit limit such as number of times credit used in 12 months , number of months credit card has been opened , amount of payment to low over 12 months , and used too much of credit limit within 6 months and , at least xxx of these reasons are false i called the phone number listed to obtain further clarification and dispute action initially , i was told to contact xxx to get further clarification after tell them , i knew my credit score then , i was told they have their only credit scoring when i asks them for further details , i was told they do n't give out information about their scoring i would also like to note i have made more than the credit payment they required every month on time they lowered my credit limit so low that i would have went over the limi

In [5]:
y_raw

[array([0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0]),
 array([0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0]),
 array([0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0]),
 array([0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0]),
 array([0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0]),
 array([0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0]),
 array([0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0]),
 array([0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0]),
 array([1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]),
 array([0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0]),
 array([0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0]),
 array([0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0]),
 array([0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0]),
 array([0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0]),
 array([0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0]),
 array([0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0]),
 array([0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0]),
 array([0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0]),
 array([0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0]),
 array([0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0]),
 array([0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0]),
 array([0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0]),
 array([1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]),
 array([0, 

In [6]:
df

Unnamed: 0,product,consumer_complaint_narrative
282869,Debt collection,They 've called me XXXX times in 14 days ... a...
298039,Credit card,I received a letter from Synchrony Bank ( form...
258143,Debt collection,"On XXXX XXXX, 2015 XXXX I received a call from..."
267375,Debt collection,"On XX/XX/2015, I returned a call that was left..."
545967,Credit reporting,XXXX XXXX XXXX XXXX is listed as a derogatory ...
244287,Credit reporting,Equifax has maliciously misrepresenting the ab...
298571,Mortgage,Brandy from BSI Financial Services Intentional...
304247,Mortgage,House on market XXXX/XXXX/15. Offer submitted ...
272542,Bank account or service,I have an account with US Bank and my husband ...
503075,Mortgage,American Financial Resources ( AFR ) denied my...


In [7]:
labels

['Bank account or service',
 'Consumer Loan',
 'Credit card',
 'Credit reporting',
 'Debt collection',
 'Money transfers',
 'Mortgage',
 'Other financial service',
 'Payday loan',
 'Prepaid card',
 'Student loan']

In [9]:
"""Step 1: pad each sentence to the same length and map each word to an id"""
max_document_length = max([len(x.split(' ')) for x in x_raw])
logging.info('The maximum length of all sentences: {}'.format(max_document_length))
vocab_processor = learn.preprocessing.VocabularyProcessor(max_document_length)
x = np.array(list(vocab_processor.fit_transform(x_raw)))
y = np.array(y_raw)

INFO:root:The maximum length of all sentences: 912


In [10]:
"""Step 2: split the original dataset into train and test sets"""
x_, x_test, y_, y_test = train_test_split(x, y, test_size=0.1, random_state=42)

In [11]:
x

array([[  1,   2,   3, ...,   0,   0,   0],
       [ 14,  22,  19, ...,   0,   0,   0],
       [104,   5,   5, ...,   0,   0,   0],
       ..., 
       [ 14, 324,   5, ...,   0,   0,   0],
       [ 14, 178,  77, ...,   0,   0,   0],
       [ 14,  22,  19, ...,   0,   0,   0]])

In [12]:
y

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 1, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ..., 
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [1, 0, 0, ..., 0, 0, 0]])

In [13]:
"""Step 3: shuffle the train set and split the train set into train and dev sets"""
shuffle_indices = np.random.permutation(np.arange(len(y_)))
x_shuffled = x_[shuffle_indices]
y_shuffled = y_[shuffle_indices]
x_train, x_dev, y_train, y_dev = train_test_split(x_shuffled, y_shuffled, test_size=0.1)

In [14]:
"""Step 4: save the labels into labels.json since predict.py needs it"""
with open('./labels.json', 'w') as outfile:
    json.dump(labels, outfile, indent=4)

In [15]:
logging.info('x_train: {}, x_dev: {}, x_test: {}'.format(len(x_train), len(x_dev), len(x_test)))
logging.info('y_train: {}, y_dev: {}, y_test: {}'.format(len(y_train), len(y_dev), len(y_test)))

INFO:root:x_train: 54112, x_dev: 6013, x_test: 6681
INFO:root:y_train: 54112, y_dev: 6013, y_test: 6681


In [16]:
"""Step 5: build a graph and cnn object"""
graph = tf.Graph()
with graph.as_default():
    session_conf = tf.ConfigProto(allow_soft_placement=True, log_device_placement=False)
    sess = tf.Session(config=session_conf)
    with sess.as_default():
        cnn = TextCNN(
            sequence_length=x_train.shape[1],
            num_classes=y_train.shape[1],
            vocab_size=len(vocab_processor.vocabulary_),
            embedding_size=params['embedding_dim'],
            filter_sizes=list(map(int, params['filter_sizes'].split(","))),
            num_filters=params['num_filters'],
            l2_reg_lambda=params['l2_reg_lambda'])

        global_step = tf.Variable(0, name="global_step", trainable=False)
        optimizer = tf.train.AdamOptimizer(1e-3)
        grads_and_vars = optimizer.compute_gradients(cnn.loss)
        train_op = optimizer.apply_gradients(grads_and_vars, global_step=global_step)

        timestamp = str(int(time.time()))
        out_dir = os.path.abspath(os.path.join(os.path.curdir, "trained_model_" + timestamp))

        checkpoint_dir = os.path.abspath(os.path.join(out_dir, "checkpoints"))
        checkpoint_prefix = os.path.join(checkpoint_dir, "model")
        if not os.path.exists(checkpoint_dir):
            os.makedirs(checkpoint_dir)
        saver = tf.train.Saver(tf.all_variables())

        # One training step: train the model with one batch
        def train_step(x_batch, y_batch):
            feed_dict = {
                cnn.input_x: x_batch,
                cnn.input_y: y_batch,
                cnn.dropout_keep_prob: params['dropout_keep_prob']}
            _, step, loss, acc = sess.run([train_op, global_step, cnn.loss, cnn.accuracy], feed_dict)

        # One evaluation step: evaluate the model with one batch
        def dev_step(x_batch, y_batch):
            feed_dict = {cnn.input_x: x_batch, cnn.input_y: y_batch, cnn.dropout_keep_prob: 1.0}
            step, loss, acc, num_correct = sess.run([global_step, cnn.loss, cnn.accuracy, cnn.num_correct], feed_dict)
            return num_correct

        # Save the word_to_id map since predict.py needs it
        vocab_processor.save(os.path.join(out_dir, "vocab.pickle"))
        sess.run(tf.initialize_all_variables())

        # Training starts here
        train_batches = data_helper.batch_iter(list(zip(x_train, y_train)), params['batch_size'], params['num_epochs'])
        best_accuracy, best_at_step = 0, 0

        """Step 6: train the cnn model with x_train and y_train (batch by batch)"""
        for train_batch in train_batches:
            x_train_batch, y_train_batch = zip(*train_batch)
            train_step(x_train_batch, y_train_batch)
            current_step = tf.train.global_step(sess, global_step)

            """Step 6.1: evaluate the model with x_dev and y_dev (batch by batch)"""
            if current_step % params['evaluate_every'] == 0:
                dev_batches = data_helper.batch_iter(list(zip(x_dev, y_dev)), params['batch_size'], 1)
                total_dev_correct = 0
                for dev_batch in dev_batches:
                    x_dev_batch, y_dev_batch = zip(*dev_batch)
                    num_dev_correct = dev_step(x_dev_batch, y_dev_batch)
                    total_dev_correct += num_dev_correct

                dev_accuracy = float(total_dev_correct) / len(y_dev)
                logging.critical('Accuracy on dev set: {}'.format(dev_accuracy))

                """Step 6.2: save the model if it is the best based on accuracy of the dev set"""
                if dev_accuracy >= best_accuracy:
                    best_accuracy, best_at_step = dev_accuracy, current_step
                    path = saver.save(sess, checkpoint_prefix, global_step=current_step)
                    logging.critical('Saved model {} at step {}'.format(path, best_at_step))
                    logging.critical('Best accuracy {} at step {}'.format(best_accuracy, best_at_step))

        """Step 7: predict x_test (batch by batch)"""
        test_batches = data_helper.batch_iter(list(zip(x_test, y_test)), params['batch_size'], 1)
        total_test_correct = 0
        for test_batch in test_batches:
            x_test_batch, y_test_batch = zip(*test_batch)
            num_test_correct = dev_step(x_test_batch, y_test_batch)
            total_test_correct += num_test_correct

        test_accuracy = float(total_test_correct) / len(y_test)
        logging.critical('Accuracy on test set is {} based on the best model {}'.format(test_accuracy, path))
        logging.critical('The training is complete')

Instructions for updating:
Please use tf.global_variables instead.


Instructions for updating:
Please use tf.global_variables instead.


Instructions for updating:
Use `tf.global_variables_initializer` instead.


Instructions for updating:
Use `tf.global_variables_initializer` instead.
CRITICAL:root:Accuracy on dev set: 0.481623149842
CRITICAL:root:Saved model /Users/zhujinpeng/workspace/wildml/Lightning_Talk_2017/low_lvl_api/multi-class-text-classification-cnn/trained_model_1510620378/checkpoints/model-200 at step 200
CRITICAL:root:Best accuracy 0.481623149842 at step 200


KeyboardInterrupt: 